Commit fa13d7b4 authored by Federico Julian Camerota Verdu
Browse files

Updated notebooks

parent fe0997f7
......@@ -44,21 +44,14 @@
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
```
%% Cell type:code id:2b526c33 tags:
``` python
%%time
Ref = md.load(topol)
```
%% Cell type:code id:a3a47162 tags:
``` python
%%time
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology
```
%% Cell type:markdown id:2ed57674 tags:
......@@ -68,34 +61,33 @@
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
a = cu.asarray(a, dtype=cu.float64)
b = cu.asarray(b, dtype=cu.float64)
a = cu.array(a)
b = cu.array(b)
N = b.shape[0]
if a.shape[1:] != b.shape[1:]:
raise ValueError('a and b must have same shape')
return cu.sqrt(cu.sum((a - b) ** 2) / N)
a -= b
return cu.sqrt(cu.sum(cu.power(a, 2)) / N)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
%%time
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
%%timeit
trj.center_coordinates()
rmsd(trj.xyz, Ref.xyz)
```
%% Cell type:markdown id:5147257b tags:
......@@ -103,44 +95,49 @@
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
%%timeit
for chunks in md.iterload(trajs, top=top):
chunks.center_coordinates()
rmsd(chunks.xyz, Ref.xyz)
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:a606f5c0 tags:
%% Cell type:code id:b285e57b tags:
``` python
%%timeit
md_pca = cuml.PCA(n_components=2)
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
#df = cudf.DataFrame(pca_trj)
df = cu.array(pca_trj)
proj = md_pca.fit_transform(df)
```
%% Cell type:markdown id:d577fc8b tags:
%%time
#md_pca = cuml.PCA(n_components=2)
md_pca = cuml.KMeans(n_clusters=2)
```
%% Cell type:code id:ed7f2f68 tags:
%% Cell type:markdown id:1422d791 tags:
``` python
%%time
# Snapshots are already loaded at 500ps intervals
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
df = cudf.DataFrame(pd.DataFrame(pca_trj))
```
%% Cell type:code id:anticipated-edition tags:
%% Cell type:markdown id:336698c8 tags:
``` python
%%time
proj = md_pca.fit_transform(df)
```
%% Cell type:code id:02894afa tags:
``` python
......
......@@ -24,11 +24,11 @@
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array
from dask.dataframe import from_array, from_pandas
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
......@@ -77,27 +77,24 @@
``` python
cluster = LocalCUDACluster()
client = Client(cluster)
```
%% Cell type:code id:0a1e441c tags:
``` python
client
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
a = dask.delayed(a, pure=True)
b = dask.delayed(b, pure = True)
a = da.from_array(cu.array(a))
#a = a.map_blocks(cu.array)
#b = da.from_array(b)
b = da.from_array(cu.array(b))
#b = b.map_blocks(cu.array)
N = b.shape[0]
#if a.shape[1:] != b.shape[1:]:
# raise ValueError('a and b must have same shape')
return da.sqrt(da.sum((a - b) ** 2) / N)
......@@ -130,22 +127,23 @@
``` python
%%time
for chunks in md.iterload(trajs, top=top):
chunks.center_coordinates()
rmsd(chunks.xyz, Ref.xyz)
rmsd(chunks.xyz, Ref.xyz).compute()
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
kmeans_model = dask_kmeans(n_clusters=2)
#kmeans_model = dask_kmeans(n_clusters=2)
kmeans_model = dask_pca(n_components=2)
```
%% Cell type:code id:ed7f2f68 tags:
``` python
......@@ -157,11 +155,11 @@
%% Cell type:code id:d7b70474 tags:
``` python
%time
dask_df = from_array(pca_trj)
dask_df = from_pandas(pd.DataFrame(pca_trj), npartitions=4)
dask_df = dask_df.map_partitions(cudf.from_pandas)
```
%% Cell type:code id:anticipated-edition tags:
......@@ -172,11 +170,11 @@
%% Cell type:code id:651d9a95 tags:
``` python
%%time
proj = proj.to_csv('kmeansOut_dask_5Gb.csv')
proj = proj.compute()
```
%% Cell type:code id:0beba1b7 tags:
``` python
......
%% Cell type:code id:355ef985 tags:
``` python
import sys

# Extra site-packages directory (conda env "mdtraj37") that must be importable
# from this kernel.
extra_site_packages = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'

# Guard the append so that re-running the cell does not pile up duplicates.
if extra_site_packages not in sys.path:
    sys.path.append(extra_site_packages)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import cudf
import cuml
from cuml.dask.cluster import KMeans as dask_kmeans
from cuml.dask.decomposition import PCA as dask_pca
import cupy as cu
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array, from_pandas
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
import os
import pandas as pd
from sklearn.decomposition import PCA
import time
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Directory holding the benchmark topology/trajectory pairs.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'

# Base name shared by the topology and trajectory files (~5GB trajectory).
# Use 'NSP12-7-8_6M71' instead of 'nsp16_nsp10_6wkq' for the ~10GB trajectory.
fileName = ''.join((dataDir, 'nsp16_nsp10_6wkq'))

# Topology (.pdb) and trajectory (.xtc) paths derived from the base name.
topol, trajs = (fileName + suffix for suffix in ('.pdb', '.xtc'))
```
%% Cell type:code id:a3a47162 tags:
``` python
# Load the topology file on its own; it serves as the reference structure
# for the RMSD cells below.
Ref = md.load(topol)
# Centre the reference (the call's return value is not used), matching the
# centring applied to the trajectory frames before each comparison.
Ref.center_coordinates()
# Topology object handed to md.iterload when streaming the trajectory.
top = Ref.topology
```
%% Cell type:code id:710a340a tags:
``` python
# Start a local Dask cluster and connect a client to it.
# NOTE(review): this is the plain distributed LocalCluster, not the
# LocalCUDACluster imported above - confirm that is intended for this notebook.
cluster = dask.distributed.LocalCluster()
client = Client(cluster)
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
    """Return the root of the summed squared deviation between ``a`` and ``b``.

    Both inputs are wrapped in dask arrays, so the result is a lazy dask
    array; call ``.compute()`` on it to obtain the number.

    Parameters
    ----------
    a : array-like
        Coordinates to compare (the notebook passes ``trj.xyz`` or
        ``chunks.xyz``).
    b : array-like
        Reference coordinates (the notebook passes ``Ref.xyz``). The length
        of its first axis is used as the normalisation count.

    Returns
    -------
    dask array
        ``sqrt(sum((a - b) ** 2) / b.shape[0])``, evaluated lazily.

    Raises
    ------
    ValueError
        If the trailing dimensions of ``a`` and ``b`` differ.
    """
    a = da.from_array(a)
    b = da.from_array(b)

    # Arrays created with from_array carry a concrete shape, so the check can
    # run eagerly instead of letting mismatched inputs broadcast silently.
    if a.shape[1:] != b.shape[1:]:
        raise ValueError('a and b must have same shape')

    # NOTE(review): the divisor is the length of b's first axis, not the
    # number of atoms - confirm this is the intended normalisation.
    n = b.shape[0]
    return da.sqrt(da.sum((a - b) ** 2) / n)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
# Load the trajectory in a single md.load call (as opposed to the md.iterload
# streaming used further down), taking the topology from the PDB file.
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
# Time centring plus the RMSD computation on the fully loaded trajectory.
# perf_counter() is monotonic and high-resolution, so the measurement is not
# disturbed by system clock adjustments the way time.time() can be.
tic = time.perf_counter()
trj.center_coordinates()
rmsd(trj.xyz, Ref.xyz).compute()
toc = time.perf_counter()
# Elapsed seconds; as the last expression it is displayed as the cell output.
toc - tic
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Stream the trajectory chunk by chunk instead of loading it all at once.
for chunks in md.iterload(trajs, top=top):
    # Centre each chunk before comparing it with the centred reference.
    chunks.center_coordinates()
    # The result is discarded on purpose: the cell only measures run time.
    rmsd(chunks.xyz, Ref.xyz).compute()
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
# Despite its name, kmeans_model holds a two-component PCA estimator.
# NOTE(review): dask_pca comes from cuml.dask while the client above is a
# CPU LocalCluster fed with pandas partitions - confirm this combination runs.
kmeans_model = dask_pca(n_components=2)

# Flatten every frame into a single row: one row per frame, all remaining
# coordinate axes collapsed into columns.
pca_trj = trj.xyz[:, :, :]
pca_trj = pca_trj.reshape(len(pca_trj), -1)

# Distribute the rows over four partitions and materialise the projection.
dask_df = from_pandas(pd.DataFrame(pca_trj), npartitions=4)
proj = kmeans_model.fit_transform(dask_df).compute()
```
%% Cell type:code id:cc4b204e tags:
``` python
```
%% Cell type:code id:9b8064a8 tags:
``` python
```
%% Cell type:code id:353ca7d0 tags:
``` python
```
%% Cell type:code id:f1a4d919 tags:
``` python
```
%% Cell type:code id:ad343b32 tags:
``` python
# --- Input files -----------------------------------------------------------
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
# Base name of the ~5GB trajectory; use 'NSP12-7-8_6M71' for the ~10GB one.
fileName = ''.join((dataDir, 'nsp16_nsp10_6wkq'))
topol, trajs = (fileName + suffix for suffix in ('.pdb', '.xtc'))

# --- Reference structure ---------------------------------------------------
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology

# --- GPU-backed Dask cluster -----------------------------------------------
cluster = LocalCUDACluster()
client = Client(cluster)

# --- Full trajectory, centred ----------------------------------------------
trj = md.load(trajs, top=topol)
trj.center_coordinates()
# Two-component PCA kept under the historical name kmeans_model; the
# clustering variant of this cell used dask_kmeans(n_clusters=2) with
# fit_predict instead.
kmeans_model = dask_pca(n_components=2)

# One row per frame, all remaining coordinate axes flattened into columns.
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0], -1)

# Only the first 10000 frames are used; partitions are converted to cudf.
dask_df = from_array(pca_trj[:10000])
dask_df = dask_df.map_partitions(cudf.from_pandas)

# fit() returns the estimator itself, which has no .compute(); fit_transform
# returns the lazy projection that the next line materialises.
proj = kmeans_model.fit_transform(dask_df)
proj = proj.compute()
proj
```
%% Cell type:code id:762c3516 tags:
``` python
```
......@@ -43,73 +43,90 @@
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
```
%% Cell type:code id:diagnostic-dimension tags:
``` python
%%time
Ref = md.load(topol)
```
%% Cell type:code id:ba5e890d tags:
``` python
%%time
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:markdown id:a06b09ed tags:
### Load and then compute
%% Cell type:code id:e72e3849 tags:
``` python
%%timeit
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:7ca71b1c tags:
``` python
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:3e9fc85a tags:
``` python
%%timeit
trj.center_coordinates()
md.rmsd(trj,Ref,parallel=True,precentered=True)
```
%% Cell type:markdown id:46296a36 tags:
### Iterative load and compute
%% Cell type:code id:qualified-house tags:
``` python
%%time
%%timeit
for chunks in md.iterload(trajs, top=top):
chunks.center_coordinates()
md.rmsd(chunks,Ref,parallel=True,precentered=True)
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:empty-finland tags:
%% Cell type:code id:9129076c tags:
``` python
%%time
trj = md.load(trajs,top=topol)
%%timeit
# PCA is configured through n_components; n_clusters is a KMeans argument
# and makes the PCA constructor raise TypeError.
md_pca = PCA(n_components=2)

# One row per frame, all remaining coordinate axes flattened into columns.
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0], -1)

# Fit the model and project the frames onto the two components.
proj = md_pca.fit_transform(pca_trj)
```
%% Cell type:code id:24b4e8ef tags:
%% Cell type:markdown id:0475e0a8 tags:
``` python
%%time
#md_pca = PCA(n_components=20)
md_pca = KMeans(n_clusters=2)
```
%% Cell type:code id:3fc13176 tags:
%% Cell type:markdown id:0bb7f6e7 tags:
``` python
%%time
# Snapshots are already loaded at 500ps intervals
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
```
%% Cell type:code id:anticipated-edition tags:
%% Cell type:markdown id:7a7dd465 tags:
``` python
%%time
# fit_transform gives us PCA projections (/PCA space)
proj = md_pca.fit_transform(pca_trj)
```
%% Cell type:code id:d5d00107 tags:
``` python
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment