Commit 9e7fe362 authored by Federico Julian Camerota Verdu
Browse files

Added notebooks for dask

parent 697818c7
......@@ -35,11 +35,11 @@
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data'
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory
topol = fileName + '.pdb'
......@@ -117,31 +117,31 @@
%% Cell type:code id:a606f5c0 tags:
``` python
%%time
md_pca = cuml.PCA(n_components=20)
#md_pca = cuml.PCA(n_components=2)
md_pca = cuml.KMeans(n_clusters=2)
```
%% Cell type:code id:ed7f2f68 tags:
``` python
%%time
# Snapshots are already loaded at 500ps intervals
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
df = cudf.DataFrame(pca_trj)
df = cudf.DataFrame(pd.DataFrame(pca_trj))
```
%% Cell type:code id:anticipated-edition tags:
``` python
%%time
# fit_transform gives us PCA projections (/PCA space)
proj = md_pca.fit_transform(df)
```
%% Cell type:code id:ef498b33 tags:
%% Cell type:code id:02894afa tags:
``` python
```
......
%% Cell type:code id:355ef985 tags:
``` python
import sys

# Extra site-packages directory (from the "mdtraj37" conda env path) made importable here.
MDTRAJ_SITE_PACKAGES = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'

# Append only when missing, so re-running this cell does not pile up duplicate entries.
if MDTRAJ_SITE_PACKAGES not in sys.path:
    sys.path.append(MDTRAJ_SITE_PACKAGES)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import cudf
import cuml
from cuml.dask.cluster import KMeans as dask_kmeans
from cuml.dask.decomposition import PCA as dask_pca
import cupy as cu
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array, from_pandas
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
import os
import pandas as pd
from sklearn.decomposition import PCA
import time
```
%% Cell type:code id:2b7768c1 tags:
``` python
dask.config.config
```
%% Cell type:code id:41faa054 tags:
``` python
dask.config.get('distributed.comm.timeouts.connect')
```
%% Cell type:code id:c1103a7c tags:
``` python
dask.config.set({'distributed.comm.timeouts.connect': '120s'})
```
%% Cell type:code id:7e574ce4 tags:
``` python
dask.config.get('distributed.comm.timeouts.connect')
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Locate the benchmark inputs: a .pdb topology and an .xtc trajectory sharing one base name.
# os.path.join supplies the separator, so the paths stay valid whether or not dataDir
# ends with '/'.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
#fileName = os.path.join(dataDir, 'nsp16_nsp10_6wkq') #~5GB trajectory
fileName = os.path.join(dataDir, 'NSP12-7-8_6M71') #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
```
%% Cell type:code id:2b526c33 tags:
``` python
%%time
Ref = md.load(topol)
```
%% Cell type:code id:a3a47162 tags:
``` python
%%time
Ref.center_coordinates()
top = Ref.topology
```
%% Cell type:code id:710a340a tags:
``` python
# Create a LocalCUDACluster with its default arguments.
cluster = LocalCUDACluster()
# Connect a distributed Client to that cluster, passing timeout=500.
client = Client(cluster, timeout=500)
```
%% Cell type:code id:0a1e441c tags:
``` python
client
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
# Build the expression sqrt(sum((a - b) ** 2) / N), where N is the length of b's first axis.
# Both inputs are wrapped with dask.delayed before the arithmetic; elsewhere in this file
# .compute() is called on the returned object to obtain a value.
# NOTE(review): the body's indentation is missing in this export of the notebook.
def rmsd(a, b):
# Wrap both inputs as pure delayed objects.
a = dask.delayed(a, pure=True)
b = dask.delayed(b, pure = True)
# N comes from b only.
# NOTE(review): presumably b is (frames, atoms, 3), which would make N a frame count
# rather than an atom count -- confirm this is the intended normalisation.
N = b.shape[0]
# Shape check left disabled by the author.
#if a.shape[1:] != b.shape[1:]:
# raise ValueError('a and b must have same shape')
# NOTE(review): da.sum/da.sqrt are applied to Delayed objects here -- verify this is supported.
return da.sqrt(da.sum((a - b) ** 2) / N)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
%%time
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
trj.center_coordinates()
rmsd(trj.xyz, Ref.xyz).compute()
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Process the trajectory one md.iterload chunk at a time instead of through a single md.load.
chunk_rmsds = []
for chunks in md.iterload(trajs, top=top):
    chunks.center_coordinates()
    # rmsd() wraps its inputs in dask.delayed, so the value must be requested with
    # .compute() (as the "First load, then compute" cell does); otherwise the result
    # is built and thrown away without ever being evaluated.
    chunk_rmsds.append(rmsd(chunks.xyz, Ref.xyz).compute())
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA / K-MEANS ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
kmeans_model = dask_kmeans(n_clusters=2)
```
%% Cell type:code id:ed7f2f68 tags:
``` python
%%time
# Snapshots are already loaded at 500ps intervals, so every frame is kept as-is.
# Collapse all trailing axes so that each frame becomes one row: shape (n_frames, -1).
all_frames = trj.xyz[:, :, :]
n_frames = all_frames.shape[0]
pca_trj = all_frames.reshape(n_frames, -1)
```
%% Cell type:code id:709edb34 tags:
``` python
%%time
# Wrap pca_trj in a pandas DataFrame and split it into 6 dask partitions.
dask_df = from_pandas(pd.DataFrame(pca_trj), npartitions=6)
# Convert every partition with cudf.from_pandas.
dask_df = dask_df.map_partitions(cudf.from_pandas)
```
%% Cell type:code id:3a800abd tags:
``` python
dask_df
```
%% Cell type:markdown id:2233bc47 tags:
%%time
dask_df = dask_cudf.from_cudf(cudf.from_pandas(pd.DataFrame(pca_trj)), npartitions=4)
%% Cell type:markdown id:06705619 tags:
%%time
dask_df = from_array(pca_trj)
dask_df = dask_df.map_partitions(cudf.from_pandas)
%% Cell type:code id:anticipated-edition tags:
``` python
%%time
proj = kmeans_model.fit_transform(dask_df)
```
%% Cell type:code id:595073a9 tags:
``` python
%%time
proj = proj.compute()
```
%% Cell type:markdown id:2ef51b65 tags:
%%time
proj = proj.to_csv('kmeansOut_dask_5Gb.csv')
%% Cell type:code id:0beba1b7 tags:
``` python
```
%% Cell type:code id:cc4b204e tags:
``` python
%%time
kmeans_model.fit_transform(from_array(pca_trj).map_partitions(cudf.from_pandas)).compute()
```
%% Cell type:code id:9b8064a8 tags:
``` python
```
%% Cell type:code id:f1a4d919 tags:
``` python
```
%% Cell type:code id:af099f63 tags:
``` python
%%timeit -n 3
# Whole pipeline in a single cell so the cell magic above can time it end to end:
# resolve paths -> load reference and trajectory -> flatten -> GPU dataframe -> fit.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
# Load the reference structure from the .pdb file, center it and keep its topology.
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology
# Load the full trajectory with a single md.load call and center it.
trj = md.load(trajs,top=topol)
trj.center_coordinates()
# Model under test; the commented line is the PCA alternative.
kmeans_model = dask_kmeans(n_clusters=2)
#kmeans_model = dask_pca(n_components=2)
# Flatten each frame into one row: (n_frames, -1).
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
# numpy -> pandas -> cudf -> dask_cudf, split into 4 partitions.
dask_df = dask_cudf.from_cudf(cudf.from_pandas(pd.DataFrame(pca_trj)), npartitions=4)
#proj = kmeans_model.fit_predict(dask_df)
# NOTE(review): scikit-learn-style fit() normally returns the estimator itself; verify that
# the object bound to proj here really exposes .compute() -- the earlier cells called
# .compute() on the result of fit_transform(), not fit().
proj = kmeans_model.fit(dask_df)
proj = proj.compute()
```
%% Cell type:code id:762c3516 tags:
``` python
```
%% Cell type:code id:355ef985 tags:
``` python
import sys

# Extra site-packages directory (from the "mdtraj37" conda env path) made importable here.
MDTRAJ_SITE_PACKAGES = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'

# Append only when missing, so re-running this cell does not pile up duplicate entries.
if MDTRAJ_SITE_PACKAGES not in sys.path:
    sys.path.append(MDTRAJ_SITE_PACKAGES)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import cudf
import cuml
from cuml.dask.cluster import KMeans as dask_kmeans
from cuml.dask.decomposition import PCA as dask_pca
import cupy as cu
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
import os
import pandas as pd
from sklearn.decomposition import PCA
import time
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Locate the benchmark inputs: a .pdb topology and an .xtc trajectory sharing one base name.
# os.path.join supplies the separator, so the paths stay valid whether or not dataDir
# ends with '/'.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = os.path.join(dataDir, 'nsp16_nsp10_6wkq') #~5GB trajectory
#fileName = os.path.join(dataDir, 'NSP12-7-8_6M71') #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
```
%% Cell type:code id:2b526c33 tags:
``` python
%%time
Ref = md.load(topol)
```
%% Cell type:code id:a3a47162 tags:
``` python
%%time
Ref.center_coordinates()
top = Ref.topology
```
%% Cell type:code id:710a340a tags:
``` python
# Create a LocalCUDACluster with its default arguments.
cluster = LocalCUDACluster()
# Connect a distributed Client to that cluster (no explicit timeout is passed here).
client = Client(cluster)
```
%% Cell type:code id:0a1e441c tags:
``` python
client
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
# Build the expression sqrt(sum((a - b) ** 2) / N), where N is the length of b's first axis.
# Both inputs are wrapped with dask.delayed before the arithmetic; elsewhere in this file
# .compute() is called on the returned object to obtain a value.
# NOTE(review): the body's indentation is missing in this export of the notebook.
def rmsd(a, b):
# Wrap both inputs as pure delayed objects.
a = dask.delayed(a, pure=True)
b = dask.delayed(b, pure = True)
# N comes from b only.
# NOTE(review): presumably b is (frames, atoms, 3), which would make N a frame count
# rather than an atom count -- confirm this is the intended normalisation.
N = b.shape[0]
# Shape check left disabled by the author.
#if a.shape[1:] != b.shape[1:]:
# raise ValueError('a and b must have same shape')
# NOTE(review): da.sum/da.sqrt are applied to Delayed objects here -- verify this is supported.
return da.sqrt(da.sum((a - b) ** 2) / N)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
%%time
trj = md.load(trajs,top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
trj.center_coordinates()
rmsd(trj.xyz, Ref.xyz).compute()
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Process the trajectory one md.iterload chunk at a time instead of through a single md.load.
chunk_rmsds = []
for chunks in md.iterload(trajs, top=top):
    chunks.center_coordinates()
    # rmsd() wraps its inputs in dask.delayed, so the value must be requested with
    # .compute() (as the "First load, then compute" cell does); otherwise the result
    # is built and thrown away without ever being evaluated.
    chunk_rmsds.append(rmsd(chunks.xyz, Ref.xyz).compute())
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA / K-MEANS ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
kmeans_model = dask_kmeans(n_clusters=2)
```
%% Cell type:code id:ed7f2f68 tags:
``` python
%%time
# Snapshots are already loaded at 500ps intervals, so every frame is kept as-is.
# Collapse all trailing axes so that each frame becomes one row: shape (n_frames, -1).
all_frames = trj.xyz[:, :, :]
n_frames = all_frames.shape[0]
pca_trj = all_frames.reshape(n_frames, -1)
```
%% Cell type:code id:d7b70474 tags:
``` python
%time
dask_df = from_array(pca_trj)
dask_df = dask_df.map_partitions(cudf.from_pandas)
```
%% Cell type:code id:anticipated-edition tags:
``` python
%%time
proj = kmeans_model.fit_transform(dask_df)
```
%% Cell type:code id:651d9a95 tags:
``` python
%%time
# Write the projection to CSV without rebinding `proj`: assigning to_csv()'s return value
# back to `proj` would replace the projection data with that return value.
csv_result = proj.to_csv('kmeansOut_dask_5Gb.csv')
```
%% Cell type:code id:0beba1b7 tags:
``` python
```
%% Cell type:code id:cc4b204e tags:
``` python
```
%% Cell type:code id:9b8064a8 tags:
``` python
```
%% Cell type:code id:353ca7d0 tags:
``` python
```
%% Cell type:code id:f1a4d919 tags:
``` python
```
%% Cell type:code id:ad343b32 tags:
``` python
# Whole pipeline in a single cell: resolve paths -> load reference -> start cluster ->
# load trajectory -> flatten -> GPU dataframe -> fit.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'
# Load the reference structure from the .pdb file, center it and keep its topology.
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology
# Create a LocalCUDACluster with default arguments and connect a Client to it.
cluster = LocalCUDACluster()
client = Client(cluster)
# Load the full trajectory with a single md.load call and center it.
trj = md.load(trajs,top=topol)
trj.center_coordinates()
#kmeans_model = dask_kmeans(n_clusters=2)
# Despite its name, kmeans_model holds a dask PCA estimator (2 components) in this cell.
kmeans_model = dask_pca(n_components=2)
# Flatten each frame into one row: (n_frames, -1).
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
# Only the first 10000 rows of pca_trj are used for this run.
dask_df = from_array(pca_trj[:10000])
dask_df = dask_df.map_partitions(cudf.from_pandas)
#proj = kmeans_model.fit_predict(dask_df)
# NOTE(review): scikit-learn-style fit() normally returns the estimator itself; verify that
# the object bound to proj here really exposes .compute().
proj = kmeans_model.fit(dask_df)
proj = proj.compute()
# Bare expression as the last line of the cell.
proj
%% Cell type:code id:762c3516 tags:
``` python
```
......@@ -21,10 +21,12 @@
np.set_printoptions(suppress=True)
import time
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import os
```
%% Cell type:markdown id:fe49c4d6 tags:
......@@ -32,11 +34,11 @@
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data'
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory