Commit 7ec53dab authored by Federico Julian Camerota Verdu's avatar Federico Julian Camerota Verdu
Browse files

Added notebooks for the big trajectory

parent 51060640
%% Cell type:code id:355ef985 tags:
``` python
import sys
# Make the packages installed in the `mdtraj37` conda environment importable
# from this kernel.
extra_site_packages = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'
# Guard the append so that re-running the cell does not stack duplicate
# entries on sys.path.
if extra_site_packages not in sys.path:
    sys.path.append(extra_site_packages)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import mdtraj as md
import pandas as pd
import numpy as np
import cupy as cu
np.set_printoptions(suppress=True)
import time
from sklearn.decomposition import PCA
import os
import cudf
import cuml
```
%% Cell type:code id:bf0f03fd tags:
``` python
import numba.cuda
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Directory that holds the benchmark systems.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
# Pick the system by file stem; topology (.pdb) and trajectory (.xtc) share it.
#fileName = f'{dataDir}nsp16_nsp10_6wkq'  # ~5GB trajectory
fileName = f'{dataDir}NSP12-7-8_6M71'  # ~10GB trajectory
topol = f'{fileName}.pdb'
trajs = f'{fileName}.xtc'
```
%% Cell type:code id:a3a47162 tags:
``` python
# Load the reference structure from the PDB file and keep its topology for the
# trajectory readers used further down.
Ref = md.load(topol)
top = Ref.topology
# Center the reference coordinates (the call's return value is not used).
Ref.center_coordinates()
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
    """Per-frame RMSD between coordinates ``a`` and a reference ``b`` with CuPy.

    No superposition is done here; the frames are compared as given.

    Parameters
    ----------
    a : array-like, shape (n_frames, n_atoms, 3)
        Coordinates to evaluate (the notebook passes ``trj.xyz``).
    b : array-like, shape (1, n_atoms, 3) or (n_frames, n_atoms, 3)
        Reference coordinates, broadcast against ``a`` along the first axis
        (the notebook passes ``Ref.xyz``).

    Returns
    -------
    cupy.ndarray, shape (n_frames,)
        Root-mean-square deviation of every frame from the reference.

    Raises
    ------
    ValueError
        If the trailing (per-frame) dimensions of ``a`` and ``b`` differ.
    """
    # cu.array copies the inputs, so the in-place subtraction below never
    # touches the caller's arrays.
    a = cu.array(a)
    b = cu.array(b)
    if a.shape[1:] != b.shape[1:]:
        raise ValueError('a and b must have same shape')
    # The mean is taken over atoms (axis 1).  The leading axis of ``b`` is the
    # number of reference frames (1 for a single structure), not the atom
    # count, so it must not be used as the normaliser.
    n_atoms = a.shape[1]
    a -= b
    return cu.sqrt(cu.sum(cu.power(a, 2), axis=(1, 2)) / n_atoms)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
# Load the complete XTC trajectory in one call, with the PDB file as topology.
trj = md.load(trajs, top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
# Center the loaded trajectory, then evaluate all frames against the reference
# in a single call; the result is the cell's displayed value.
trj.center_coordinates()
rmsd(a=trj.xyz, b=Ref.xyz)
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Read the trajectory piece by piece with md.iterload instead of loading it
# whole; each piece is centered and compared with the reference.  The RMSD
# values themselves are discarded - only the elapsed time is of interest.
for chunks in md.iterload(trajs, top=top):
    chunks.center_coordinates()
    rmsd(a=chunks.xyz, b=Ref.xyz)
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:b285e57b tags:
``` python
%%time
# Two-component PCA with cuML.
model = cuml.PCA(n_components=2)
# Flatten every frame into a single row (first axis kept, the rest merged).
model_trj = trj.xyz.reshape(trj.xyz.shape[0], -1)
# Alternative ways of handing the matrix to cuML that were tried:
#df = cudf.DataFrame(model_trj)
#df = cu.array(model_trj)
#df = cudf.from_pandas(pd.DataFrame(model_trj))
#df = cudf.DataFrame(numba.cuda.to_device(model_trj))
# Route in use: hand the matrix over through numba.cuda.to_device.
df = numba.cuda.to_device(model_trj)
proj = model.fit_transform(df)
```
%% Cell type:markdown id:d577fc8b tags:
%%time
#md_pca = cuml.PCA(n_components=2)
md_pca = cuml.KMeans(n_clusters=2)
%% Cell type:markdown id:1422d791 tags:
%%time
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0] , -1 )
df = cudf.DataFrame(pd.DataFrame(pca_trj))
%% Cell type:markdown id:336698c8 tags:
%%time
proj = md_pca.fit_transform(df)
%% Cell type:code id:02894afa tags:
``` python
```
%% Cell type:code id:355ef985 tags:
``` python
import sys
# Make the packages installed in the `mdtraj37` conda environment importable
# from this kernel.
extra_site_packages = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'
# Guard the append so that re-running the cell does not stack duplicate
# entries on sys.path.
if extra_site_packages not in sys.path:
    sys.path.append(extra_site_packages)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import cudf
import cuml
from cuml.dask.cluster import KMeans as dask_kmeans
from cuml.dask.decomposition import PCA as dask_pca
import cupy as cu
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array, from_pandas
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
import os
import pandas as pd
from sklearn.decomposition import PCA
import time
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Directory that holds the benchmark systems.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
# Pick the system by file stem; topology (.pdb) and trajectory (.xtc) share it.
#fileName = f'{dataDir}nsp16_nsp10_6wkq'  # ~5GB trajectory
fileName = f'{dataDir}NSP12-7-8_6M71'  # ~10GB trajectory
topol = f'{fileName}.pdb'
trajs = f'{fileName}.xtc'
```
%% Cell type:code id:2b526c33 tags:
``` python
# Load the reference structure from the PDB file selected above.
Ref = md.load(topol)
```
%% Cell type:code id:a3a47162 tags:
``` python
# Keep the reference topology for the trajectory readers used further down.
top = Ref.topology
# Center the reference coordinates (the call's return value is not used).
Ref.center_coordinates()
```
%% Cell type:code id:710a340a tags:
``` python
# Start a local Dask CUDA cluster with four workers, UCX as transport with
# NVLink enabled, and an RMM pool of "20GB", then attach a client to it.
cluster = LocalCUDACluster(
    n_workers=4,
    protocol="ucx",
    enable_nvlink=True,
    rmm_pool_size="20GB",
)
# Plain variant without the UCX / RMM options:
#cluster = LocalCUDACluster(n_workers=4)
client = Client(cluster)
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
    """Root-mean-square deviation of ``a`` from ``b``, pooled into one value.

    Both inputs are wrapped as Dask arrays and every block is converted with
    ``cu.array``, so the arithmetic runs on CuPy blocks.  The squared
    deviations are summed over every axis, hence the result is a single
    number for the whole input rather than one value per frame.

    Parameters
    ----------
    a : array-like
        Coordinates to evaluate; assumed to be (n_frames, n_atoms, 3) as
        passed by the notebook (``trj.xyz``).
    b : array-like
        Reference coordinates, broadcastable against ``a`` (the notebook
        passes ``Ref.xyz``).

    Returns
    -------
    dask array (0-d)
        Lazy scalar; call ``.compute()`` to evaluate it.
    """
    a = da.from_array(a).map_blocks(cu.array)
    b = da.from_array(b).map_blocks(cu.array)
    # The sum below covers all frames and all atoms, so the mean has to be
    # taken over n_frames * n_atoms.  ``b.shape[0]`` is the number of
    # reference frames (1 for a single structure) and normalises nothing.
    n_points = a.shape[0] * a.shape[1]
    a -= b
    return da.sqrt(da.sum(da.power(a, 2)) / n_points)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
# Load the complete XTC trajectory in one call, with the PDB file as topology.
trj = md.load(trajs, top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
# Center the loaded trajectory, then build the lazy RMSD graph for all frames
# and evaluate it with .compute().
trj.center_coordinates()
rmsd(a=trj.xyz, b=Ref.xyz).compute()
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Read the trajectory piece by piece with md.iterload instead of loading it
# whole; each piece is centered and its RMSD graph is evaluated right away.
# The computed values are discarded - only the elapsed time is of interest.
for chunks in md.iterload(trajs, top=top):
    chunks.center_coordinates()
    rmsd(a=chunks.xyz, b=Ref.xyz).compute()
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
# Two-component PCA with the Dask flavour of cuML (K-means kept as a
# commented-out alternative).
#kmeans_model = dask_kmeans(n_clusters=2)
model = dask_pca(n_components=2)
# Flatten every frame into a single row (first axis kept, the rest merged).
model_trj = trj.xyz.reshape(trj.xyz.shape[0], -1)
# Split the matrix into four pandas partitions and convert each one to cuDF.
dask_df = from_pandas(
    pd.DataFrame(model_trj), npartitions=4
).map_partitions(cudf.from_pandas)
# The projection is reduced with da.sum before .compute(), so only that
# reduced result is materialised.
proj = da.sum(model.fit_transform(dask_df)).compute()
```
%% Cell type:code id:0beba1b7 tags:
``` python
```
%% Cell type:code id:cc4b204e tags:
``` python
# Presumably a quick check that the kernel still responds - TODO confirm.
print("hello")
```
%% Cell type:code id:9b8064a8 tags:
``` python
```
%% Cell type:code id:353ca7d0 tags:
``` python
```
%% Cell type:code id:f1a4d919 tags:
``` python
```
%% Cell type:code id:ad343b32 tags:
``` python
# Self-contained scratch run: Dask/cuML PCA on the first 10000 frames of the
# smaller trajectory.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'

# Reference structure and its topology.
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology

# NOTE(review): this starts a new CUDA cluster even if a cluster created by an
# earlier cell is still alive - presumably the cell is meant for a fresh
# kernel; confirm.
cluster = LocalCUDACluster()
client = Client(cluster)

trj = md.load(trajs, top=topol)
trj.center_coordinates()

# `model` is either K-means or PCA, depending on which line is commented out.
#model = dask_kmeans(n_clusters=2)
model = dask_pca(n_components=2)

# Flatten every frame into a single row (first axis kept, the rest merged).
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0], -1)
# Only the first 10000 rows are handed to Dask; partitions become cuDF frames.
dask_df = from_array(pca_trj[:10000])
dask_df = dask_df.map_partitions(cudf.from_pandas)

#proj = model.fit_predict(dask_df)
# fit() hands back the fitted estimator, which has no .compute(); use
# fit_transform() so that `proj` is the projected data as a Dask collection.
proj = model.fit_transform(dask_df)
proj = proj.compute()
proj
```
%% Cell type:code id:762c3516 tags:
``` python
```
%% Cell type:code id:355ef985 tags:
``` python
import sys
# Make the packages installed in the `mdtraj37` conda environment importable
# from this kernel.
extra_site_packages = '/dgx/home/userinternal/fcamerot/.conda/envs/mdtraj37/lib/python3.7/site-packages'
# Guard the append so that re-running the cell does not stack duplicate
# entries on sys.path.
if extra_site_packages not in sys.path:
    sys.path.append(extra_site_packages)
print(sys.path)
```
%% Cell type:markdown id:42ef43af tags:
## IMPORTS
%% Cell type:code id:worthy-electric tags:
``` python
import cudf
import cuml
from cuml.dask.cluster import KMeans as dask_kmeans
from cuml.dask.decomposition import PCA as dask_pca
import cupy as cu
import dask
import dask.array as da
from dask_cuda import LocalCUDACluster
import dask_cudf
from dask.dataframe import from_array, from_pandas
from dask.distributed import Client
import mdtraj as md
import numpy as np
np.set_printoptions(suppress=True)
import os
import pandas as pd
from sklearn.decomposition import PCA
import time
```
%% Cell type:markdown id:fe49c4d6 tags:
## INITIAL PREPARATIONS
%% Cell type:code id:narrow-retention tags:
``` python
# Directory that holds the benchmark systems.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
# Pick the system by file stem; topology (.pdb) and trajectory (.xtc) share it.
#fileName = f'{dataDir}nsp16_nsp10_6wkq'  # ~5GB trajectory
fileName = f'{dataDir}NSP12-7-8_6M71'  # ~10GB trajectory
topol = f'{fileName}.pdb'
trajs = f'{fileName}.xtc'
```
%% Cell type:code id:a3a47162 tags:
``` python
# Load the reference structure from the PDB file and keep its topology for the
# trajectory readers used further down.
Ref = md.load(topol)
top = Ref.topology
# Center the reference coordinates (the call's return value is not used).
Ref.center_coordinates()
```
%% Cell type:code id:710a340a tags:
``` python
# Start a local Dask cluster with four workers and attach a client to it.
cluster = dask.distributed.LocalCluster(n_workers=4)
client = Client(cluster)
```
%% Cell type:markdown id:2ed57674 tags:
## RMSD ANALYSIS
%% Cell type:code id:8935173a tags:
``` python
def rmsd(a, b):
    """Lazy per-frame RMSD between coordinates ``a`` and a reference ``b``.

    Both inputs are wrapped as Dask arrays; nothing is evaluated until the
    caller invokes ``.compute()`` on the result.  No superposition is done.

    Parameters
    ----------
    a : array-like, shape (n_frames, n_atoms, 3)
        Coordinates to evaluate (the notebook passes ``trj.xyz``).
    b : array-like, shape (1, n_atoms, 3) or (n_frames, n_atoms, 3)
        Reference coordinates, broadcast against ``a`` along the first axis
        (the notebook passes ``Ref.xyz``).

    Returns
    -------
    dask array, shape (n_frames,)
        Root-mean-square deviation of every frame from the reference.

    Raises
    ------
    ValueError
        If the trailing (per-frame) dimensions of ``a`` and ``b`` differ.
    """
    a = da.from_array(a)
    b = da.from_array(b)
    # Shapes of Dask arrays built from in-memory arrays are known up front,
    # so the consistency check costs nothing and needs no computation.
    if a.shape[1:] != b.shape[1:]:
        raise ValueError('a and b must have same shape')
    # The mean is taken over atoms (axis 1).  The leading axis of ``b`` is the
    # number of reference frames (1 for a single structure), not the atom
    # count, so it must not be used as the normaliser.
    n_atoms = a.shape[1]
    a -= b
    return da.sqrt(da.sum(da.power(a, 2), axis=(1, 2)) / n_atoms)
```
%% Cell type:markdown id:d75ca8e1 tags:
### First load, then compute
%% Cell type:code id:3abf6f02 tags:
``` python
# Load the complete XTC trajectory in one call, with the PDB file as topology.
trj = md.load(trajs, top=topol)
```
%% Cell type:code id:qualified-house tags:
``` python
%%time
# Center the loaded trajectory, then build the lazy RMSD graph for all frames
# and evaluate it with .compute().
trj.center_coordinates()
rmsd(a=trj.xyz, b=Ref.xyz).compute()
```
%% Cell type:markdown id:5147257b tags:
### Iterate and compute
%% Cell type:code id:d5b26b6f tags:
``` python
%%time
# Read the trajectory piece by piece with md.iterload instead of loading it
# whole; each piece is centered and its RMSD graph is evaluated right away.
# The computed values are discarded - only the elapsed time is of interest.
for chunks in md.iterload(trajs, top=top):
    chunks.center_coordinates()
    rmsd(a=chunks.xyz, b=Ref.xyz).compute()
```
%% Cell type:markdown id:78c9f4d4 tags:
## PCA ANALYSIS
%% Cell type:code id:bd26ba99 tags:
``` python
%%time
# NOTE(review): `dask_pca` is imported from cuml.dask, while the partitions
# below stay pandas-backed and the cluster started in this notebook is a CPU
# LocalCluster - presumably cuDF partitions / CUDA workers are required;
# confirm before relying on this cell.
model = dask_pca(n_components=2)
# Flatten every frame into a single row (first axis kept, the rest merged).
model_trj = trj.xyz.reshape(trj.xyz.shape[0], -1)
# Split the matrix into four pandas partitions.
dask_df = from_pandas(pd.DataFrame(model_trj), npartitions=4)
# Fit, project and materialise the projection in one go.
proj = model.fit_transform(dask_df).compute()
```
%% Cell type:code id:cc4b204e tags:
``` python
```
%% Cell type:code id:9b8064a8 tags:
``` python
```
%% Cell type:code id:353ca7d0 tags:
``` python
```
%% Cell type:code id:f1a4d919 tags:
``` python
```
%% Cell type:code id:ad343b32 tags:
``` python
# Self-contained scratch run: Dask/cuML PCA on the first 10000 frames of the
# smaller trajectory.
dataDir = '/dgx/home/userinternal/fcamerot/ligatetrajectorybenchmark/data/'
fileName = dataDir + 'nsp16_nsp10_6wkq' #~5GB trajectory
#fileName = dataDir + 'NSP12-7-8_6M71' #~10GB trajectory
topol = fileName + '.pdb'
trajs = fileName + '.xtc'

# Reference structure and its topology.
Ref = md.load(topol)
Ref.center_coordinates()
top = Ref.topology

# NOTE(review): this starts a new CUDA cluster even if a cluster created by an
# earlier cell is still alive - presumably the cell is meant for a fresh
# kernel; confirm.
cluster = LocalCUDACluster()
client = Client(cluster)

trj = md.load(trajs, top=topol)
trj.center_coordinates()

# `model` is either K-means or PCA, depending on which line is commented out.
#model = dask_kmeans(n_clusters=2)
model = dask_pca(n_components=2)

# Flatten every frame into a single row (first axis kept, the rest merged).
pca_trj = trj.xyz[:,:,:]
pca_trj = pca_trj.reshape(pca_trj.shape[0], -1)
# Only the first 10000 rows are handed to Dask; partitions become cuDF frames.
dask_df = from_array(pca_trj[:10000])
dask_df = dask_df.map_partitions(cudf.from_pandas)

#proj = model.fit_predict(dask_df)
# fit() hands back the fitted estimator, which has no .compute(); use
# fit_transform() so that `proj` is the projected data as a Dask collection.
proj = model.fit_transform(dask_df)
proj = proj.compute()
proj
```
%% Cell type:code id:762c3516 tags:
``` python
```