# Query individual files

Here, we’ll query individual files and inspect their metadata.

This guide can be skipped if you are only interested in how to leverage the overall dataset.

import lamindb as ln
import lnschema_bionty as lb
import anndata as ad

💡 loaded instance: testuser1/test-scrna (lamindb 0.56a1)

ln.track()

💡 notebook imports: anndata==0.9.2 lamindb==0.56a1 lnschema_bionty==0.32.0

💡 Transform(id=3, uid='agayZTonayqAz8', name='Query individual files', short_name='scrna3', version='0', type=notebook, updated_at=2023-10-16 21:47:54, created_by_id=1)

💡 Run(id=3, uid='1z1pcGU3opPZ1kawQaj8', run_at=2023-10-16 21:47:54, transform_id=3, created_by_id=1)

## Access

### Query files by provenance metadata

users = ln.User.lookup()

ln.Transform.filter(created_by=users.testuser1).search("scrna")

	id	__ratio__
name
scRNA-seq	1	90.0
Append a new batch of data	2	36.0
Query individual files	3	36.0

transform = ln.Transform.filter(uid="Nv48yAceNSh8z8").one()

ln.File.filter(transform=transform).df()

	uid	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
1	MpmFBPjOegAuuyRzbGAq	1	None	.h5ad	AnnData	Conde22	None	57615999	6Hu1BywwK6bfIU2Dpku2xZ	sha1-fl	1	1	None	2023-10-16 21:47:05	1

### Query files based on biological metadata

assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()

query = ln.File.filter(
    experimental_factors=assays.single_cell_rna_sequencing,
    species=species.human,
    cell_types=cell_types.gamma_delta_t_cell,
)

query.df()

	uid	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
1	MpmFBPjOegAuuyRzbGAq	1	None	.h5ad	AnnData	Conde22	None	57615999	6Hu1BywwK6bfIU2Dpku2xZ	sha1-fl	1	1	None	2023-10-16 21:47:05	1

## Transform

### Compare gene sets

Get file objects:

query = ln.File.filter()

file1, file2 = query.list()

file1.describe()

File(id=1, uid='MpmFBPjOegAuuyRzbGAq', suffix='.h5ad', accessor='AnnData', description='Conde22', size=57615999, hash='6Hu1BywwK6bfIU2Dpku2xZ', hash_type='sha1-fl', updated_at=2023-10-16 21:47:05)

Provenance:
  🗃️ storage: Storage(id=1, uid='bEvhinvc', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-16 21:45:55, created_by_id=1)
  📔 transform: Transform(id=1, uid='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-10-16 21:46:01, created_by_id=1)
  👣 run: Run(id=1, uid='Tcf2JasApz9P8RE7gqna', run_at=2023-10-16 21:46:01, transform_id=1, created_by_id=1)
  👤 created_by: User(id=1, uid='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-16 21:45:55)
  ⬇️ input_of (core.Run): ['2023-10-16 21:47:14']
Features:
  var: FeatureSet(id=1, uid='ISCdUu2vePwTzsv99UmG', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-10-16 21:46:57, modality_id=1, created_by_id=1)
    'MIR1302-2HG', 'FAM138A', 'OR4F5', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'OR4F29', 'None', 'OR4F16', 'None', 'LINC01409', 'FAM87B', 'LINC01128', 'LINC00115', 'FAM41C', 'None', ...
  obs: FeatureSet(id=2, uid='AeMRCgTDsUFRNwXFlqD9', n=4, registry='core.Feature', hash='xPTyeKYm-_4RH5MEI97t', updated_at=2023-10-16 21:46:58, modality_id=2, created_by_id=1)
    🔗 cell_type (32, bionty.CellType): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage', ...
    🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 3' v3', '10x 5' v2', '10x 5' v1'
    🔗 tissue (17, bionty.Tissue): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow', ...
    🔗 donor (12, core.ULabel): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C', ...
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ tissues (17, bionty.Tissue): 'blood', 'thoracic lymph node', 'spleen', 'lung', 'mesenteric lymph node', 'lamina propria', 'liver', 'jejunal epithelium', 'omentum', 'bone marrow', ...
  🏷️ cell_types (32, bionty.CellType): 'classical monocyte', 'T follicular helper cell', 'memory B cell', 'alveolar macrophage', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated', 'alpha-beta T cell', 'CD4-positive helper T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'macrophage', ...
  🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 3' v3', '10x 5' v2', '10x 5' v1'
  🏷️ ulabels (12, core.ULabel): 'D496', '621B', 'A29', 'A36', 'A35', '637C', 'A52', 'A37', 'D503', '640C', ...

file1.view_flow()

https://d33wubrfki0l68.cloudfront.net/bcac811fef9a79b02856a329ebf96e4ed6e712f1/48dd4/_images/0caeca42f2e6c362870efbf80fc2e6c3d2ca57853f8c51826736a7327de21070.svg

file2.describe()

File(id=2, uid='qscOv2TpO4PaevwVlwRs', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=857752, hash='j6o6e27xPdqHQyT7Em_7MQ', hash_type='md5', updated_at=2023-10-16 21:47:42)

Provenance:
  🗃️ storage: Storage(id=1, uid='bEvhinvc', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-16 21:45:55, created_by_id=1)
  📔 transform: Transform(id=2, uid='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna2', version='0', type='notebook', updated_at=2023-10-16 21:47:14, created_by_id=1)
  👣 run: Run(id=2, uid='xkYC1M6aAdWdAwNLWCmV', run_at=2023-10-16 21:47:14, transform_id=2, created_by_id=1)
  👤 created_by: User(id=1, uid='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-16 21:45:55)
Features:
  var: FeatureSet(id=4, uid='tPDeC1XYr2fGU3UipJot', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-10-16 21:47:42, modality_id=1, created_by_id=1)
    'IL18', 'NPM3', 'S100A9', 'S100A8', 'CNN2', 'ARHGAP45', 'RNF34', 'GPX4', 'S100A6', 'ADISSP', 'S100A4', 'FAM174C', 'SIT1', 'CCDC107', 'RSL1D1', 'TLN1', 'HES4', 'TNFRSF17', 'PCNA', 'RAB13', ...
  obs: FeatureSet(id=5, uid='6AEOiT1l8HTdqkAVleqZ', n=1, registry='core.Feature', hash='PFicj8Uq94k6vPsRmJvl', updated_at=2023-10-16 21:47:42, modality_id=2, created_by_id=1)
    🔗 cell_type (9, bionty.CellType): 'B cell, CD19-positive', 'dendritic cell', 'central memory CD8-positive, alpha-beta T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'monocyte', 'mature T cell'
  external: FeatureSet(id=6, uid='unwxLOfuEVBcpuW10Uhu', n=2, registry='core.Feature', hash='m_3u0np1BS5T5HSnawJY', updated_at=2023-10-16 21:47:42, modality_id=2, created_by_id=1)
    🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
    🔗 species (1, bionty.Species): 'human'
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ cell_types (9, bionty.CellType): 'B cell, CD19-positive', 'dendritic cell', 'central memory CD8-positive, alpha-beta T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'monocyte', 'mature T cell'
  🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'

file2.view_flow()

https://d33wubrfki0l68.cloudfront.net/a790fa5282d5a89c89c12610211984d140300446/91f19/_images/edf1e0c8308f7f8890550d899e3fe69163c1e0523beeffa1edb67e521f3a1f36.svg

Load files into memory:

file1_adata = file1.load()
file2_adata = file2.load()

Shared genes can be computed directly from the files' registered feature sets, so this step does not depend on the data loaded above:

file1_genes = file1.features["var"]
file2_genes = file2.features["var"]

shared_genes = file1_genes & file2_genes
len(shared_genes)

shared_genes.list("symbol")[:10]

['HES4',
 'TNFRSF4',
 'SSU72',
 'PARK7',
 'RBP7',
 'SRM',
 'MAD2L2',
 'AGTRAP',
 'TNFRSF1B',
 'EFHD2']

### Compare cell types

file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()

shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

['CD8-positive, alpha-beta memory T cell',
 'CD16-positive, CD56-dim natural killer cell, human']

We can now subset the two datasets by shared cell types:

file1_adata_subset = file1_adata[
    file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]

file2_adata_subset = file2_adata[
    file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]

Concatenate the subsetted datasets (`ad.concat` uses an inner join by default, so only genes present in both datasets are kept):

adata_concat = ad.concat(
    [file1_adata_subset, file2_adata_subset],
    label="file",
    keys=[file1.description, file2.description],
)
adata_concat

AnnData object with n_obs × n_vars = 244 × 749
    obs: 'cell_type', 'file'
    obsm: 'X_umap'

adata_concat.obs.value_counts()

cell_type                                           file               
CD8-positive, alpha-beta memory T cell              Conde22                120
CD16-positive, CD56-dim natural killer cell, human  Conde22                114
CD8-positive, alpha-beta memory T cell              10x reference adata      7
CD16-positive, CD56-dim natural killer cell, human  10x reference adata      3
dtype: int64