Probability of Generation

The generation probability, Pgen, defined by Walczyk and colleagues and implemented in Python by Sethna et al. (2019) is another way of filtering TCRs and TCR neighborhoods that are more frequent than expected based on their probability of being produced through recombination in early phases of immune development.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
"""
How to add pgen estimates to human alpha/beta CDR3s
"""
import pandas as pd
from tcrdist.pgen import OlgaModel
from tcrdist import mappers 
from tcrdist.repertoire import TCRrep
from tcrdist.setup_tests import download_and_extract_zip_file

df = pd.read_csv("dash_human.csv")

tr = TCRrep(cell_df = df.sample(5, random_state = 3), 
            organism = 'human', 
            chains = ['alpha','beta'], 
            db_file = 'alphabeta_gammadelta_db.tsv', 
            store_all_cdr = False)

olga_beta  = OlgaModel(chain_folder = "human_T_beta", recomb_type="VDJ")
olga_alpha = OlgaModel(chain_folder = "human_T_alpha", recomb_type="VJ")

tr.clone_df['pgen_cdr3_b_aa'] = olga_beta.compute_aa_cdr3_pgens(
    CDR3_seq = tr.clone_df.cdr3_b_aa)

tr.clone_df['pgen_cdr3_a_aa'] = olga_alpha.compute_aa_cdr3_pgens(
    CDR3_seq = tr.clone_df.cdr3_a_aa)

tr.clone_df[['cdr3_b_aa', 'pgen_cdr3_b_aa', 'cdr3_a_aa','pgen_cdr3_a_aa']]
"""
            cdr3_b_aa  pgen_cdr3_b_aa          cdr3_a_aa  pgen_cdr3_a_aa
0    CASSETSGRSPYEQYF    1.922623e-09    CAVRPGYSSASKIIF    1.028741e-06
1  CASSPGLASPYSYNEQFF    1.554569e-11    CAVRVLMEYGNKLVF    1.128865e-08
2       CASSSRSTDTQYF    5.184760e-07     CAGAGGGSQGNLIF    2.512580e-06
3        CASSIGVYGYTF    8.576919e-08  CAFMSNAGGTSYGKLTF    1.832054e-07
4  CASSLLVSGVSSTDTQYF    2.143508e-11       CAVIGEGGKLTF    5.698448e-12
"""

If you want to compute many pgens this example shows how you can use parmap to accelerate the computation if you have multiple cores.

1
2
3
4
5
6
7
8
"""
Really simple example of using multiple cpus to 
speed up computation of pgens with olga.
"""
import parmap
from tcrdist.pgen import OlgaModel
olga_beta  = OlgaModel(chain_folder = "human_T_beta", recomb_type="VDJ")
parmap.map(olga_beta.compute_aa_cdr3_pgen, ['CASSYRVGTDTQYF', 'CATSTNRGGTPADTQYF', 'CASQGDSFNSPLHF', 'CASSPWTGSMALHF'])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""
Test speed up of computation of many pgens using parmap to make use of more than one cpu

For 1000 CDR3 

Finished 'olga_in_series' in 26.3842 secs with 1 core
Finished 'olga_in_parmap' in 6.2384 secs with 6 cores
"""
import numpy as np
import pandas as pd
import parmap
from tcrdist.pgen import OlgaModel
from tcrdist.speed import timer
from tcrdist.adpt_funcs import _valid_cdr3

from tcrdist.setup_tests import download_and_extract_zip_file
download_and_extract_zip_file( 'cdr3_beta_500K.zip', source = "dropbox", dest = ".")

olga_beta  = OlgaModel(chain_folder = "human_T_beta", recomb_type="VDJ")

n = 1000
df = pd.read_csv('cdr3_beta_500K.csv')
inputlist = df.iloc[:,0].to_list()[0:n]
inputlist  = [x for x in inputlist if _valid_cdr3(x)]

@timer
def olga_in_series(f = olga_beta.compute_aa_cdr3_pgen, input = inputlist):
    return [f(x) for x in input]

@timer
def olga_in_parmap(f = olga_beta.compute_aa_cdr3_pgen, input = inputlist, **kwargs):
    return parmap.map(f, input, pm_pbar=True, **kwargs)

r1 = olga_in_series(f = olga_beta.compute_aa_cdr3_pgen, input = inputlist)
r2 = olga_in_parmap(f = olga_beta.compute_aa_cdr3_pgen, input = inputlist)

assert np.all(r1 == r2)

References

Zachary Sethna, Yuval Elhanati, Curtis G Callan, Aleksandra M Walczak, Thierry Mora Bioinformatics (2019) OLGA: fast computation of generation probabilities of B- and T-cell receptor amino acid sequences and motifs