Simulate CDR3s

Simulation of 1 million CDR3s each for human (alpha and beta chains) and mouse (beta chain only).

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
import olga.load_model as load_model
import olga.generation_probability as pgen
import olga.sequence_generation as seq_gen
import pandas as pd


def generate_simulated_beta_seqs(params_file_name = 'tcrdist/default_models/human_T_beta/model_params.txt',
                            marginals_file_name = 'tcrdist/default_models/human_T_beta/model_marginals.txt',
                            V_anchor_pos_file ='tcrdist/default_models/human_T_beta/V_gene_CDR3_anchors.csv',
                            J_anchor_pos_file = 'tcrdist/default_models/human_T_beta/J_gene_CDR3_anchors.csv',
                            output_cols = ['cdr3_b_aa', "v_b_gene",'j_b_gene'],
                            n = 100000):
    """
    Simulate random productive CDR3s with an OLGA VDJ (beta-chain) model.

    Parameters
    ----------
    params_file_name : str
        Path to the IGoR-format model parameters file.
    marginals_file_name : str
        Path to the IGoR-format model marginals file.
    V_anchor_pos_file : str
        Path to the V gene CDR3 anchor position file.
    J_anchor_pos_file : str
        Path to the J gene CDR3 anchor position file.
    output_cols : list
        Names of the three output columns, in the order
        (CDR3, V gene, J gene).
    n : int
        Number of sequences to simulate.

    Returns
    -------
    pandas.DataFrame
        One row per simulated sequence with the columns named by
        `output_cols`.
    """
    # Genomic reference data: gene segments plus CDR3 anchor positions.
    vdj_genomic = load_model.GenomicDataVDJ()
    vdj_genomic.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)

    # Generative (recombination) model and the sampler built on top of it.
    vdj_model = load_model.GenerativeModelVDJ()
    vdj_model.load_and_process_igor_model(marginals_file_name)
    sampler = seq_gen.SequenceGenerationVDJ(vdj_model, vdj_genomic)

    # The sampler reports V and J as integer positions into genV / genJ;
    # the first field of each entry is used as the gene name.
    v_name_of = dict(enumerate(entry[0] for entry in vars(vdj_genomic)['genV']))
    j_name_of = dict(enumerate(entry[0] for entry in vars(vdj_genomic)['genJ']))

    # Elements 1..3 of each sampled tuple are kept (used here as CDR3,
    # V index, J index); element 0 is discarded.
    sampled = []
    for _ in range(n):
        sampled.append(sampler.gen_rnd_prod_CDR3()[1:4])

    records = []
    for cdr3, v_idx, j_idx in sampled:
        records.append((cdr3, v_name_of[v_idx], j_name_of[j_idx]))

    return pd.DataFrame(records, columns = output_cols)

def generate_simulated_alpha_seqs(params_file_name = 'tcrdist/default_models/human_T_alpha/model_params.txt',
                            marginals_file_name = 'tcrdist/default_models/human_T_alpha/model_marginals.txt',
                            V_anchor_pos_file ='tcrdist/default_models/human_T_alpha/V_gene_CDR3_anchors.csv',
                            J_anchor_pos_file = 'tcrdist/default_models/human_T_alpha/J_gene_CDR3_anchors.csv',
                            output_cols = ['cdr3_a_aa', "v_a_gene",'j_a_gene'],
                            n = 100000):
    """
    Simulate random productive CDR3s with an OLGA VJ (alpha-chain) model.

    Parameters
    ----------
    params_file_name : str
        Path to the IGoR-format model parameters file.
    marginals_file_name : str
        Path to the IGoR-format model marginals file.
    V_anchor_pos_file : str
        Path to the V gene CDR3 anchor position file.
    J_anchor_pos_file : str
        Path to the J gene CDR3 anchor position file.
    output_cols : list
        Names of the three output columns, in the order
        (CDR3, V gene, J gene).
    n : int
        Number of sequences to simulate.

    Returns
    -------
    pandas.DataFrame
        One row per simulated sequence with the columns named by
        `output_cols`.
    """
    # Genomic reference data: gene segments plus CDR3 anchor positions.
    vj_genomic = load_model.GenomicDataVJ()
    vj_genomic.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)

    # Generative (recombination) model and the sampler built on top of it.
    vj_model = load_model.GenerativeModelVJ()
    vj_model.load_and_process_igor_model(marginals_file_name)
    sampler = seq_gen.SequenceGenerationVJ(vj_model, vj_genomic)

    # The sampler reports V and J as integer positions into genV / genJ;
    # the first field of each entry is used as the gene name.
    v_name_of = dict(enumerate(entry[0] for entry in vars(vj_genomic)['genV']))
    j_name_of = dict(enumerate(entry[0] for entry in vars(vj_genomic)['genJ']))

    # Elements 1..3 of each sampled tuple are kept (used here as CDR3,
    # V index, J index); element 0 is discarded.
    sampled = []
    for _ in range(n):
        sampled.append(sampler.gen_rnd_prod_CDR3()[1:4])

    records = []
    for cdr3, v_idx, j_idx in sampled:
        records.append((cdr3, v_name_of[v_idx], j_name_of[j_idx]))

    return pd.DataFrame(records, columns = output_cols)

if __name__ == "__main__":
    """
    Using Olga See: 
    ---------------
    Zachary Sethna, Yuval Elhanati, Curtis G Callan, Aleksandra M Walczak, Thierry Mora
    `Bioinformatics (2019) <https://doi.org/10.1093/bioinformatics/btz035>`_ 
    OLGA: fast computation of generation probabilities of B- and T-cell receptor amino acid sequences and motifs


    Generate 1000K (1M) CDR3s using default Olga Models
    Human (Alpha/Beta) and Mouse (Beta)

    Each simulation is written to its own CSV file in the current
    working directory:

    human_T_beta_sim1000K.csv
    human_T_alpha_sim1000K.csv
    mouse_T_beta_sim1000K.csv
    
    contained in: 
    olga_T_alpha_beta_1000K_simulated_cdr3.zip 
    """
    # Human beta chain (VDJ model).
    dfb= generate_simulated_beta_seqs(params_file_name = 'tcrdist/default_models/human_T_beta/model_params.txt',
                                marginals_file_name = 'tcrdist/default_models/human_T_beta/model_marginals.txt',
                                V_anchor_pos_file ='tcrdist/default_models/human_T_beta/V_gene_CDR3_anchors.csv',
                                J_anchor_pos_file = 'tcrdist/default_models/human_T_beta/J_gene_CDR3_anchors.csv',
                                output_cols = ['cdr3_b_aa', "v_b_gene",'j_b_gene'], n = 1000000)
    dfb.to_csv('human_T_beta_sim1000K.csv', index = False)

    # Human alpha chain (VJ model).
    dfa = generate_simulated_alpha_seqs(params_file_name = 'tcrdist/default_models/human_T_alpha/model_params.txt',
                                marginals_file_name = 'tcrdist/default_models/human_T_alpha/model_marginals.txt',
                                V_anchor_pos_file ='tcrdist/default_models/human_T_alpha/V_gene_CDR3_anchors.csv',
                                J_anchor_pos_file = 'tcrdist/default_models/human_T_alpha/J_gene_CDR3_anchors.csv',
                                output_cols = ['cdr3_a_aa', "v_a_gene",'j_a_gene'],
                                n = 1000000)

    dfa.to_csv('human_T_alpha_sim1000K.csv', index = False)

    # Mouse beta chain (VDJ model); reuses the beta generator with the
    # mouse model files.
    dfb= generate_simulated_beta_seqs(params_file_name = 'tcrdist/default_models/mouse_T_beta/model_params.txt',
                                marginals_file_name = 'tcrdist/default_models/mouse_T_beta/model_marginals.txt',
                                V_anchor_pos_file ='tcrdist/default_models/mouse_T_beta/V_gene_CDR3_anchors.csv',
                                J_anchor_pos_file = 'tcrdist/default_models/mouse_T_beta/J_gene_CDR3_anchors.csv',
                                output_cols = ['cdr3_b_aa', "v_b_gene",'j_b_gene'], n = 1000000)
    # Previously the mouse simulation was generated but never saved, so the
    # third advertised output file was never produced.
    dfb.to_csv('mouse_T_beta_sim1000K.csv', index = False)

These files can be downloaded directly using:

from tcrdist.setup_tests import download_and_extract_zip_file
download_and_extract_zip_file('olga_T_alpha_beta_1000K_simulated_cdr3.zip')

Simulation of CDR3s relies on OLGA by Sethna and colleagues (2019).

References

Zachary Sethna, Yuval Elhanati, Curtis G Callan, Aleksandra M Walczak, Thierry Mora. OLGA: fast computation of generation probabilities of B- and T-cell receptor amino acid sequences and motifs. Bioinformatics (2019). https://doi.org/10.1093/bioinformatics/btz035