Meta-Clonotypes

The script for meta-clonotype discovery contains a function (find_metaclonotypes). It encapsulates a workflow we used for finding meta-clonotypes in antigen-enriched data as described in ‘TCR meta-clonotypes for biomarker discovery with tcrdist3: quantification of public, HLA-restricted TCR biomarkers of SARS-CoV-2 infection’ bioRxiv 2020 .

To ensure you have the necessary background reference files in your environment or the tcrdist3 docker container, run the following before getting started:

apt-get install unzip
python3 -c "from tcrsampler.setup_db import install_all_next_gen; install_all_next_gen(dry_run = False)"

Meta-Clonotype Discovery

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
"""
2020-12-18
Seattle, WA
HLA CLASS I restricted meta-clonotype discovery.

This functions encapsulates a complete work flow for finding meta-clonotypes 
in antigen-enriched data.

prepare docker container:
apt-get install unzip
python3 -c "from tcrsampler.setup_db import install_all_next_gen; install_all_next_gen(dry_run = False)"
"""
import sys
import os
import numpy as np
import pandas as pd
import scipy.sparse
import matplotlib.pyplot as plt
from tcrdist.paths import path_to_base
from tcrdist.repertoire import TCRrep
from tcrdist.automate import auto_pgen
from tcrsampler.sampler import TCRsampler
from tcrdist.background import get_stratified_gene_usage_frequency
from tcrdist.background import sample_britanova
from tcrdist.background import make_gene_usage_counter, get_gene_frequencies
from tcrdist.background import calculate_adjustment, make_gene_usage_counter
from tcrdist.background import make_vj_matched_background, make_flat_vj_background
from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs
from tcrdist.centers import calc_radii
from tcrdist.public import _neighbors_variable_radius
from tcrdist.public import _neighbors_sparse_variable_radius
from tcrdist.regex import _index_to_regex_str

def find_metaclonotypes(
    project_path = "tutorial48",
    source_path = os.path.join(path_to_base,'tcrdist','data','covid19'),
    antigen_enriched_file = 'mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv',
    ncpus = 4, 
    seed = 3434):
    """
    This functions encapsulates a complete 
    workflow for finding meta-clonotypes in antigen-enriched data.
    """
    np.random.seed(seed)
    if not os.path.isdir(project_path):
        os.mkdir(project_path)
    ############################################################################
    # Step 1: Select and load a antigen-enriched (sub)repertoire.           ####
    ############################################################################
    print(f"INITIATING A TCRrep() with {antigen_enriched_file}")
    assert os.path.isfile(os.path.join(source_path, antigen_enriched_file))
        # Read file into a Pandas DataFrame <df>
    df = pd.read_csv(os.path.join(source_path, antigen_enriched_file))
        # Drop cells without any gene usage information
    df = df[( df['v_b_gene'].notna() ) & (df['j_b_gene'].notna()) ]
        # Initialize a TCRrep class, using ONLY columns that are complete and unique define a a clone.
        # Class provides a 'count' column if non is present
        # Counts of identical subject:VCDR3 'clones' will be aggregated into a TCRrep.clone_df.
    from tcrdist.repertoire import TCRrep
    tr = TCRrep(cell_df = df[['subject','cell_type','v_b_gene', 'j_b_gene', 'cdr3_b_aa']], 
                organism = "human", 
                chains = ['beta'], 
                compute_distances = True)
    tr.cpus = ncpus
    ############################################################################
    # Step 1.1: Estimate Probability of Generation                          ####
    ############################################################################
    ### It will be useful later to know the pgen of each
    from tcrdist.automate import auto_pgen
    print(f"COMPUTING PGEN WITH OLGA (Sethna et al 2018)")
    print("FOR ANTIGEN-ENRICHED CLONES TO BE USED FOR SUBSEQUENT ANALYSES")
    auto_pgen(tr)

    # Tip: Users of tcrdist3 should be aware that by default a <TCRrep.clone_df> 
    # DataFrame is created out of non-redundant cells in the cell_df, and 
    # pairwise distance matrices automatically computed.
    # Notice that attributes <tr.clone_df>  and  <tr.pw_beta> , <tr.pw_cdr3_b_aa>, 
    # are immediately accessible.
    # Attributes <tr.pw_pmhc_b_aa>, <tr.pw_cdr2_b_aa>, and <tr.pw_cdr1_b_aa>  
    # are also available if <TCRrep.store_all_cdr> is set to True.
    # For large datasets, i.e., >15,000 clones, this approach may consume too much 
    # memory so <TCRrep.compute_distances> is automatically set to False. 
                                    
    ############################################################################
    # Step 2: Synthesize an Inverse Probability Weighted VJ Matched Background #
    ############################################################################
    # Generating an appropriate set of unenriched reference TCRs is important; for
    # each set of antigen-associated TCRs, discovered by MIRA, we created a two part
    # background. One part consists of 100,000 synthetic TCRs whose V-gene and J-gene
    # frequencies match those in the antigen-enriched repertoire, using the software
    # OLGA (Sethna et al. 2019; Marcou et al. 2018). The other part consists of
    # 100,000 umbilical cord blood TCRs sampled uniformly from 8 subjects (Britanova
    # et al., 2017). This mix balances dense sampling of sequences near the
    # biochemical neighborhoods of interest with broad sampling of TCRs from an
    # antigen-naive repertoire. Importantly, we adjust for the biased sampling by
    # using the V- and J-gene frequencies observed in the cord-blood data (see
    # Methods for details about inverse probability weighting adjustment). Using this
    # approach we are able to estimate the abundance of TCRs similar to a centroid
    # TCR in an unenriched background repertoire of ~1,000,000 TCRs, using a
    # comparatively modest background dataset of 200,000 TCRs. While this estimate
    # may underestimate the true specificity, since some of the neighborhood TCRs in
    # the unenriched background repertoire may in fact recognize the antigen of
    # interest, it is useful for prioritizing neighborhoods and selecting a radius
    # for each neighborhood that balances sensitivity and specificity.
    # Initialize a TCRsampler -- human, beta, umbilical cord blood from 8 people.
    print(f"USING tcrsampler TO CONSTRUCT A CUSTOM V-J MATCHED BACKGROUND")
    from tcrsampler.sampler import TCRsampler
    ts = TCRsampler(default_background = 'britanova_human_beta_t_cb.tsv.sampler.tsv')
    # Stratify sample so that each subject contributes similarly to estimate of 
    # gene usage frequency
    from tcrdist.background import get_stratified_gene_usage_frequency
    ts = get_stratified_gene_usage_frequency(ts = ts, replace = True) 
    # Synthesize an inverse probability weighted V,J gene background that matches 
    # usage in your enriched repertoire 
    df_vj_background = tr.synthesize_vj_matched_background(ts = ts, chain = 'beta')
    # Get a randomly drawn stratified sampler of beta, cord blood from 
    # Britanova et al. 2016 
    # Dynamics of Individual T Cell Repertoires: From Cord Blood to Centenarians
    from tcrdist.background import  sample_britanova
    df_britanova_100K = sample_britanova(size = 100000)
    # Append frequency columns using, using sampler above
    df_britanova_100K = get_gene_frequencies(ts = ts, df = df_britanova_100K)
    df_britanova_100K['weights'] = 1
    df_britanova_100K['source'] = "stratified_random"
    # Combine the two parts of the background into a single DataFrame
    df_bkgd = pd.concat([df_vj_background.copy(), df_britanova_100K.copy()], axis = 0).\
        reset_index(drop = True)                                              
    # Assert that the backgrounds have the expected number of rows.
    assert df_bkgd.shape[0] == 200000
    # Save the background for future use
    background_outfile = os.path.join(project_path, f"{antigen_enriched_file}.olga100K_brit100K_bkgd.csv")
    print(f'WRITING {background_outfile}')
    df_bkgd.to_csv(background_outfile, index = False)
    # Load the background to a TCRrep without computing pairwise distances 
    # (i.e., compute_distances = False)
    tr_bkgd = TCRrep(
        cell_df = df_bkgd,
        organism = "human", 
        chains = ['beta'], 
        compute_distances = False)
    # Compute rectangular distances. Those are, distances between each clone in 
    # the antigen-enriched repertoire and each TCR in the background.
    # With a single 1 CPU and < 10GB RAM, 5E2x2E5 = 100 million pairwise distances, 
    # across CDR1, CDR2, CDR2.5, and CDR3 
    # 1min 34s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) 
    # %timeit -r 1 tr.compute_rect_distances(df = tr.clone_df, df2 = tr_bkdg.clone_df, store = False)
    ############################################################################
    # Step 4: Calculate Distances                                          #####
    ############################################################################
    print(f"COMPUTING RECTANGULARE DISTANCE")
    tr.compute_sparse_rect_distances(
        df = tr.clone_df, 
        df2 = tr_bkgd.clone_df,
        radius=50,
        chunk_size = 100)
    scipy.sparse.save_npz(os.path.join(project_path, f"{antigen_enriched_file}.rw_beta.npz"), tr.rw_beta)
        # Tip: For larger dataset you can use a sparse implementation: 
        # 30.8 s ± 0 ns per loop ; tr.cpus = 6
        # %timeit -r tr.compute_sparse_rect_distances(df = tr.clone_df, df2 = tr_bkdg.clone_df,radius=50, chunk_size=85)
    ############################################################################
    # Step 5: Examine Density ECDFS                                        #####
    ############################################################################
        # Investigate the density of neighbors to each TCR, based on expanding 
        # distance radius.
    from tcrdist.ecdf import distance_ecdf, _plot_manuscript_ecdfs
    import matplotlib.pyplot as plt
        # Compute empirical cumulative density function (ecdf)
        # Compare Antigen Enriched TCRs (against itself).
    thresholds, antigen_enriched_ecdf = distance_ecdf(
        tr.pw_beta,
        thresholds=range(0,50,2))
        # Compute empirical cumulative density function (ecdf)
        # Compare Antigen Enriched TCRs (against) 200K probability 
        # inverse weighted background
    thresholds, background_ecdf = distance_ecdf(
        tr.rw_beta,
        thresholds=range(0,50,2),
        weights= tr_bkgd.clone_df['weights'], 
        absolute_weight = True)
        # plot_ecdf similar to tcrdist3 manuscript #
    antigen_enriched_ecdf[antigen_enriched_ecdf == antigen_enriched_ecdf.min()] = 1E-10
    f1 = _plot_manuscript_ecdfs(
        thresholds, 
        antigen_enriched_ecdf, 
        ylab= 'Proportion of Antigen Enriched TCRs', 
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(), 
        min_freq=1E-10)
    f1.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_AER_plot.png'))
    f2 = _plot_manuscript_ecdfs(
        thresholds,
        background_ecdf,
        ylab= 'Proportion of Reference TCRs',
        cdr3_len=tr.clone_df.cdr3_b_aa.str.len(),
        min_freq=1E-10)
    f2.savefig(os.path.join(project_path, f'{antigen_enriched_file}.ecdf_BUR_plot.png'))
    ############################################################################
    # Step 6: Find optimal radii  (theta = 1E5                             #####
    ############################################################################
    # To ascertain which meta-clonotypes are likely to be most specific, 
    # take advantage of an existing function <bkgd_cntrl_nn2>.                                                                                                                                  
    #  d888   .d8888b.  8888888888     888888888  
    # d8888  d88P  Y88b 888            888        
    #   888  888    888 888            888        
    #   888  888    888 8888888        8888888b.  
    #   888  888    888 888                 "Y88b 
    #   888  888    888 888      888888       888 
    #   888  Y88b  d88P 888            Y88b  d88P 
    # 8888888 "Y8888P"  8888888888      "Y8888P"                                         
   
    level_tag = '1E5'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df  = bkgd_cntl_nn2(
        tr               = tr,
        tr_background    = tr_bkgd,
        weights          = tr_bkgd.clone_df.weights,
        ctrl_bkgd        = 10**-5, 
        col              = 'cdr3_b_aa',
        add_cols         = ['v_b_gene', 'j_b_gene'],
        ncpus            = 4,
        include_seq_info = True,
        thresholds       = [x for x in range(0,50,2)],
        generate_regex   = True,
        test_regex       = True,
        forced_max_radius = 36)

    ############################################################################
    # Step 6.2: (theta = 1E5) ALL meta-clonotypes .tsv file                   ##
    ############################################################################
    # save center to project_path for future use
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    
    # Many of meta-clonotypes contain redundant information. 
    # We can winnow down to less-redundant list. We do this 
    # by ranking clonotypes from most to least specific. 
        # <min_nsubject> is minimum publicity of the meta-clonotype,  
        # <min_nr> is minimum non-redundancy
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list = centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x : len(x))
    # We determine how many <nsubjects> are in the set of neighbors 
    centers_df['nsubject']  = centers_df['neighbors'].\
            apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df = centers_df, 
        rank_column = 'chi2joint', 
        min_nsubject = 2, 
        min_nr = 1)
    ############################################################################
    # Step 6.3:  (theta = 1E5) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    ############################################################################
    # Step 6.4: (theta = 1E5) Output Meta-Clonotypes HTML Summary            ###
    ############################################################################
    # Here we can make a svg logo for each NR meta-clonotype
    if ranked_centers_df.shape[0] > 0:
        from progress.bar import IncrementalBar
        from tcrdist.public import make_motif_logo
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max = ranked_centers_df.shape[0])
        for i,r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene   = r[v_gene_name]
            svg, svg_raw = make_motif_logo( tcrsampler = ts, 
                                            pwmat = tr.pw_beta,
                                            clone_df = tr.clone_df,
                                            centroid = centroid ,
                                            v_gene = v_gene ,
                                            radius = r['radius'],
                                            pwmat_str = 'pw_beta',
                                            cdr3_name = 'cdr3_b_aa',
                                            v_name = 'v_b_gene',
                                            gene_names = ['v_b_gene','j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next();bar.finish()
        ranked_centers_df['svg']      = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')
        labels =['cdr3_b_aa','v_b_gene', 'j_b_gene', 'pgen',
                'radius', 'regex','nsubject','K_neighbors', 
                'bkgd_hits_weighted','chi2dist','chi2re','chi2joint']
        
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # 888    888 88888888888 888b     d888 888      
        # 888    888     888     8888b   d8888 888      
        # 888    888     888     88888b.d88888 888      
        # 8888888888     888     888Y88888P888 888      
        # 888    888     888     888 Y888P 888 888      
        # 888    888     888     888  Y8P  888 888      
        # 888    888     888     888   "   888 888      
        # 888    888     888     888       888 88888888
        with open(output_html_name, 'w') as output_handle:
            for i,r in ranked_centers_df.iterrows():
                #import pdb; pdb.set_trace()
                svg, svg_raw = r['svg'],r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")
    # To ascertain which meta-clonotypes are likely to be most specific, 
    # take advantage of an existing function <bkgd_cntrl_nn2>.       
    #  d888   .d8888b.  8888888888       .d8888b.  
    # d8888  d88P  Y88b 888             d88P  Y88b 
    #   888  888    888 888             888        
    #   888  888    888 8888888         888d888b.  
    #   888  888    888 888             888P "Y88b 
    #   888  888    888 888      888888 888    888 
    #   888  Y88b  d88P 888             Y88b  d88P 
    # 8888888 "Y8888P"  8888888888       "Y8888P" 
    ############################################################################
    # Step 6.5: Find optimal radii  (theta = 1E6)                            ###
    ############################################################################
    level_tag = '1E6'
    from tcrdist.neighbors import bkgd_cntl_nn2
    centers_df  = bkgd_cntl_nn2(
        tr               = tr,
        tr_background    = tr_bkgd,
        weights          = tr_bkgd.clone_df.weights,
        ctrl_bkgd        = 10**-6, 
        col              = 'cdr3_b_aa',
        add_cols         = ['v_b_gene', 'j_b_gene'],
        ncpus            = 4,
        include_seq_info = True,
        thresholds       = [x for x in range(0,50,2)],
        generate_regex   = True,
        test_regex       = True,
        forced_max_radius = 36)
    ############################################################################
    # Step 6.6: (theta = 1E6) ALL meta-clonotypes .tsv file                   ##
    ############################################################################
    # save center to project_path for future use
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )
    
    # Many of meta-clonotypes contain redundant information. 
    # We can winnow down to less-redundant list. We do this 
    # by ranking clonotypes from most to least specific. 
        # <min_nsubject> is minimum publicity of the meta-clonotype,  
        # <min_nr> is minimum non-redundancy
    # Add neighbors, K_neighbors, and nsubject columns
    from tcrdist.public import _neighbors_variable_radius, _neighbors_sparse_variable_radius
    centers_df['neighbors'] = _neighbors_variable_radius(pwmat=tr.pw_beta, radius_list = centers_df['radius'])
    centers_df['K_neighbors'] = centers_df['neighbors'].apply(lambda x : len(x))
    # We determine how many <nsubjects> are in the set of neighbors 
    centers_df['nsubject']  = centers_df['neighbors'].\
            apply(lambda x: tr.clone_df['subject'].iloc[x].nunique())
    centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    from tcrdist.centers import rank_centers
    ranked_centers_df = rank_centers(
        centers_df = centers_df, 
        rank_column = 'chi2joint', 
        min_nsubject = 2, 
        min_nr = 1)
    ############################################################################
    # Step 6.7:  (theta = 1E6) NR meta-clonotypes .tsv file                  ###
    ############################################################################
    # Output, ready to search bulk data.
    ranked_centers_df.to_csv( os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.tsv'), sep = "\t" )

    ############################################################################
    # Step 6.8: (theta = 1E6) Output Meta-Clonotypes HTML Summary            ###
    ############################################################################
    # Here we can make a svg logo for each meta-clonotype
    from progress.bar import IncrementalBar
    from tcrdist.public import make_motif_logo
    if ranked_centers_df.shape[0] > 0:
        cdr3_name = 'cdr3_b_aa'
        v_gene_name = 'v_b_gene'
        svgs = list()
        svgs_raw = list()
        bar = IncrementalBar('Processing', max = ranked_centers_df.shape[0])
        for i,r in ranked_centers_df.iterrows():
            bar.next()
            centroid = r[cdr3_name]
            v_gene   = r[v_gene_name]
            svg, svg_raw = make_motif_logo( tcrsampler = ts, 
                                            pwmat = tr.pw_beta,
                                            clone_df = tr.clone_df,
                                            centroid = centroid ,
                                            v_gene = v_gene ,
                                            radius = r['radius'],
                                            pwmat_str = 'pw_beta',
                                            cdr3_name = 'cdr3_b_aa',
                                            v_name = 'v_b_gene',
                                            gene_names = ['v_b_gene','j_b_gene'])
            svgs.append(svg)
            svgs_raw.append(svg_raw)
        bar.next();bar.finish()
        ranked_centers_df['svg']      = svgs
        ranked_centers_df['svg_raw'] = svgs_raw

        def shrink(s):
            return s.replace('height="100%"', 'height="20%"').replace('width="100%"', 'width="20%"')
        labels =['cdr3_b_aa', 'v_b_gene', 'j_b_gene', 'pgen', 'radius', 'regex','nsubject','K_neighbors', 'bkgd_hits_weighted','chi2dist','chi2re','chi2joint']
        
        output_html_name = os.path.join(project_path, f'{antigen_enriched_file}.ranked_centers_bkgd_ctlr_{level_tag}.html')
        # 888    888 88888888888 888b     d888 888      
        # 888    888     888     8888b   d8888 888      
        # 888    888     888     88888b.d88888 888      
        # 8888888888     888     888Y88888P888 888      
        # 888    888     888     888 Y888P 888 888      
        # 888    888     888     888  Y8P  888 888      
        # 888    888     888     888   "   888 888      
        # 888    888     888     888       888 88888888     
        with open(output_html_name, 'w') as output_handle:
            for i,r in ranked_centers_df.iterrows():
                #import pdb; pdb.set_trace()
                svg, svg_raw = r['svg'],r['svg_raw']
                output_handle.write("<br></br>")
                output_handle.write(shrink(svg))
                output_handle.write(shrink(svg_raw))
                output_handle.write("<br></br>")
                output_handle.write(pd.DataFrame(r[labels]).transpose().to_html())
                output_handle.write("<br></br>")




if __name__ == "__main__":
    # 888      888                               888             
    # 888      888                               888             
    # 888      888                               888             
    # 88888b.  888  8888b.     .d8888b   .d88b.  888888 .d8888b  
    # 888 "88b 888     "88b    88K      d8P  Y8b 888    88K      
    # 888  888 888 .d888888    "Y8888b. 88888888 888    "Y8888b. 
    # 888  888 888 888  888         X88 Y8b.     Y88b.       X88 
    # 888  888 888 "Y888888     88888P'  "Y8888   "Y888  88888P' 
    # December 2020 
    # Seattle, WA
    
    # If run as a script, this will find meta-clonotypes for the 18
    # SARS-CoV-2 MIRA sets with the most prior evidence of restriction
    # by as specific HLA-alleles.

    from collections import defaultdict 
    import os 
    import pandas as pd
    import re
    # <path> where files reside
    path = os.path.join(path_to_base,'tcrdist','data','covid19')
    # <all_files> list of all files
    all_files = [f for f in os.listdir(path) if f.endswith('.tcrdist3.csv')]
    # <restriction> list of tuples from Supporting Table S5
    restriction = \
    [('m_55_524_ALR','A*01'),
    ('m_1_8260_HTT','A*01'),
    ('m_45_689_SYF','A*01'),
    ('m_10_2274_LSP','B*07'),
    ('m_155_59_RAR','B*07'),
    ('m_133_102_NQK','B*15'),
    ('m_48_610_YLQ','A*02'),
    ('m_111_146_AEI','A*11'),
    ('m_53_532_NYL','A*24'),
    ('m_90_216_GYQ','C*07'),
    ('m_140_92_NSS','A*01'),
    ('m_55_524_ALR','B*08'),
    ('m_183_39_RIR','A*03'),
    ('m_10_2274_LSP','C*07'),
    ('m_99_191_QEC','B*40'),
    ('m_155_59_RAR','C*07'),
    ('m_185_39_ASQ','B*15'),
    ('m_147_73_DLF','B*08'),
    ('m_110_148_ELI','B*44'),
    ('m_51_546_AYK','A*03'),
    ('m_44_697_FPP','B*35'),
    ('m_118_136_ALN','A*11'),
    ('m_176_44_SST','A*11'),
    ('m_30_1064_KAY','B*57'),
    ('m_192_31_FQP','B*15'),
    ('m_70_345_DTD','A*01')]
    # <restrictions_dict> convert list to dictionary
    restrictions_dict = defaultdict(list)
    for k,v in restriction:
        restrictions_dict[k].append(v)
    # Loop through all files to construct a Dataframe of only those with strongest evidence of HLA-restriction
    cache = list()
    for f in all_files:
        rgs = re.search(pattern ='(mira_epitope)_([0-9]{1,3})_([0-9]{1,6})_([A-Z]{1,3})', 
            string = f).groups()    
        rgs4 = re.search(pattern ='(mira_epitope)_([0-9]{1,3})_([0-9]{1,6})_([A-Z]{1,4})', 
            string = f).groups()
        key3 = f'm_{rgs[1]}_{rgs[2]}_{rgs[3]}'
        key4 = f'm_{rgs4[1]}_{rgs4[2]}_{rgs4[3]}'
        setkey = f"M_{rgs[1]}"
        include = key3 in restrictions_dict.keys()
        alleles = restrictions_dict.get(key3)
        cache.append((setkey, key3, key4, f, int(rgs[1]), int(rgs[2]), include, alleles))
    # <hla_df>
    hla_df = pd.DataFrame(cache, columns = ['set', 'key3','key4','filename','set_rank','clones','hla_restricted','alleles']).\
        sort_values(['hla_restricted','clones'], ascending = True).\
        query('hla_restricted == True').reset_index(drop = True)
    
    from tcrdist.paths import path_to_base
    for ind,row in hla_df.iterrows():
        file = row['filename']
        print(file, ind)
        print(row)
        find_metaclonotypes(project_path = "hla_restricted_meta_clonotypes",
                            source_path = os.path.join(path_to_base,'tcrdist','data','covid19'),
                            antigen_enriched_file = file,
                            ncpus = 6, 
                            seed = 3434)


# 8888888888 888b    888 8888888b.  
# 888        8888b   888 888  "Y88b 
# 888        88888b  888 888    888 
# 8888888    888Y88b 888 888    888 
# 888        888 Y88b888 888    888 
# 888        888  Y88888 888    888 
# 888        888   Y8888 888  .d88P 
# 8888888888 888    Y888 8888888P"  

For each run, the function will write the following outputs:

project_folder/
├── .centers_bkgd_ctlr_1E5.tsv
├── .centers_bkgd_ctlr_1E6.tsv
├── .ecdf_AER_plot.png
├── .ecdf_BUR_plot.png
├── .olga100K_brit100K_bkgd.csv
├── .ranked_centers_bkgd_ctlr_1E5.html
├── .ranked_centers_bkgd_ctlr_1E5.tsv
├── .ranked_centers_bkgd_ctlr_1E6.html
├── .ranked_centers_bkgd_ctlr_1E6.tsv
└── .rw_beta.npz

ecdf1 ecdf2

One of the outputs is an html report of non-redundant meta-clonotypes .ranked_centers_bkgd_ctlr_1E6.html.

Meta-Clonotype Tabulation

The .ranked_centers_bkgd_ctlr_1E6.tsv files produced above can be used to directly search for the presence of meta-clonotypes is bulk data. How do we do this? Before showing how to do this for many files consider doing it for a single bulk repertoire:

In this example:

  1. We download a raw ImmunoSEQ file.
  2. Format it for use in tcrdist3.
  3. Search it with one of the meta-clonotypes file we made from above.
import os
import pandas as pd 
from tcrdist.adpt_funcs import import_adaptive_file
from tcrdist.repertoire import TCRrep
from tcrdist.tabulate import tabulate
import math
import time
"""
You can download the file with wget or follow the link
!wget https://www.dropbox.com/s/1zbcf1ep8kgmlzy/KHBR20-00107_TCRB.tsv
"""
bulk_file = 'KHBR20-00107_TCRB.tsv'
df_bulk = import_adaptive_file(bulk_file)
df_bulk = df_bulk[['cdr3_b_aa',
                    'v_b_gene',
                    'j_b_gene',
                    'templates',
                    'productive_frequency',
                    'valid_cdr3']].\
        rename(columns = {'templates':'count'})

df_bulk = df_bulk[(df_bulk['v_b_gene'].notna()) & (df_bulk['j_b_gene'].notna())].reset_index()
tr_bulk = TCRrep(cell_df = df_bulk,
                 organism = 'human',
                 chains = ['beta'],
                 db_file = 'alphabeta_gammadelta_db.tsv',
                 compute_distances = False)

search_file = os.path.join(
    'tcrdist',
    'data',
    'covid19',
    'mira_epitope_48_610_YLQPRTFL_YLQPRTFLL_YYVGYLQPRTF.tcrdist3.csv.ranked_centers_bkgd_ctlr_1E6.tsv')

df_search = pd.read_csv(search_file, sep = '\t')
df_search = df_search[['cdr3_b_aa','v_b_gene','j_b_gene','pgen','radius','regex']]
tr_search = TCRrep(cell_df = df_search,
                   organism = 'human',
                   chains = ['beta'],
                   db_file = 'alphabeta_gammadelta_db.tsv',
                   compute_distances = False)
tr_search.cpus = 1
tic = time.perf_counter()
tr_search.compute_sparse_rect_distances(df = tr_search.clone_df, df2 = tr_bulk.clone_df, chunk_size = 50, radius = 50) 
results = tabulate(clone_df1 = tr_search.clone_df, clone_df2 = tr_bulk.clone_df, pwmat = tr_search.rw_beta)
toc = time.perf_counter()
print(f"TABULATED IN {toc - tic:0.4f} seconds")

The results, which can be saved as .tsv include the following:

  • cdr3_b_aa - meta-clonotype centroid CDR3
  • v_b_gene - meta-clonotype centroid TRBV
  • j_b_gene - meta-clonotype centroid TRBJ
  • pgen. - meta-clonotype centroid TRBV, CDR3 Pgen (Probability of Generation, estimated with OGLA)
  • radius - meta-clonotype maximum TCRdist to find a neighbor
  • regex - pattern of conserved positions learned from the all those sequences with <radius> of the centroid in the antigen enriched dataset.
  • cdr1_b_aa - meta-clonotype centroid CDR1
  • cdr2_b_aa - meta-clonotype centroid CDR2
  • pmhc_b_aa - meta-clonotype centroid CDR2.5
  • bulk_sum_freq - In the bulk sample, sum of frequencies of TCRs within the <radius> of the centroid (* RADIUS)
  • bulk_sum_counts - In the bulk sample, total template TCRs (counts) within the <radius> of the centroid (* RADIUS)
  • bulk_seqs - In the bulk sample, CDR3 seqs of TCRs within the <radius> of the centroid
  • bulk_v_genes - In the bulk sample, TRBVs of TCRs within the <radius> of the centroid
  • bulk_j_genes - In the bulk sample, TRBJs of TCRs within the <radius> of the centroid
  • bulk_distances - In the bulk sample, distances of TCRs from centroid, for those within the <radius> of the centroid
  • bulk_counts - In the bulk sample, individual counts of each TCR within radius of the centroid
  • bulk_freqs - In the bulk sample, individual frequencies of each TCR within radius of the centroid
  • bulk_regex_match - In the bulk sample, boolean for each CDR3 within the <radius> whether it matched the regex motif pattern
  • bulk_sum_freqs_regex_adj - In the bulk sample, sum of frequencies of TCRs within the <radius> of the centroid and matching regex (RADIUS + MOTIF)
  • bulk_sum_counts_regex_adj - In the bulk sample, sum of counts of TCRs within the <radius> of the centroid and matching regex (RADIUS + MOTIF)
  • bulk_sum_freqs_tcrdist0 - In the bulk sample, sum of frequencies of TCRs within the <radius> = 0 of the centroid (EXACT)
  • bulk_sum_counts_tcrdist0 - In the bulk sample, sum of counts of TCRs within the <radius> = 0 of the centroid (EXACT)

Tabulation Against Many

Below is a complete example used to tabulate the frequency and counts of each meta-clonotype in 694 bulk samples. Note that you have to supply a valid path to the directory where all the bulk files reside in your environment.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# December 2020 
# Seattle, WA
import pandas as pd 
from tcrdist.repertoire import TCRrep
from tcrdist.tabulate import tabulate
import os
import numpy as np
from tcrdist.paths import path_to_base
import time
import math
import parmap

def get_safe_chunk(search_clones, bulk_clones,target = 10**7):
	"""
	This function help pick a chunk size that prevents excessive memory use,
	With two CPU, 10*7 should keep total overall memory demand below 1GB
	"""
	ideal_divisor = (search_clones * bulk_clones) / target
	if ideal_divisor < 1:
		ideal_chunk_size = search_clones
		print(ideal_chunk_size)
	else:
		ideal_chunk_size = math.ceil((search_clones)/ ideal_divisor)
		print(ideal_chunk_size)
	return ideal_chunk_size

def do_search2(file, df_search, dest, tag):
	sample_name = file.replace('.tsv.tcrdist3.v_max.tsv','')
	tic = time.perf_counter()
	
	# <tr_search> tcrdist.repertoire.TCRrep object for computing distances
	tr_search = TCRrep(cell_df = df_search,
					organism = 'human',
					chains = ['beta'],
					db_file = 'alphabeta_gammadelta_db.tsv',
					compute_distances = False)
	# set cpus according to parameter above
	tr_search.cpus = 1
	df_bulk   = pd.read_csv(os.path.join(path, file), sep = '\t')
	df_bulk = df_bulk[['cdr3_b_aa','v_b_gene','j_b_gene','templates','productive_frequency']].rename(columns = {'templates':'count'})

	tr_bulk = TCRrep(	cell_df = df_bulk,                 
						organism = 'human', 
						chains = ['beta'], 
						db_file = 'alphabeta_gammadelta_db.tsv',
						compute_distances = False)
	
	#lines_per_file.append(tr_bulk.clone_df.shape[0]) 
	
	search_clones = tr_search.clone_df.shape[0]
	bulk_clones   = tr_bulk.clone_df.shape[0]
	# To avoid memory pressure on the system we set a target that tcrdist doesn't do more than 10M comparisons per process
	ideal_chunk_size = get_safe_chunk(tr_search.clone_df.shape[0], tr_bulk.clone_df.shape[0],target = 10**8)
	tr_search.compute_sparse_rect_distances(df = tr_search.clone_df, df2 = tr_bulk.clone_df, chunk_size = ideal_chunk_size) #(5)
	r1 = tabulate(clone_df1 = tr_search.clone_df, clone_df2 = tr_bulk.clone_df, pwmat = tr_search.rw_beta)
	outfile = os.path.join(dest, f"{sample_name}.{tag}.bulk_tabulation.tsv")
	print(f"WRITING: {outfile}")
	r1.to_csv(outfile, sep = '\t')
	toc = time.perf_counter()
	print(f"TABULATED IN {toc - tic:0.4f} seconds")
	del(tr_search)
	del(tr_bulk)
	return(f"{toc - tic:0.4f}s")


if __name__ == "__main__":
	# 888      888                               888             
	# 888      888                               888             
	# 888      888                               888             
	# 88888b.  888  8888b.     .d8888b   .d88b.  888888 .d8888b  
	# 888 "88b 888     "88b    88K      d8P  Y8b 888    88K      
	# 888  888 888 .d888888    "Y8888b. 88888888 888    "Y8888b. 
	# 888  888 888 888  888         X88 Y8b.     Y88b.       X88 
	# 888  888 888 "Y888888     88888P'  "Y8888   "Y888  88888P' 
	# December 18, 2020 
	# Seattle, WA
	from collections import defaultdict 
	import os 
	import pandas as pd
	import re
	# <path> path where bulk files can be found 
	path_bulkfiles = INSERT_HERE # <--------------------------

	# <path> where files reside
	path = os.path.join(path_to_base,'tcrdist','data','covid19')#os.path.join('data-raw','2020-08-31-mira_tcr_by_epitope/')
	# <all_files> list of all files
	all_files = [f for f in os.listdir(path) if f.endswith('.tcrdist3.csv')]
	# <restriction> list of tuples from Supporting Table S5: https://docs.google.com/spreadsheets/d/1WAmze6lir-v11odO-nYYbCiYVh8WhQh_vy2d1UPPKb0/edit#gid=942295061
	restriction = \
	[('m_55_524_ALR','A*01'),
	('m_1_8260_HTT','A*01'),
	('m_45_689_SYF','A*01'),
	('m_10_2274_LSP','B*07'),
	('m_155_59_RAR','B*07'),
	('m_133_102_NQK','B*15'),
	('m_48_610_YLQ','A*02'),
	('m_111_146_AEI','A*11'),
	('m_53_532_NYL','A*24'),
	('m_90_216_GYQ','C*07'),
	('m_140_92_NSS','A*01'),
	('m_55_524_ALR','B*08'),
	('m_183_39_RIR','A*03'),
	('m_10_2274_LSP','C*07'),
	('m_99_191_QEC','B*40'),
	('m_155_59_RAR','C*07'),
	('m_185_39_ASQ','B*15'),
	('m_147_73_DLF','B*08'),
	('m_110_148_ELI','B*44'),
	('m_51_546_AYK','A*03'),
	('m_44_697_FPP','B*35'),
	('m_118_136_ALN','A*11'),
	('m_176_44_SST','A*11'),
	('m_30_1064_KAY','B*57'),
	('m_192_31_FQP','B*15'),
	('m_70_345_DTD','A*01')]
	# <restrictions_dict> convert list to dictionary
	restrictions_dict = defaultdict(list)
	for k,v in restriction:
		restrictions_dict[k].append(v)
	# Loop through all files to construct a Dataframe of only those with strongest evidence of HLA-restriction
	cache = list()
	for f in all_files:
		rgs = re.search(pattern ='(mira_epitope)_([0-9]{1,3})_([0-9]{1,6})_([A-Z]{1,3})', 
			string = f).groups()	
		rgs4 = re.search(pattern ='(mira_epitope)_([0-9]{1,3})_([0-9]{1,6})_([A-Z]{1,4})', 
			string = f).groups()
		key3 = f'm_{rgs[1]}_{rgs[2]}_{rgs[3]}'
		key4 = f'm_{rgs4[1]}_{rgs4[2]}_{rgs4[3]}'
		setkey = f"M_{rgs[1]}"
		include = key3 in restrictions_dict.keys()
		alleles = restrictions_dict.get(key3)
		cache.append((setkey, key3, key4, f, int(rgs[1]), int(rgs[2]), include, alleles))

	# <hla_df>
	hla_df = pd.DataFrame(cache, columns = ['set', 'key3','key4','filename','set_rank','clones','hla_restricted','alleles']).\
		sort_values(['hla_restricted','clones'], ascending = True).\
		query('hla_restricted == True').reset_index(drop = True)

	for ind, row in hla_df.iterrows():
		print(ind)
		print(row)
		tag_level = '1E6'
		# <tag> the results with this symbol
		tag = f"{row['set']}_{tag_level}"
		ranked_centers_fn = f"{row['filename']}.ranked_centers_bkgd_ctlr_{tag_level}.tsv"
		benchmark_fn      = f"{row['filename']}.ranked_centers_bkgd_ctlr_{tag_level}.tsv.benchmark_tabulation.tsv"
		# <dest> place where .tsv files shall be saved
		dest = f'/fh/fast/gilbert_p/fg_data/tcrdist/t3/{tag}'
		if not os.path.isdir(dest):
			os.mkdir(dest)

		# <path> path where bulk files can be found 
		path = path_bulkfiles
		# <covid_smaples> list of all samples to consider>
		covid_samples = ["BS-EQ-09-T1-replacement_TCRB", "BS-EQ-0022-T0-replacement_TCRB", "860011133_TCRB", "BS-GIGI_86-replacement_TCRB", "BS-EQ-12-T1-replacement_TCRB", "BS-GIGI_44-replacement_TCRB", "KH20-09697_TCRB", "BS-GN-0007-T0-replacement_TCRB", "860011216_TCRB", "BS-EQ-0002-T1-replacement_TCRB", "KH20-09988_TCRB", "BS-GIGI_21-replacement_TCRB", "INCOV018-AC-5_TCRB", "860011205_TCRB", "BS-GN-06-T0-replacement_TCRB", "KHBR20-00107_TCRB", "550043156_TCRB", "550041451_TCRB", "KH20-09700_TCRB", "BS-EQ-25-T0-replacement_TCRB", "860011320_TCRB", "BS-GN-0008-T0-replacement_TCRB", "BS-EQ-0002-T2-replacement_TCRB", "860011277_TCRB", "KH20-11306_TCRB", "BS-EQ-10-T1-replacement_TCRB", "BS-EQ-18-T1-replacement_TCRB", "KH20-11638_TCRB", "860011493_TCRB", "BS-GN-0010-T0-replacement_TCRB", "860011220_TCRB", "KH20-09681_TCRB", "550041430_TCRB", "INCOV033-AC-3_TCRB", "KHBR20-00093_TCRB", "860011221_TCRB", "BS-GIGI_28-replacement_TCRB", "BS-GIGI_46-replacement_TCRB", "KH20-09665_TCRB", "BS-EQ-0018-T0-replacement_TCRB", "860011287_TCRB", "BS-GIGI_42-replacement_TCRB", "INCOV004-BL-5_TCRB", "KH20-11809_TCRB", "BS-EQ-0001-T1-replacement_TCRB", "KH20-09671_TCRB", "KHBR20-00111_TCRB", "BS-EQ-0027-T1-replacement_TCRB", "BS-EQ-0024-T0-replacement_TCRB", "BS-GN-13-T0-replacement_TCRB", "BS-EQ-11-T2-replacement_TCRB", "BS-GIGI_32-replacement_TCRB", "BS-EQ-31-T0-replacement_TCRB", "BS-EQ-0027-T2-replacement_TCRB", "BS-GIGI_06-replacement_TCRB", "KH20-09679_TCRB", "KH20-09676_TCRB", "860011462_TCRB", "BS-GIGI_56-replacement_TCRB", "KH20-09722_TCRB", "KH20-09751_TCRB", "550040014_TCRB", "BS-GIGI_96-replacement_TCRB", "BS-GN-0012-T0-replacement_TCRB", "KHBR20-00085_TCRB", "KH20-09667_TCRB", "INCOV086-AC-3_TCRB", "KH20-11307_TCRB", "BS-GIGI_49-replacement_TCRB", "INCOV020-AC-5_TCRB", "BS-GIGI_85-replacement_TCRB", "KH20-11650_TCRB", "BS-GIGI_15-replacement_TCRB", "BS-EQ-43-T0-replacement_TCRB", "BS-GIGI_50-replacement_TCRB", "INCOV013-AC-5_TCRB", "BS-EQ-0011-T1-replacement_TCRB", "BS-GIGI_66-replacement_TCRB", "860011140_TCRB", "BS-EQ-12-T2-replacement_TCRB", "KHBR20-00089_TCRB", "860011235_TCRB", "860011123_TCRB", "KHBR20-00121_TCRB", "KHBR20-00180_TCRB", "BS-GIGI_18-replacement_TCRB", "BS-GIGI_64-replacement_TCRB", "BS-EQ-0024-T1-replacement_TCRB", "550042607_TCRB", "KH20-11683_TCRB", "KHBR20-00179_TCRB", "BS-GN-0002-T0-replacement_TCRB", "550043906_TCRB", "BS-EQ-0021-T0-replacement_TCRB", "BS-EQ-07-T1-replacement_TCRB", "860011306_TCRB", "550043943_TCRB", "BS-GIGI_20-replacement_TCRB", "BS-GIGI_52-replacement_TCRB", "KH20-09970_TCRB", "860011129_TCRB", "KHBR20-00098_TCRB", "860011124_TCRB", "860011110_TCRB", "BS-GIGI_01-replacement_TCRB", "KH20-09670_TCRB", "BS-GIGI_72-replacement_TCRB", "BS-EQ-11-T3-replacement_TCRB", "860011120_TCRB", "INCOV008-BL-5_TCRB", "BS-GN-0004-T0-replacement_TCRB", "INCOV028-AC-3_TCRB", "KH20-11295_TCRB", "860011214_TCRB", "BS-GIGI_13-replacement_TCRB", "860011204_TCRB", "BS-GIGI_87-replacement_TCRB", "860011215_TCRB", "860011202_TCRB", "INCOV005-BL-5_TCRB", "BS-GIGI_10-replacement_TCRB", "860011489_TCRB", "550043905_TCRB", "BS-GN-01-T0-replacement_TCRB", "550042578_TCRB", "KHBR20-00118_TCRB", "KH20-09655_TCRB", "BS-EQ-0016-T1-replacement_TCRB", "KHBR20-00106_TCRB", "BS-GIGI_62-replacement_TCRB", "BS-EQ-37-T0-replacement_TCRB", "860011244_TCRB", "KHBR20-00157_TCRB", "BS-GIGI_80-replacement_TCRB", "KHBR20-00182_TCRB", "KHBR20-00206_TCRB", "860011318_TCRB", "KH20-11645_TCRB", "KH20-09673_TCRB", "550040140_TCRB", "BS-GIGI_36-replacement_TCRB", "KHBR20-00171_TCRB", "BS-GIGI_81-replacement_TCRB", "860011225_TCRB", "550043911_TCRB", "BS-GIGI_03-replacement_TCRB", "KH20-09698_TCRB", "BS-EQ-29-T0-replacement_TCRB", "860011239_TCRB", "KH20-11303_TCRB", "550041293_TCRB", "INCOV028-BL-3_TCRB", "INCOV004-AC-5_TCRB", "110047437_TCRB", "860011201_TCRB", "BS-EQ-23-T1-replacement_TCRB", "KHBR20-00119_TCRB", "KHBR20-00149_TCRB", "BS-EQ-0017-T0-replacement_TCRB", "KH20-11810_TCRB", "BS-GIGI_45-replacement_TCRB", "550042567_TCRB", "860011109_TCRB", "KH20-09752_TCRB", "KH20-09675_TCRB", "KH20-09987_TCRB", "BS-GIGI_41-replacement_TCRB", "BS-EQ-0008-T1-replacement_TCRB", "550042420_TCRB", "KH20-09747_TCRB", "KHBR20-00144_TCRB", "KH20-11675_TCRB", "INCOV031-BL-3_TCRB", "KH20-11697_TCRB", "KHBR20-00141_TCRB", "KH20-09948_TCRB", "INCOV036-AC-3_TCRB", "550041421_TCRB", "BS-EQ-0026-T0-replacement_TCRB", "BS-GIGI_08-replacement_TCRB", "BS-EQ-29-T1-replacement_TCRB", "KHBR20-00160_TCRB", "860011322_TCRB", "KHBR20-00165_TCRB", "860011327_TCRB", "KH20-09674_TCRB", "KH20-09661_TCRB", "BS-GIGI_16-replacement_TCRB", "BS-EQ-43-T1_BS-GIGI-137-replacement_TCRB", "860011117_TCRB", "INCOV033-BL-3_TCRB", "KHBR20-00148_TCRB", "BS-GIGI_79-replacement_TCRB", "550042515_TCRB", "550042526_TCRB", "INCOV003-BL-5_TCRB", "BS-EQ-35-T0-replacement_TCRB", "550042550_TCRB", "BS-GN-0011-T0-replacement_TCRB", "BS-EQ-07-T2-replacement_TCRB", "KHBR20-00103_TCRB", "INCOV030-AC-3_TCRB", "KH20-09997_TCRB", "BS-GIGI_61-replacement_TCRB", "KHBR20-00161_TCRB", "KH20-09996_TCRB", "INCOV030-BL-3_TCRB", "KHBR20-00156_TCRB", "KHBR20-00137_TCRB", "550041125_TCRB", "KHBR20-00130_TCRB", "860011498_TCRB", "KH20-11698_TCRB", "BS-EQ-17-T1b-replacement_TCRB", "KHBR20-00105_TCRB", "550043986_TCRB", "INCOV071-BL-3_TCRB", "KH20-11304_TCRB", "BS-EQ-0023-T0-replacement_TCRB", "KH20-11692_TCRB", "550042377_TCRB", "KH20-09720_TCRB", "BS-GIGI_35-replacement_TCRB", "KHBR20-00113_TCRB", "860011105_TCRB", "KH20-11294_TCRB", "KHBR20-00167_TCRB", "INCOV031-AC-3_TCRB", "KH20-09666_TCRB", "KHBR20-00081_TCRB", "BS-GIGI_70-replacement_TCRB", "BS-GIGI_55-replacement_TCRB", "INCOV009-BL-5_TCRB", "BS-EQ-34-T0-replacement_TCRB", "BS-GIGI_34-replacement_TCRB", "KHBR20-00185_TCRB", "BS-EQ-33-T0-replacement_TCRB", "BS-GIGI_19-replacement_TCRB", "KHBR20-00126_TCRB", "550042351_TCRB", "KH20-11647_TCRB", "BS-GIGI_74-replacement_TCRB", "860011229_TCRB", "KH20-11640_TCRB", "BS-GN-0005-T0-replacement_TCRB", "KHBR20-00083_TCRB", "KH20-09735_TCRB", "550042361_TCRB", "KH20-09963_TCRB", "KHBR20-00166_TCRB", "KH20-11695_TCRB", "KHBR20-00096_TCRB", "BS-GIGI_07-replacement_TCRB", "KH20-11811_TCRB", "INCOV069-BL-3_TCRB", "BS-GIGI_05-replacement_TCRB", "BS-GIGI_40-replacement_TCRB", "INCOV053-AC-3_TCRB", "860011116_TCRB", "INCOV016-BL-5_TCRB", "860011112_TCRB", "INCOV034-BL-3_TCRB", "INCOV032-BL-3_TCRB", "KH20-09664_TCRB", "550041314_TCRB", "860011206_TCRB", "550042640_TCRB", "INCOV017-BL-5_TCRB", "BS-EQ-10-T2-replacement_TCRB", "KHBR20-00172_TCRB", "KH20-11813_TCRB", "550043940_TCRB", "550041390_TCRB", "INCOV015-BL-5_TCRB", "KH20-09684_TCRB", "BS-EQ-36-T0-replacement_TCRB", "KH20-11637_TCRB", "BS-EQ-39-T0-replacement_TCRB", "BS-EQ-16-T2-replacement_TCRB", "KHBR20-00153_TCRB", "BS-GIGI_92-replacement_TCRB", "BS-GIGI_91-replacement_TCRB", "860011492_TCRB", "KHBR20-00117_TCRB", "550041499_TCRB", "KHBR20-00181_TCRB", "KHBR20-00173_TCRB", "KHBR20-00168_TCRB", "KHBR20-00123_TCRB", "KH20-09945_TCRB", "BS-GN-0014-T0-replacement_TCRB", "550043912_TCRB", "KH20-09955_TCRB", "KH20-11298_TCRB", "550043995_TCRB", "KHBR20-00170_TCRB", "INCOV070-BL-3_TCRB", "KH20-11305_TCRB", "KHBR20-00203_TCRB", "KHBR20-00188_TCRB", "BS-GIGI_53-replacement_TCRB", "INCOV036-BL-3_TCRB", "KH20-09959_TCRB", "KHBR20-00145_TCRB", "BS-EQ-33-T1-replacement_TCRB", "KH20-09677_TCRB", "INCOV087-BL-3_TCRB", "INCOV084-BL-3_TCRB", "KH20-11642_TCRB", "KH20-09687_TCRB", "KH20-09724_TCRB", "KH20-11687_TCRB", "BS-EQ-14-T3-replacement_TCRB", "KH20-11688_TCRB", "INCOV016-AC-5_TCRB", "BS-EQ-38-T1_BS-GIGI-117-replacement_TCRB", "INCOV075-AC-3_TCRB", "KHBR20-00162_TCRB", "550043973_TCRB", "550042259_TCRB", "550042183_TCRB", "BS-EQ-18-T2-replacement_TCRB", "KH20-11672_TCRB", "KH20-11643_TCRB", "KH20-11639_TCRB", "BS-GIGI_26-replacement_TCRB", "KH20-09696_TCRB", "KH20-09952_TCRB", "860011325_TCRB", "550043955_TCRB", "KHBR20-00177_TCRB", "BS-EQ-15-T1-replacement_TCRB", "BS-EQ-34-T1-replacement_TCRB", "860011312_TCRB", "KH20-11671_TCRB", "KHBR20-00196_TCRB", "KH20-09695_TCRB", "860011490_TCRB", "INCOV005-AC-5_TCRB", "550043980_TCRB", "KH20-09734_TCRB", "KH20-09750_TCRB", "KHBR20-00094_TCRB", "860011243_TCRB", "KHBR20-00143_TCRB", "BS-EQ-28-T1-replacement_TCRB", "KH20-09947_TCRB", "860011309_TCRB", "860011450_TCRB", "INCOV049-AC-3_TCRB", "860011310_TCRB", "860011113_TCRB", "KH20-09964_TCRB", "KH20-09991_TCRB", "KH20-11292_TCRB", "860011131_TCRB", "KHBR20-00200_TCRB", "KHBR20-00078_TCRB", "KHBR20-00186_TCRB", "BS-GIGI_27-replacement_TCRB", "550041250_TCRB", "KH20-09962_TCRB", "860011127_TCRB", "KH20-09999_TCRB", "860011338_TCRB", "KHBR20-00076_TCRB", "860011137_TCRB", "KHBR20-00204_TCRB", "550042523_TCRB", "INCOV017-AC-5_TCRB", "550043908_TCRB", "KHBR20-00205_TCRB", "KHBR20-00158_TCRB", "KH20-09708_TCRB", "860011218_TCRB", "860011382_TCRB", "BS-EQ-36-T1-replacement_TCRB", "860011125_TCRB", "BS-EQ-41-T1-replacement_TCRB", "BS-EQ-33-T2-replacement_TCRB", "860011132_TCRB", "550043971_TCRB", "860011246_TCRB", "INCOV006-BL-5_TCRB", "KHBR20-00134_TCRB", "550041349_TCRB", "KH20-10000_TCRB", "KH20-09718_TCRB", "KH20-09694_TCRB", "550042609_TCRB", "860011383_TCRB", "KH20-09741_TCRB", "550043927_TCRB", "KH20-11807_TCRB", "KH20-09668_TCRB", "KH20-11682_TCRB", "550042599_TCRB", "KHBR20-00127_TCRB", "550042631_TCRB", "KHBR20-00102_TCRB", "KH20-11814_TCRB", "KH20-09992_TCRB", "KHBR20-00178_TCRB", "KH20-09701_TCRB", "860011240_TCRB", "BS-EQ-41-T0-replacement_TCRB", "INCOV014-BL-5_TCRB", "KH20-11659_TCRB", "INCOV007-BL-5_TCRB", "KH20-09685_TCRB", "BS-EQ-28-T2-replacement_TCRB", "INCOV035-AC-3_TCRB", "INCOV011-AC-5_TCRB", "BS-GN-0016-T0-replacement_TCRB", "KHBR20-00152_TCRB", "BS-GN-0009-T0-replacement_TCRB", "INCOV025-AC-3_TCRB", "KHBR20-00128_TCRB", "550043167_TCRB", "INCOV051-AC-3_TCRB", "860011314_TCRB", "KHBR20-00108_TCRB", "KH20-09704_TCRB", "KH20-09749_TCRB", "KH20-09956_TCRB", "550042447_TCRB", "860011212_TCRB", "KH20-11670_TCRB", "BS-EQ-0028-T0-replacement_TCRB", "550040030_TCRB", "KH20-09746_TCRB", "KH20-11296_TCRB", "860011233_TCRB", "860011139_TCRB", "BS-GN-0003-T0-replacement_TCRB", "INCOV073-BL-3_TCRB", "860011203_TCRB", "KH20-09971_TCRB", "INCOV007-AC-5_TCRB", "KHBR20-00135_TCRB", "KH20-11291_TCRB", "860011241_TCRB", "KH20-09736_TCRB", "KHBR20-00201_TCRB", "BS-EQ-26-T1-replacement_TCRB", "KH20-11657_TCRB", "860011231_TCRB", "860011227_TCRB", "KHBR20-00082_TCRB", "KH20-09657_TCRB", "KH20-09966_TCRB", "KH20-09725_TCRB", "KH20-11681_TCRB", "550042442_TCRB", "KHBR20-00109_TCRB", "860011210_TCRB", "KH20-09985_TCRB", "860011238_TCRB", "550044028_TCRB", "860011445_TCRB", "550042400_TCRB", "BS-EQ-30-T2-replacement_TCRB", "KHBR20-00084_TCRB", "550042395_TCRB", "KHBR20-00136_TCRB", "KH20-11641_TCRB", "KH20-11302_TCRB", "KH20-11676_TCRB", "BS-EQ-0014-T1-replacement_TCRB", "KH20-11673_TCRB", "860011111_TCRB", "KH20-09967_TCRB", "KH20-09984_TCRB", "KH20-11300_TCRB", "KH20-11297_TCRB", "860011228_TCRB", "KH20-11301_TCRB", "KH20-11646_TCRB", "KHBR20-00151_TCRB", "KH20-11661_TCRB", "KH20-11299_TCRB", "KHBR20-00131_TCRB", "KH20-09709_TCRB", "860011496_TCRB", "KH20-09946_TCRB", "KHBR20-00163_TCRB", "550041393_TCRB", "KH20-09690_TCRB", "KHBR20-00080_TCRB", "KH20-09753_TCRB", "KH20-09669_TCRB", "KHBR20-00139_TCRB", "KH20-09691_TCRB", "INCOV019-AC-5_TCRB", "860011217_TCRB", "KH20-09980_TCRB", "INCOV034-AC-3_TCRB", "KH20-09951_TCRB", "BS-EQ-0003-T1-replacement_TCRB", "KH20-09978_TCRB", "KH20-11660_TCRB", "KH20-09961_TCRB", "INCOV035-BL-3_TCRB", "KH20-09990_TCRB", "860011303_TCRB", "KHBR20-00184_TCRB", "KH20-09954_TCRB", "INCOV044-AC-3_TCRB", "KHBR20-00154_TCRB", "KHBR20-00183_TCRB", "KHBR20-00122_TCRB", "KH20-09993_TCRB", "KH20-09748_TCRB", "550042547_TCRB", "KH20-09689_TCRB", "KH20-09727_TCRB", "BS-EQ-0014-T2-replacement_TCRB", "860011323_TCRB", "KH20-11812_TCRB", "KHBR20-00091_TCRB", "INCOV048-BL-3_TCRB", "KH20-09977_TCRB", "KH20-09986_TCRB", "KH20-09745_TCRB", "860011115_TCRB", "KHBR20-00087_TCRB", "550043914_TCRB", "550043981_TCRB", "860011499_TCRB", "INCOV054-BL-3_TCRB", "KHBR20-00100_TCRB", "550042239_TCRB", "KH20-11649_TCRB", "KH20-09953_TCRB", "860011304_TCRB", "KHBR20-00133_TCRB", "550042210_TCRB", "INCOV051-BL-3_TCRB", "860011497_TCRB", "KHBR20-00116_TCRB", "KH20-09715_TCRB", "BS-EQ-30-T1-replacement_TCRB", "860011348_TCRB", "860011118_TCRB", "KH20-09975_TCRB", "550043928_TCRB", "KH20-09714_TCRB", "860011242_TCRB", "KHBR20-00079_TCRB", "550041441_TCRB", "KH20-09960_TCRB", "860011326_TCRB", "KHBR20-00191_TCRB", "BS-GIGI_75-replacement_TCRB", "KH20-09968_TCRB", "KH20-09723_TCRB", "KH20-09979_TCRB", "KH20-11658_TCRB", "INCOV047-BL-3_TCRB", "INCOV002-AC-5_TCRB", "KH20-09731_TCRB", "INCOV029-AC-3_TCRB", "KH20-11674_TCRB", "110047542_TCRB", "KH20-09743_TCRB", "KH20-11678_TCRB", "KH20-11689_TCRB", "KH20-09699_TCRB", "KH20-09726_TCRB", "860011336_TCRB", "860011256_TCRB", "KHBR20-00097_TCRB", "INCOV006-AC-5_TCRB", "KHBR20-00086_TCRB", "KH20-11651_TCRB", "KHBR20-00125_TCRB", "KH20-09755_TCRB", "KH20-11655_TCRB", "KH20-09683_TCRB", "KH20-09976_TCRB", "860011280_TCRB", "INCOV047-AC-3_TCRB", "KH20-11684_TCRB", "860011219_TCRB", "KH20-09654_TCRB", "INCOV044-BL-3_TCRB", "KH20-09738_TCRB", "INCOV015-AC-5_TCRB", "INCOV050-AC-3_TCRB", "860011494_TCRB", "KH20-11662_TCRB", "KH20-11665_TCRB", "INCOV062-BL-3_TCRB", "860011226_TCRB", "KH20-09702_TCRB", "860011354_TCRB", "860011236_TCRB", "KHBR20-00095_TCRB", "860011209_TCRB", "860011317_TCRB", "KH20-09711_TCRB", "KH20-11699_TCRB", "KH20-09754_TCRB", "KHBR20-00190_TCRB", "860011388_TCRB", "860011356_TCRB", "550043920_TCRB", "KHBR20-00099_TCRB", "KHBR20-00129_TCRB", "KH20-11696_TCRB", "KH20-11686_TCRB", "KH20-11664_TCRB", "INCOV026-BL-3_TCRB", "KH20-11667_TCRB", "KH20-11666_TCRB", "860011347_TCRB", "KH20-11648_TCRB", "KH20-09739_TCRB", "550044000_TCRB", "KH20-09972_TCRB", "KH20-11668_TCRB", "860011138_TCRB", "860011208_TCRB", "550041239-1_TCRB", "550042542_TCRB", "860011223_TCRB", "KHBR20-00092_TCRB", "INCOV052-BL-3_TCRB", "INCOV050-BL-3_TCRB", "860011311_TCRB", "550044029_TCRB", "INCOV003-AC-5_TCRB", "KHBR20-00088_TCRB", "860011264_TCRB", "INCOV012-AC-5_TCRB", "INCOV001-AC-5_TCRB", "INCOV002-BL-5_TCRB", "KH20-11654_TCRB", "KHBR20-00115_TCRB", "KH20-09707_TCRB", "860011392_TCRB", "KH20-11656_TCRB", "KH20-09717_TCRB", "860011284_TCRB", "KHBR20-00174_TCRB", "KH20-11653_TCRB", "KH20-09719_TCRB", "KH20-09958_TCRB", "INCOV026-AC-3_TCRB", "860011119_TCRB", "KH20-09981_TCRB", "KH20-11652_TCRB", "860011250_TCRB", "KH20-09995_TCRB", "INCOV059-AC-3_TCRB", "860011282_TCRB", "KH20-11669_TCRB", "550041173_TCRB", "KH20-09973_TCRB", "KH20-09706_TCRB", "860011122_TCRB", "KH20-09969_TCRB", "KH20-11293_TCRB", "INCOV062-AC-3_TCRB", "860011340_TCRB", "INCOV055-BL-3_TCRB", "860011224_TCRB", "KH20-09658_TCRB", "KHBR20-00104_TCRB", "860011211_TCRB", "KH20-11679_TCRB", "860011130_TCRB", "860011495_TCRB", "INCOV027-BL-3_TCRB", "KH20-09693_TCRB", "KHBR20-00146_TCRB", "860011106_TCRB", "KH20-09721_TCRB", "INCOV077-BL-3_TCRB", "KH20-09716_TCRB", "INCOV027-AC-3_TCRB", "860011121_TCRB", "KH20-11644_TCRB", "INCOV045-BL-3_TCRB", "860011330_TCRB", "INCOV023-AC-3_TCRB", "860011260_TCRB", "KH20-09994_TCRB", "KH20-09744_TCRB", "KH20-09712_TCRB", "860011488_TCRB", "550042451_TCRB", "INCOV078-BL-3_TCRB", "KHBR20-00164_TCRB"]
		# <max number of cpus to use>
		ncpus = 2
		# <df_search> dataframe containing metaclonotypes
		df_search = pd.read_csv(os.path.join('hla_restricted_meta_clonotypes', ranked_centers_fn), sep = "\t")
		if df_search.shape[0] > 0:
			df_search = df_search[['cdr3_b_aa','v_b_gene','j_b_gene','pgen','radius','regex']]
			# <all_files> list of all files in the project <path>
			all_files = [f for f in os.listdir(path) if f.endswith('v_max.tsv')]
			# add file size
			all_files = [ (f, f.replace('.tsv.tcrdist3.v_max.tsv',''), os.stat(os.path.join(path,f)).st_size) for f in all_files ]
			# sort ascending by file size, smaller files first
			all_files = sorted(all_files, key=lambda x: x[2])
			# <all_files_ms> is <all_files> subset to only the samples with 30 days of COVID19 diagnosis, and not including ADPATIVE-COVID19 training samples
			all_files_ms = [(x,y,z) for x,y,z in all_files if y in covid_samples] 
			ts_iterable = [f for f,sn,_  in all_files_ms]
			import parmap
			timings = parmap.map(do_search2, ts_iterable, df_search = df_search.copy(), dest = dest, tag = tag, pm_processes=6, pm_pbar = True)
			pd.DataFrame({'file':ts_iterable, 'seconds':timings}).to_csv(os.path.join('hla_restricted_meta_clonotypes', benchmark_fn), sep = "\t")


# 8888888888 888b    888 8888888b.  
# 888        8888b   888 888  "Y88b 
# 888        88888b  888 888    888 
# 8888888    888Y88b 888 888    888 
# 888        888 Y88b888 888    888 
# 888        888  Y88888 888    888 
# 888        888   Y8888 888  .d88P 
# 8888888888 888    Y888 8888888P"