Creating Custom DataConfig
Prerequisites¶
Before starting, ensure you have:
- FASTA reference files for V, D, J (and optionally C) segments
- Empirical dataset (CSV) with real sequence annotations
- GenAIRR installed:
pip install GenAIRR
Required CSV Columns¶
Your empirical data CSV must contain these columns:
required_columns = [
'sequence', # Full nucleotide sequence
'v_sequence_start', # V region start position
'v_sequence_end', # V region end position
'd_sequence_start', # D region start position (if applicable)
'd_sequence_end', # D region end position (if applicable)
'j_sequence_start', # J region start position
'j_sequence_end', # J region end position
'v_call', # V allele name(s)
'd_call', # D allele name(s) (if applicable)
'j_call', # J allele name(s)
'v_trim_5', # V 5' trimming length
'v_trim_3', # V 3' trimming length
'd_trim_5', # D 5' trimming length (if applicable)
'd_trim_3', # D 3' trimming length (if applicable)
'j_trim_5', # J 5' trimming length
'j_trim_3' # J 3' trimming length
]
Note: This tutorial uses immunoglobulin heavy chain data (IGH* naming) as an example. For TCR data, you would use TRB, TRA, etc. naming conventions and set the appropriate chain type.
Step 1: Setup and Imports¶
Let's start by importing the necessary modules and checking that we have example files available.
import os
import pickle
import pandas as pd
from pathlib import Path
# GenAIRR imports
from GenAIRR.dataconfig.make.custom import CustomDataConfigBuilder
from GenAIRR import Pipeline, steps, S5F, Uniform
print("All imports successful!")
Step 2: Locate Your Reference Files¶
For this tutorial, we'll use example files from the GenAIRR test data directory. In your own work, replace these paths with your actual FASTA files.
Example File Structure¶
your_project/
├── reference_files/
│ ├── IGHV.fasta # V segment germline sequences
│ ├── IGHD.fasta # D segment germline sequences
│ ├── IGHJ.fasta # J segment germline sequences
│ └── IGHC.fasta # C segment germline sequences (optional)
└── empirical_data/
└── inference_sample.csv # Annotated sequences for learning distributions
# Define paths to example files
# NOTE: Replace these paths with your own files
base_path = Path('../../tests/data') # Adjust this to your file location
v_reference_path = base_path / 'IGHV.fasta'
d_reference_path = base_path / 'IGHD.fasta'
j_reference_path = base_path / 'IGHJ.fasta'
c_reference_path = base_path / 'IGHC_fixed.fasta' # Using fixed C file without corrupted D alleles
empirical_data_path = base_path / 'inference_sample.csv'
# Verify files exist
files_to_check = {
'V reference': v_reference_path,
'D reference': d_reference_path,
'J reference': j_reference_path,
'C reference': c_reference_path,
'Empirical data': empirical_data_path
}
print("File availability check:")
all_found = True
for name, path in files_to_check.items():
exists = path.exists()
status = "✓" if exists else "✗"
print(f"{status} {name}: {path}")
if not exists and name != 'C reference': # C is optional
all_found = False
if all_found:
print("\n✓ All required files found!")
else:
print("\n⚠ Some required files are missing. Adjust the paths above.")
File availability check: ✓ V reference: ../../tests/data/IGHV.fasta ✓ D reference: ../../tests/data/IGHD.fasta ✓ J reference: ../../tests/data/IGHJ.fasta ✓ C reference: ../../tests/data/IGHC_fixed.fasta ✓ Empirical data: ../../tests/data/inference_sample.csv ✓ All required files found!
Step 3: Examine Your Empirical Data¶
Before creating the DataConfig, let's inspect the empirical data to ensure it has the correct format.
# Load and inspect the empirical data
df = pd.read_csv(empirical_data_path)
print(f"Dataset shape: {df.shape[0]} sequences, {df.shape[1]} columns")
print(f"\nAvailable columns:")
print(df.columns.tolist())
print(f"\nFirst few rows:")
df.head(3)
Dataset shape: 10000 sequences, 21 columns Available columns: ['Unnamed: 0', 'sequence', 'v_sequence_start', 'v_sequence_end', 'd_sequence_start', 'd_sequence_end', 'j_sequence_start', 'j_sequence_end', 'v_call', 'd_call', 'j_call', 'mutation_rate', 'v_trim_5', 'v_trim_3', 'd_trim_5', 'd_trim_3', 'j_trim_5', 'j_trim_3', 'corruption_event', 'corruption_add_amount', 'corruption_remove_amount'] First few rows:
| Unnamed: 0 | sequence | v_sequence_start | v_sequence_end | d_sequence_start | d_sequence_end | j_sequence_start | j_sequence_end | v_call | d_call | ... | mutation_rate | v_trim_5 | v_trim_3 | d_trim_5 | d_trim_3 | j_trim_5 | j_trim_3 | corruption_event | corruption_add_amount | corruption_remove_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | TCGGTGAAGGTCTCCGGCACGGCTTCTGGAGGCACCTGCAGAAGTT... | 0 | 247 | 255 | 258 | 267 | 318 | IGHV2-26*03 | IGHD3-3*01 | ... | 0.150015 | 0 | 1 | 5 | 8 | 0 | 0 | remove | 0 | 48 |
| 1 | 1 | GTACCCTTATCATTGTTTAGCAGTGCCTCCCGTACTTGGNCTTTGG... | 96 | 395 | 401 | 426 | 444 | 489 | IGHV3-15*05 | IGHD7-27*01 | ... | 0.173507 | 0 | 3 | 0 | 11 | 4 | 0 | add | 96 | 0 |
| 2 | 2 | TGGGGCTGGAGAGGGAATCGGGTCCTAGTTGGAATGGCGCTACGAG... | 1 | 169 | 173 | 178 | 186 | 240 | IGHV3/OR16-17*01,IGHV1-2*03 | IGHD1-26*01,IGHD2-21*02 | ... | 0.117554 | 0 | 0 | 4 | 6 | 8 | 0 | remove_before_add | 1 | 128 |
3 rows × 21 columns
# Check for required columns
required_columns = [
'sequence', 'v_sequence_start', 'v_sequence_end',
'd_sequence_start', 'd_sequence_end', 'j_sequence_start', 'j_sequence_end',
'v_call', 'd_call', 'j_call',
'v_trim_5', 'v_trim_3', 'd_trim_5', 'd_trim_3', 'j_trim_5', 'j_trim_3'
]
print("Column availability check:")
missing_columns = []
for col in required_columns:
if col in df.columns:
print(f"✓ {col}")
else:
print(f"✗ {col} - MISSING")
missing_columns.append(col)
if missing_columns:
print(f"\n⚠ Warning: Missing columns: {missing_columns}")
print("You may need to adjust your data or use default values for missing columns.")
else:
print("\n✓ All required columns present!")
Column availability check: ✓ sequence ✓ v_sequence_start ✓ v_sequence_end ✓ d_sequence_start ✓ d_sequence_end ✓ j_sequence_start ✓ j_sequence_end ✓ v_call ✓ d_call ✓ j_call ✓ v_trim_5 ✓ v_trim_3 ✓ d_trim_5 ✓ d_trim_3 ✓ j_trim_5 ✓ j_trim_3 ✓ All required columns present!
Step 4: Initialize CustomDataConfigBuilder¶
The CustomDataConfigBuilder class automates the creation of all necessary DataConfig attributes from your reference files and empirical data.
Key Parameters:¶
convert_to_asc: Whether to convert alleles to Allele Similarity Clusters (ASC)True: Groups similar alleles (reduces complexity, faster)False: Keeps individual alleles (more granular, recommended for custom data)
For most custom datasets, use convert_to_asc=False to preserve your specific allele information.
# Initialize the DataConfig builder
dcb = CustomDataConfigBuilder(convert_to_asc=False)
print("✓ CustomDataConfigBuilder initialized")
print(f" - ASC conversion: {dcb.convert_to_asc}")
print(f" - Ready to process reference files")
✓ CustomDataConfigBuilder initialized - ASC conversion: False - Ready to process reference files
Step 5: Generate Custom DataConfig¶
Now comes the main step: generating your custom DataConfig instance. This process will:
- Parse FASTA files - Load V, D, J, and C alleles
- Analyze empirical data - Calculate trimming distributions, NP region patterns
- Build correction maps - Create ambiguity resolution structures
- Validate consistency - Ensure allele names match between references and data
Note: This tutorial uses a corrected C alleles file to avoid data corruption issues in the original test file.
Note: This process can take several minutes depending on data size. The generator will print detailed progress information.
from GenAIRR.dataconfig.config_info import ConfigInfo
from GenAIRR.dataconfig.enums import Species, ChainType
from datetime import date
# Generate the custom DataConfig using the correct API
# This will display detailed progress information
print("Starting DataConfig generation...")
print("=" * 60)
custom_dataconfig = dcb.make(
v_reference_path=str(v_reference_path),
d_reference_path=str(d_reference_path), # Include D segments for heavy chain
j_reference_path=str(j_reference_path),
c_reference_path=str(c_reference_path), # Using fixed C file
custom_data=str(empirical_data_path) # Can also pass DataFrame directly
)
# Add metadata - using BCR_HEAVY since our test data uses IGH* naming
custom_dataconfig.metadata = ConfigInfo(
species=Species.HUMAN,
chain_type=ChainType.TCR_BETA, # Immunoglobulin heavy chain (matches IGH* data)
reference_set='My Custom Heavy Chain Reference Set',
last_updated=date.today(),
has_d=True
)
print("=" * 60)
print("✓ DataConfig generation complete!")
Starting DataConfig generation... ============================================================ Alleles Mounted to DataConfig!... Gene Usage Mounted to DataConfig!... Trimming Proportions Mounted to DataConfig!... Trimming Proportions Mounted to DataConfig!... NP Parameters Mounted to DataConfig!... V Ns Ambiguity Map Mounted to DataConfig!... NP Parameters Mounted to DataConfig!... V Ns Ambiguity Map Mounted to DataConfig!... D (5,3) Prime Ambiguity Map Mounted to DataConfig!... ================================================== ============================================================ ✓ DataConfig generation complete! D (5,3) Prime Ambiguity Map Mounted to DataConfig!... ================================================== ============================================================ ✓ DataConfig generation complete!
Step 6: Inspect the Generated DataConfig¶
Let's examine what was created to ensure everything looks correct.
# Display basic information about the DataConfig
print("Custom DataConfig Summary:")
print("=" * 60)
print(f"Name: {custom_dataconfig.name}")
print(f"\nChain Type: {custom_dataconfig.metadata.chain_type if custom_dataconfig.metadata else 'Not set'}")
print(f"Species: {custom_dataconfig.metadata.species if custom_dataconfig.metadata else 'Not set'}")
print(f"\n Allele Counts:")
print(f" - V alleles: {custom_dataconfig.number_of_v_alleles}")
print(f" - D alleles: {custom_dataconfig.number_of_d_alleles}")
print(f" - J alleles: {custom_dataconfig.number_of_j_alleles}")
print(f" - C alleles: {custom_dataconfig.number_of_c_alleles}")
print(f"\nV allele families: {list(custom_dataconfig.v_alleles.keys())[:5]}...")
print(f"D allele families: {list(custom_dataconfig.d_alleles.keys())[:5]}...")
print(f"J allele families: {list(custom_dataconfig.j_alleles.keys())[:5]}...")
Custom DataConfig Summary: ============================================================ Name: None Chain Type: ChainType.BCR_HEAVY Species: Species.HUMAN Allele Counts: - V alleles: 300 - D alleles: 47 - J alleles: 14 - C alleles: 18 V allele families: ['IGHV1-18', 'IGHV1-2', 'IGHV1-24', 'IGHV1-3', 'IGHV1-45']... D allele families: ['IGHD1-1', 'IGHD1-14', 'IGHD1-20', 'IGHD1-26', 'IGHD1-7']... J allele families: ['IGHJ1', 'IGHJ2', 'IGHJ3', 'IGHJ4', 'IGHJ5']...
# Check specific allele details including C alleles
print("\nExample V allele details:")
first_v_family = list(custom_dataconfig.v_alleles.keys())[0]
first_v_alleles = custom_dataconfig.v_alleles[first_v_family]
for i, allele in enumerate(first_v_alleles[:3]): # Show first 3
print(f"\n Allele {i+1}:")
print(f" Name: {allele.name}")
print(f" Family: {allele.family}")
print(f" Gene: {allele.gene}")
print(f" Type: {allele.type}")
print(f" Length: {allele.ungapped_len} bp")
print(f" Sequence preview: {allele.ungapped_seq[:50]}...")
print("\n" + "="*50)
print("Example C allele details:")
first_c_family = list(custom_dataconfig.c_alleles.keys())[0]
first_c_alleles = custom_dataconfig.c_alleles[first_c_family]
for i, allele in enumerate(first_c_alleles[:3]): # Show first 3
print(f"\n C Allele {i+1}:")
print(f" Name: {allele.name}")
print(f" Family: {allele.family}")
print(f" Gene: {allele.gene}")
print(f" Type: {allele.type}")
print(f" Length: {allele.ungapped_len} bp")
Example V allele details:
Allele 1:
Name: IGHV1-18*01
Family: IGHV1
Gene: IGHV1-18
Type: AlleleTypes.V
Length: 296 bp
Sequence preview: caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctc...
Allele 2:
Name: IGHV1-18*03
Family: IGHV1
Gene: IGHV1-18
Type: AlleleTypes.V
Length: 296 bp
Sequence preview: caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctc...
Allele 3:
Name: IGHV1-18*04
Family: IGHV1
Gene: IGHV1-18
Type: AlleleTypes.V
Length: 296 bp
Sequence preview: caggttcagctggtgcagtctggagctgaggtgaagaagcctggggcctc...
==================================================
Example C allele details:
C Allele 1:
Name: IGKC*01
Family: IGKC
Gene: IGKC
Type: AlleleTypes.C
Length: 321 bp
C Allele 2:
Name: IGKC*02
Family: IGKC
Gene: IGKC
Type: AlleleTypes.C
Length: 321 bp
C Allele 3:
Name: IGKC*03
Family: IGKC
Gene: IGKC
Type: AlleleTypes.C
Length: 321 bp
Step 7: Save Your Custom DataConfig¶
Once created, save your DataConfig as a pickle file for future use. This avoids having to regenerate it every time.
Best Practices for Saving:¶
- Use descriptive names: Include species, chain type, and date
- Version control: Keep track of different versions
- Document metadata: Add a README explaining the source data
- Backup: Store in version control or cloud storage
import os
import pickle
import pandas as pd
from pathlib import Path
# GenAIRR imports - CORRECTED IMPORT PATH
from GenAIRR.dataconfig.make import CustomDataConfigBuilder
from GenAIRR import Pipeline, steps, S5F, Uniform
print("All imports successful!")
# Save the custom DataConfig for future use
path = './example_custom_tcr_files/custom_dataconfig.pkl'
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'wb') as f:
pickle.dump(custom_dataconfig, f)
print(f"✓ Custom DataConfig saved to: {path}")
✓ Custom DataConfig saved to: ./example_custom_tcr_files/custom_dataconfig.pkl
Step 8: Loading a Saved DataConfig¶
Demonstrate how to load your saved DataConfig in future sessions.
# Load the saved DataConfig (simulating a fresh session)
print("Loading DataConfig from file...")
with open(path, 'rb') as f:
loaded_dataconfig = pickle.load(f)
# Verify it loaded correctly
print("✓ DataConfig loaded successfully!")
print(f" Name: {loaded_dataconfig.name}")
print(f" V alleles: {loaded_dataconfig.number_of_v_alleles}")
print(f" D alleles: {loaded_dataconfig.number_of_d_alleles}")
print(f" J alleles: {loaded_dataconfig.number_of_j_alleles}")
# Quick verification that it's the same as the original
assert loaded_dataconfig.number_of_v_alleles == custom_dataconfig.number_of_v_alleles
assert loaded_dataconfig.number_of_j_alleles == custom_dataconfig.number_of_j_alleles
print("✓ Verification passed - loaded DataConfig matches original")
Loading DataConfig from file... ✓ DataConfig loaded successfully! Name: None V alleles: 300 D alleles: 47 J alleles: 14 ✓ Verification passed - loaded DataConfig matches original
Step 9: Using Your Custom DataConfig in Pipelines¶
Now let's use your custom DataConfig to build simulation pipelines and generate sequences!
# Create a simulation pipeline using your custom configuration
# Pass the config directly to Pipeline instead of using set_dataconfig()
pipeline = Pipeline(
config=loaded_dataconfig,
steps=[
steps.SimulateSequence(
Uniform(min_mutation_rate=0.02, max_mutation_rate=0.08),
productive=True,
),
steps.FixVPositionAfterTrimmingIndexAmbiguity(),
steps.FixDPositionAfterTrimmingIndexAmbiguity(),
steps.FixJPositionAfterTrimmingIndexAmbiguity(),
steps.CorrectForVEndCut(),
steps.CorrectForDTrims(),
steps.DistillMutationRate(),
steps.InsertNs(n_ratio=0.02, probability=0.5),
steps.ShortDValidation(),
steps.InsertIndels(probability=0.5, max_indels=5, insertion_probability=0.5, deletion_probability=0.5),
]
)
print("Custom simulation pipeline created with steps:")
for i, step in enumerate(pipeline.steps, 1):
print(f" {i}. {type(step).__name__}")
print(f"\nPipeline ready for sequence generation!")
Step 10: Generate Sequences with Your Custom DataConfig¶
Let's generate some sequences and examine the results to validate our custom configuration.
# Generate a single sequence to test the pipeline
print("Generating test sequence...")
test_sequence = pipeline.execute()
sequence_data = test_sequence.get_dict()
print("✓ Sequence generated successfully!")
print(f"\nSequence Details:")
print(f" Length: {len(sequence_data['sequence'])} bp")
print(f" V allele: {sequence_data['v_call'][0] if sequence_data['v_call'] else 'None'}")
print(f" J allele: {sequence_data['j_call'][0] if sequence_data['j_call'] else 'None'}")
print(f" Productive: {sequence_data['productive']}")
print(f" Mutation rate: {sequence_data['mutation_rate']:.3f}")
print(f" Number of mutations: {len(sequence_data['mutations'])}")
print(f"\nSequence preview (first 100 bp):")
print(sequence_data['sequence'][:100] + "...")
Generating test sequence... Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None ✓ Sequence generated successfully! Sequence Details: Length: 684 bp V allele: IGHV2-70*18 J allele: IGHJ6*03 Productive: False Mutation rate: 0.041 Number of mutations: 15 Sequence preview (first 100 bp): TAGGTCACCTTGAGGGAGTCTGGTCCTGCGCAGGTAAAACCCNCACAGACCCTCACCCTGANCTGCACCCTCTCTGGGTTCTCTCTCAGCACTAGTGAAA...
# Generate multiple sequences for analysis
print("Generating multiple sequences for validation...")
sequences = []
for i in range(10):
seq = pipeline.execute()
seq_dict = seq.get_dict()
seq_dict['seq_id'] = f"custom_seq_{i+1:03d}"
sequences.append(seq_dict)
# Basic statistics
productive_count = sum(1 for s in sequences if s['productive'])
avg_mutations = sum(len(s['mutations']) for s in sequences) / len(sequences)
unique_v_alleles = set(s['v_call'][0] for s in sequences if s['v_call'])
unique_j_alleles = set(s['j_call'][0] for s in sequences if s['j_call'])
print(f"✓ Generated {len(sequences)} sequences")
print(f"\nValidation Results:")
print(f" Productive sequences: {productive_count}/{len(sequences)} ({productive_count/len(sequences)*100:.1f}%)")
print(f" Average mutations per sequence: {avg_mutations:.1f}")
print(f" Unique V alleles used: {len(unique_v_alleles)}")
print(f" Unique J alleles used: {len(unique_j_alleles)}")
print(f"\nExample V alleles used: {list(unique_v_alleles)[:3]}")
print(f"Example J alleles used: {list(unique_j_alleles)[:3]}")
Generating multiple sequences for validation... Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None Warning: Failed to generate a productive sequence after 25 attempts. Proceeding with the last non-functional sequence. Parameters: V=None, J=NoneD=None ✓ Generated 10 sequences Validation Results: Productive sequences: 0/10 (0.0%) Average mutations per sequence: 32.2 Unique V alleles used: 10 Unique J alleles used: 7 Example V alleles used: ['IGHV7-81*02', 'IGHV4-30-2*03', 'IGHV1-2*01'] Example J alleles used: ['IGHJ4*03', 'IGHJ6*05', 'IGHJ6*02']
Step 11: Advanced Usage - Custom Allele Selection¶
Your custom DataConfig allows you to force specific allele combinations, just like with built-in configs.
# Select specific alleles from your custom DataConfig
first_v_family = list(loaded_dataconfig.v_alleles.keys())[0]
first_d_family = list(loaded_dataconfig.d_alleles.keys())[0]
first_j_family = list(loaded_dataconfig.j_alleles.keys())[0]
specific_v = loaded_dataconfig.v_alleles[first_v_family][0]
specific_d = loaded_dataconfig.d_alleles[first_d_family][0]
specific_j = loaded_dataconfig.j_alleles[first_j_family][0]
print("Selected specific alleles:")
print(f" V allele: {specific_v.name} (family: {first_v_family})")
print(f" D allele: {specific_d.name} (family: {first_d_family})")
print(f" J allele: {specific_j.name} (family: {first_j_family})")
# Create pipeline with specific alleles
specific_pipeline = Pipeline(
config=loaded_dataconfig,
steps=[
steps.SimulateSequence(
S5F(min_mutation_rate=0.02, max_mutation_rate=0.06),
productive=True,
specific_v=specific_v,
specific_j=specific_j
),
steps.FixVPositionAfterTrimmingIndexAmbiguity(),
steps.FixDPositionAfterTrimmingIndexAmbiguity(),
steps.FixJPositionAfterTrimmingIndexAmbiguity(),
steps.DistillMutationRate()
]
)
# Generate sequence with specific alleles
specific_seq = specific_pipeline.execute()
specific_data = specific_seq.get_dict()
print(f"\nGenerated sequence with specific alleles:")
print(f" V used: {specific_data['v_call']}")
print(f" D used: {specific_data['d_call']}")
print(f" J used: {specific_data['j_call']}")
print(f" Mutations: {len(specific_data['mutations'])}")