I'm writing a script for a bioinformatics application in Python that iterates through several sequences looking for amino acids in specific positions to calculate their frequency in relation to a genomic database. However, I'm new to the area and I've been learning while developing the project, so a lot of what I've done is poorly optimized. For a task that I consider relatively simple, the script takes half a minute for files and sometimes much longer for larger files
Can anyone suggest how I might try to get around this problem? Apologies if it's trivial, I'm a biologist just starting to get to grips with programming
def main(self):
# Process input data and extract necessary information
mutation_count, mutation_subjects, mutation_positions, alignment_file, seq_ID = self.process_input()
# Initialize a dictionary to count amino acids at each mutation position
aa_counts = {pos: {} for pos in mutation_positions}
# Read the alignment file using Bio.AlignIO
alignment = AlignIO.read(alignment_file, 'clustal')
# Find the reference sequence in the alignment
ref_seq = None
for record in alignment:
if record.id == seq_ID:
ref_seq = record
break
# Convert sequences in the alignment to strings for easier processing
ref_seq_str = str(ref_seq.seq)
alignment_str = {record.id: str(record.seq) for record in alignment}
# Count amino acids at each position in the alignment
for seq in alignment_str.values():
pos_in_ref_seq = 0
for pos, aa in enumerate(seq):
if ref_seq_str[pos] != '-':
pos_in_ref_seq += 1
if pos_in_ref_seq in aa_counts:
aa_counts[pos_in_ref_seq][aa] = aa_counts[pos_in_ref_seq].get(aa, 0) + 1
# If a specific position is provided, calculate and print the amino acid frequencies at that position
if self.args.position:
position = self.args.position
total_count = sum(aa_counts[position].values())
for aa, count in aa_counts[position].items():
freq = (count / total_count * 100) if total_count > 0 else 0
print(f"Amino Acid: {aa} | Frequency: {freq:.2f}% | Count: {count}")
return
# Analyze mutations and calculate frequencies
mutation_info = {}
for mutation, count in sorted(mutation_count.items(), key=lambda x: int(x[0][1:-1])):
seq_pos = int(mutation[1:-1])
query_aa = mutation[0]
subject_aa = mutation[-1]
if query_aa == '-' or subject_aa == '-':
continue
real_count = aa_counts[seq_pos].get(subject_aa, 0)
total_count = sum(aa_counts[seq_pos].values())
query_aa_frequency = (aa_counts[seq_pos].get(query_aa, 0) / total_count * 100) if total_count > 0 else 0
subject_aa_frequency = (aa_counts[seq_pos].get(subject_aa, 0) / total_count * 100) if total_count > 0 else 0
if subject_aa_frequency <= 10 and real_count > 2:
mutation_info[mutation] = {
'query_aa_frequency': query_aa_frequency,
'subject_aa_frequency': subject_aa_frequency,
'real_count': real_count,
}
# Identify strains with specific mutations
strains_with_mutations = {}
for mutation in mutation_count:
query_aa, pos, subject_aa = mutation[0], int(mutation[1:-1]), mutation[-1]
strains_with_this_mutation = []
for record in alignment:
strain_name = record.id[:3]
sequence = str(record.seq)
pos_in_ref_seq = 0
for i, aa in enumerate(sequence):
if ref_seq.seq[i] != '-':
pos_in_ref_seq += 1
if pos_in_ref_seq == pos:
if aa == subject_aa:
strains_with_this_mutation.append(strain_name[:3])
break
strains_with_mutations[mutation] = strains_with_this_mutation
# Write mutation information to a text file
with open('gapslist.txt', 'w') as f:
for mutation, info in mutation_info.items():
f.write(f"Mutation: {mutation} | {mutation[0]} Frequency: {info['query_aa_frequency']:.2f}% | {mutation[-1]} Frequency: {info['subject_aa_frequency']:.2f}% | Count: {info['real_count']}\n\n")
# Write mutation and strain information to a CSV file
with open('rare_mutations.csv', 'w', newline='') as csvfile:
fieldnames = ['Mutation', 'Strains']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for mutation, info in mutation_info.items():
writer.writerow({
'Mutation': mutation,
'Strains': ';'.join(strains_with_mutations.get(mutation, []))
})
Profiling the code, I realized that the biggest bottleneck is the number of times the getitem function of the Seq class in Bio.Seq is being called:
ncalls tottime percall cumtime percall filename:lineno(function)
69006091 25.002 0.000 99.135 0.000 Seq.py:470(__getitem__)
69098925 6.311 0.000 6.311 0.000 SeqRecord.py:334(<lambda>).
69008225 10.165 0.000 49.965 0.000 <frozen abc>:117(__instancecheck__)
69006091 10.330 0.000 22.310 0.000 <frozen abc>:121(__subclasscheck__)
I've been looking into ways of dealing with this but nothing seems to be working, or it's only marginally reducing execution time.