For my Biology class, I made a Python script which takes a DNA sequence as input, translates it into an mRNA sequence, and then again into a tRNA sequence. It then matches each mRNA codon with an amino acid, and gives the user all of the data it produces.
Since this program may have to work with large amounts of DNA code, I just want some advice to see what could be done to make the program run faster and more efficiently.
Here is the 'symbols.p' file and here is the 'mRNA_to_protein.p' file.
import pickle
'''
Program takes a DNA genetic code sequence in as input.
It then translates the DNA code into mRNA code, and again into tRNA code.
After that, it matches each mRNA codon with an amino acid, as found in the hash table inside the pickle file.
It then matches each amino acid with its symbol, and prints all the data onto the screen.
'''
def main():
    """Run the interactive DNA -> mRNA -> tRNA -> amino acid translation.

    Asks whether the DNA sequence should come from a file or from the
    keyboard, re-prompts until the sequence passes ``check_sequence``, then
    prints the codon-grouped DNA, mRNA and tRNA strands followed by the amino
    acid names and symbols, and hands the same five strings to ``dump_data``.
    """
    # Keep asking until the whitespace-free, upper-cased answer is Y or N.
    while True:
        open_choice = remove_spaces(input("Do you want to load a file to translate [Y/N]").upper())
        if open_choice in ('Y', 'N'):
            break

    # Obtain the raw sequence either from a file or straight from the prompt.
    if open_choice == 'Y':
        raw_sequence = get_file()
    else:
        raw_sequence = input("Enter the DNA sequence to convert it: ")
    sequence = remove_spaces(raw_sequence.upper())

    # Re-prompt on the keyboard until only valid DNA bases remain.
    while not check_sequence(sequence, 'dna'):
        sequence = remove_spaces(input("Please enter a correct sequence: ").upper())

    # Group the DNA into space-separated triplets for display.
    codons = [sequence[start:start + 3] for start in range(0, len(sequence), 3)]
    original_sequence = ' '.join(codons)

    mRNA = convert_sequence(sequence, 'dna')
    tRNA = convert_sequence(remove_spaces(mRNA), 'rna')
    # A trailing space is appended so the last codon is also
    # whitespace-terminated when it reaches convert_to_proteins.
    proteins = convert_to_proteins(mRNA + " ")
    symbols = convert_symbols(proteins)

    print('DNA:  ' + original_sequence)
    print('mRNA: ' + mRNA)
    print('tRNA: ' + tRNA)
    protein_line = " ".join(proteins)
    print(protein_line)
    symbol_line = " ".join(symbols)
    print(symbol_line)

    dump_data(original_sequence, mRNA, tRNA, protein_line, symbol_line)
    # Block until the user presses Enter (presumably to keep a console
    # window open after the results are shown).
    input()
# Checks a sequence for validity
def check_sequence(sequence, type):
    """Return True if every character of *sequence* is a valid base letter.

    Args:
        sequence: String of base letters to validate. Matching is
            case-sensitive, and whitespace counts as an invalid character.
        type: ``'rna'`` validates against the letters A, U, C, G; any other
            value validates against the DNA letters A, T, C, G. (The name
            shadows the builtin ``type`` but is kept so existing keyword
            callers continue to work.)

    Returns:
        True when all characters belong to the chosen alphabet (an empty
        sequence is therefore valid), otherwise False.
    """
    alphabet = 'AUCG' if type == 'rna' else 'ATCG'
    # One set construction over the input replaces the list copy and the
    # Python-level per-character loop; the subset test gives the same answer.
    return set(sequence) <= set(alphabet)
# Converts a DNA or RNA sequence to its complementary RNA sequence
def convert_sequence(sequence, sequence_type):
    """Return the complementary RNA strand of *sequence*, grouped in triplets.

    Args:
        sequence: String of upper-case base letters without whitespace.
        sequence_type: ``'dna'`` to complement DNA bases (A->U, T->A, C->G,
            G->C); any other value complements RNA bases (A->U, U->A, C->G,
            G->C).

    Returns:
        The complemented sequence with a single space inserted after every
        three characters, e.g. ``'UAG CUA'``. An empty input yields ``''``.

    Raises:
        KeyError: If *sequence* contains a character that is not a base of
            the selected alphabet. The offending character is the key.
    """
    if sequence_type == 'dna':
        complement = {'A': 'U', 'T': 'A', 'C': 'G', 'G': 'C'}
    else:
        complement = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}

    # str.translate passes unmapped characters through unchanged, so reject
    # them up front to keep raising KeyError for invalid input.
    unknown = set(sequence).difference(complement)
    if unknown:
        raise KeyError(next(base for base in sequence if base in unknown))

    # A single C-level pass replaces the per-character lookup/append loop.
    converted = sequence.translate(str.maketrans(complement))

    # Split into codon-sized chunks separated by single spaces.
    return ' '.join([converted[i:i + 3] for i in range(0, len(converted), 3)])
def convert_to_proteins(sequence):
    """Look up each whitespace-separated mRNA codon in the codon table.

    The table is unpickled from ``mRNA_to_protein.p`` in the current working
    directory and is indexed by codon string.

    Args:
        sequence: mRNA codons separated by whitespace, e.g. ``'AUG UUU'``.
            A trailing separator is accepted but no longer required for the
            final codon to be translated.

    Returns:
        A list with the table entry for each codon, in order. Translation
        stops at the first chunk shorter than three characters (an
        incomplete trailing codon).

    Raises:
        KeyError: If a chunk of three or more characters is not in the table.
        OSError: If ``mRNA_to_protein.p`` cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only use a trusted table
    # file. The context manager closes the handle, which was previously leaked.
    with open('mRNA_to_protein.p', 'rb') as table_file:
        mrna_to_protein = pickle.load(table_file)

    protein_sequence = []
    for codon in sequence.split():
        if len(codon) < 3:
            # Incomplete codon: nothing after it can be translated either.
            break
        protein_sequence.append(mrna_to_protein[codon])
    return protein_sequence
def convert_symbols(proteins):
    """Map each entry of *proteins* to its symbol.

    The lookup table is unpickled from ``symbols.p`` in the current working
    directory and is indexed by the values found in *proteins*.

    Args:
        proteins: Iterable of table keys (the values produced by
            ``convert_to_proteins`` in this script).

    Returns:
        A list holding the table entry for each item of *proteins*, in order.

    Raises:
        KeyError: If an item of *proteins* is not in the table.
        OSError: If ``symbols.p`` cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only use a trusted table
    # file. The context manager closes the handle, which was previously leaked.
    with open('symbols.p', 'rb') as symbol_file:
        symbols = pickle.load(symbol_file)
    return [symbols[protein] for protein in proteins]
# removes all spaces in a sequence
def remove_spaces(x):
    """Return *x* with every whitespace character removed."""
    # split() with no argument drops all runs of whitespace (including
    # leading and trailing ones), so joining the pieces leaves none behind.
    pieces = x.split()
    return ''.join(pieces)
def get_file():
    """Prompt for a file name until a readable file with valid DNA is given.

    The file content is validated with ``check_sequence`` after whitespace
    removal and upper-casing. Every rejected name (missing file or invalid
    content) triggers a new prompt and the newly named file is read again;
    previously an invalid file only re-prompted without re-reading, which
    could never terminate.

    Returns:
        The raw text of the accepted file, exactly as read (whitespace and
        letter case untouched).
    """
    file_name = input("Enter file name: ")
    while True:
        try:
            # The context manager closes the file even if read() fails.
            with open(file_name, 'r') as f:
                sequence = f.read()
        except FileNotFoundError:
            file_name = input("\nThe file '{}' was not found. \nPlease enter an accurate file name/path: ".format(file_name))
            continue

        if check_sequence(remove_spaces(sequence).upper(), 'dna'):
            return sequence
        # Invalid content: ask for another file and loop back to read it.
        file_name = input("\nPlease provide a file with a correct DNA sequence: ")
def dump_data(dna, mrna, trna, aa, s):
    """Write the translation results to ``results.txt``.

    The file is created in the current working directory, replacing any
    existing file of that name.

    Args:
        dna: DNA sequence text, written after the ``'DNA:  '`` label.
        mrna: mRNA sequence text, written after the ``'mRNA: '`` label.
        trna: tRNA sequence text, written after the ``'tRNA: '`` label.
        aa: Amino acid line, written without a label.
        s: Symbol line, written without a label.

    Returns:
        Always True.
    """
    # The context manager guarantees the data is flushed and the handle is
    # closed; the previous version never closed the file.
    with open('results.txt', 'w') as results_file:
        results_file.writelines([
            'DNA:  ' + dna + "\n",
            'mRNA: ' + mrna + "\n",
            'tRNA: ' + trna + "\n",
            aa + "\n",
            s + "\n",
        ])
    return True
# Start the interactive program only when this file is executed as a script,
# so importing it (e.g. to reuse the helper functions) has no side effects.
if __name__ == "__main__":
    main()