Skip to main content
deleted 16 characters in body; edited title
Source Link
Jamal
  • 35.2k
  • 13
  • 134
  • 238

Python function to find specific regex in the text of an XML document - need clean up

The code is:

Python function to find specific regex in the text of an XML document - need clean up

The code is:

Python function to find specific regex in the text of an XML document

Source Link
Anna
  • 169
  • 5

Python function to find specific regex in the text of an XML document - need clean up

I'm writing code that, starting from an XML file:

  • stores the index of child elements of a tag and the child elements as key, values in a dictionary (function get_xml_by_tag_names);
  • deletes keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (def search_delete_append);
  • joins, for each dictionary, the dict values and extracts their text (def main);
  • replaces certain values with "" (def main);
  • counts the occurrences of the regexes I specify (def find_regex).

The main function is the problematic part and I need help cleaning it up: there are too many regexes, and I am considering creating a separate function for each regex inside the main function. Would that be a good option?

The code is:

import re
from xml.dom import minidom
from xml.etree import ElementTree as ET


def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
    """Map the index of each qualifying ``tag_name_1`` element to its XML.

    The document is parsed with ``minidom``. Every element named
    ``tag_name_1`` is numbered in document order, and only those that
    contain at least one element named ``tag_name_2`` are kept.

    Args:
        xml_path: File name (or file object) accepted by ``minidom.parse``.
        tag_name_1: Tag name of the container elements to enumerate.
        tag_name_2: Tag name that must occur inside a container for it to
            be included in the result.

    Returns:
        A dict whose keys are the positions of the kept containers among
        all ``tag_name_1`` elements and whose values are one-element lists
        holding the container serialized with ``toxml()``.
    """
    data = {}
    xml_tree = minidom.parse(xml_path)
    for idx, node in enumerate(xml_tree.getElementsByTagName(tag_name_1)):
        # Serialize each container once. Looping over the matching children
        # would rebuild the same string and overwrite the same key each time.
        if node.getElementsByTagName(tag_name_2):
            data[idx] = [node.toxml()]
    return data


def find_regex(regex, text):
    """Print the total number of matches of ``regex`` over several strings.

    Args:
        regex: Pattern (string or compiled pattern) passed to ``re.findall``.
        text: Iterable of strings; each one is searched independently and
            the per-string match counts are added together.

    Returns:
        None. The total is written to standard output.
    """
    total = sum(len(re.findall(regex, chunk)) for chunk in text)
    print("The number of {} matches is ".format(regex), total)

def find_regex_fasi(regex, text):
    """Print how many non-overlapping matches of ``regex`` occur in ``text``.

    Args:
        regex: Pattern (string or compiled pattern) to search for.
        text: A single string to search.

    Returns:
        None. The count is written to standard output.
    """
    occurrences = sum(1 for _ in re.finditer(regex, text))
    print("Numero di corpo minore è", occurrences)


def search_delete_append(dizionario, dizionariofasi, marker="7.489"):
    """Move every entry whose values mention ``marker`` into a second dict.

    Both dictionaries are modified in place.

    Args:
        dizionario: Dict mapping keys to iterables of strings. Entries with
            at least one string containing ``marker`` are removed from it.
        dizionariofasi: Dict that receives the removed entries. Each moved
            key maps to the last of its strings that contains ``marker``
            (a single string, not a list).
        marker: Substring that selects the entries to move. Defaults to
            the value the script was originally hard-coded with.

    Returns:
        None.
    """
    # Iterate over a snapshot of the keys so entries can be deleted safely.
    for key in list(dizionario):
        matching = [value for value in dizionario[key] if marker in value]
        if matching:
            dizionariofasi[key] = matching[-1]
            # Delete once per key: several matching values under the same
            # key must not trigger a second deletion (KeyError).
            del dizionario[key]


# Punctuation stripped from the extracted text before the patterns are applied.
_PUNCTUATION_TABLE = str.maketrans("", "", "<>.,;-!:’?=|()")

# "] 1 words 2 T" -- reported first for both text sets.
_FASE_12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")

# Patterns reported for both text sets, in reporting order.
_SHARED_PATTERNS = (
    re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T"),
    re.compile(r"\]\s([\w\s]+)\s[→]\sT"),  # ] words → T
    re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT"),  # ] words da cui T
    re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])"),  # ] 1 words 2 words (excludes T)
    re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])"),  # ] prima 1 words 2 words (excludes T)
    re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)"),
    re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT"),  # ] prima 1 words 2 words 3 T
    re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])"),  # ] prima 1 word → 2 word
)

# Patterns reported only for the entries that were moved to the second
# dictionary, in reporting order.
# NOTE(review): ``[^3|^3T]`` and ``[^5|^5\sT]`` are character classes, not
# alternations -- they exclude the individual characters listed. Confirm that
# this is the intended meaning.
_FASI_ONLY_PATTERNS = (
    re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) "),
    re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT"),
    re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)"),
    re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT"),
    re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT"),
    re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)"),
    re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT"),
)

# Counted in the raw XML of the moved entries ("how many CM are there?").
_CORPO_MINORE = re.compile(r"10\.238")


def _extract_clean_text(xml_fragment):
    """Return the punctuation-free text of the root's direct ``text`` children."""
    root = ET.fromstring(xml_fragment)
    # An empty <text/> element has ``text`` set to None; treat it as "".
    joined = ' '.join(node.text or "" for node in root.findall('text'))
    return joined.translate(_PUNCTUATION_TABLE)


def _report_matches(patterns, texts):
    """Print the match count of every pattern over ``texts``, in order."""
    for pattern in patterns:
        find_regex(pattern, texts)


def main(xml_path='output2.xml'):
    """Count the occurrences of the phase patterns in an XML document.

    The ``new_line`` elements that contain ``text`` elements are collected,
    then split into two sets by ``search_delete_append``. For each set the
    text of the ``text`` children is extracted, stripped of punctuation,
    printed, and searched with the corresponding group of patterns. For the
    second set the number of ``10.238`` occurrences in the raw XML is
    printed first.

    Args:
        xml_path: Path of the XML file to analyse.

    Returns:
        None. All results are written to standard output.
    """
    dict_fasi = {}
    data = get_xml_by_tag_names(xml_path, 'new_line', 'text')
    search_delete_append(data, dict_fasi)

    testo = [_extract_clean_text(' '.join(value)) for value in data.values()]
    print(testo)
    _report_matches((_FASE_12T,) + _SHARED_PATTERNS, testo)

    # Values of dict_fasi are single XML strings, not lists.
    find_regex_fasi(_CORPO_MINORE, ' '.join(dict_fasi.values()))
    testo_fasi = [_extract_clean_text(fragment) for fragment in dict_fasi.values()]
    print(testo_fasi)
    _report_matches(
        (_FASE_12T,) + _FASI_ONLY_PATTERNS + _SHARED_PATTERNS, testo_fasi
    )



# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

I know the code is half in Italian right now, but I need to keep it that way for the moment, for my own clarity.