"""List the table names referenced by each section of a SQL dump file."""
import codecs
import re

# Names that are never reported as tables. 'lawe' is used as a qualifier in
# front of a dot in the patterns below; 'icm' and 'icm_bpm' are presumably
# qualifiers of the same kind -- TODO confirm.
_IGNORED_NAMES = frozenset(('lawe', 'icm', 'icm_bpm'))

# Second-group values that are treated as table aliases rather than names.
_KNOWN_ALIASES = frozenset(('thd', 'AIiKi', 'XVuSA'))

# NOTE(review): the '.' in several two-group patterns is unescaped, so it
# matches any character (except a newline), not only a literal dot. The alias
# handling in _pick_name appears to rely on it matching the space in
# "table alias", so the patterns are kept exactly as they were.
_TABLE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'CREATE TABLE `(\w+)`',
        r'from lawe\.(\w+)',
        r'from `(\w+)`.`(\w+)`',
        r'from\r\n `(\w+)`.`(\w+)`',
        r'from (\w+)',
        r'LEFT JOIN (\w+)',
        r'LEFT JOIN (\w+).(\w+)',
        r'from (\w+)\.(\w+)',
        r'from\r\n\t(\w+)',
        r'inner join (\w+).(\w+)',
    )
)


def _pick_name(match):
    """Return the table name carried by one ``findall`` result.

    Single-group patterns yield a plain string, which is returned as is.
    Two-group patterns yield a ``(first, second)`` tuple.
    """
    if not isinstance(match, tuple):
        return match
    first, second = match[0], match[1]
    if first == 'lawe':
        # "lawe.<table>": the first group is only the qualifier.
        return second
    if len(second) == 1 or second in _KNOWN_ALIASES:
        # A one-character or known second group is an alias; the table name
        # is the first group.
        return first
    return second


def find_table_names(_data):
    """Print and return the table names found in a piece of SQL text.

    Every pattern in ``_TABLE_PATTERNS`` is applied to ``_data``
    (case-insensitively). Names starting with ``bi_`` are collected
    separately; names starting with ``v_bi_`` and the names in
    ``_IGNORED_NAMES`` are dropped.

    Args:
        _data: SQL text to scan.

    Returns:
        A ``(tables, bi_tables)`` tuple of sorted, de-duplicated lists.
        The ``bi_`` list is printed first, then the list of other tables.
    """
    tables = set()
    bi_tables = set()
    for pattern in _TABLE_PATTERNS:
        for match in pattern.findall(_data):
            name = _pick_name(match)
            if name.startswith('bi_'):
                bi_tables.add(name)
            elif not name.startswith('v_bi_') and name not in _IGNORED_NAMES:
                tables.add(name)

    # Sorted so the output is stable from run to run (set order is not).
    bi_result = sorted(bi_tables)
    result = sorted(tables)
    print(bi_result)
    print(result)
    return result, bi_result


def main(file_name='all.sql'):
    """Split ``file_name`` on the '-- (' marker and report each part's tables.

    For every part, the first line of the part is printed, followed by the
    output of ``find_table_names`` for that part.
    """
    # The context manager closes the file even if reading fails.
    with codecs.open(file_name, "r") as text_file:
        data = text_file.read()

    parts = data.split('-- (')
    print('parts', len(parts))
    for part in parts:
        print(part.partition('\n')[0])
        find_table_names(part)


if __name__ == '__main__':
    main()