From 853e8a9d5def7a4827820331de501bdd67e8f5dd Mon Sep 17 00:00:00 2001 From: Jani Alinikula Date: Sat, 6 Mar 2021 20:13:18 +0200 Subject: [PATCH] Add tree-based analysis for Y haplogroup detection --- haploy.py | 487 +++++++++++++++++++++++++++++++++++++++++++- haploy_db_import.py | 9 +- haploy_find.py | 154 ++++++++++---- snipsa-gui.py | 35 +++- 4 files changed, 626 insertions(+), 59 deletions(-) diff --git a/haploy.py b/haploy.py index c1e0fb5..de1d054 100755 --- a/haploy.py +++ b/haploy.py @@ -1,7 +1,300 @@ import re import csv import os -#import pickle +import copy +import snpload +from bs4 import BeautifulSoup +import urllib.request + +def print_uptree(snpset, ut, do_print=True, b3x='b37'): + rep='' + y=snpset['Y']; + pos=0 + neg=0 + tot=0 + for i, mut in enumerate(ut): + txt='' + if 'txt' in mut: + txt=mut['txt'] + if mut[b3x] in y: + rep += "%-1s%-8s %s %-19s %-38s %s\n"%(mut['tag'], mut['g'], y[mut[b3x]]['gen'], mut['isog'], mut['raw'], txt) + else: + rep += "%-1s%-8s %s %-19s %-38s %s\n"%(mut['tag'], mut['g'], ' ', mut['isog'], mut['raw'], txt) + pass + if do_print: + print(rep) + return rep + +def print_extras(snpset, bt, do_print=True): + rep='' + last='' + out=[] + outs='' + for e in bt['extras']: + out.append(e['raw']) + outs += e['raw'] + ' ' + last=e['raw'] + rep += 'Extra: '+outs+'\n' + if do_print: + print(rep) + return rep + +#TODO +def print_all(snpset, bt, do_print=True): + rep='' + last='' + out=[] + outs='' + c=0 + for e in bt['all']: + out.append(e['raw']) + outs += '%-40s'%(e['raw']+',') + c+=1 + if c == 3: + outs += '\n' + c=0 + last=e['raw'] + rep += 'All:\n'+outs+'\n' + if do_print: + print(rep) + return rep + +def print_data(do_print=True): + rep='' + + rep += 'Based on data from yfull.com on 2020-03-06 (CC-BY) and https://isogg.org/tree/ Y-DNA Tree 2019-2020\n' + + if do_print: + print(rep) + return rep + +def path_str(ut, n): + rep='' + + c=0 + prev='' + #for i, mut in enumerate(ut): + for mut in reversed(ut): + if mut['g'] != prev: + if c > 0: + rep = ' -> ' + rep + else: + rep = ' (ISOGG: %s)'%mut['isog'] + rep = mut['g'] + rep + c+=1 + prev = mut['g'] + if c >= n: + break + if not '-' in mut['g']: + break + + return rep + +def report(fname, n, do_uptree=True, do_extra=True, do_all=False, filt='', force=''): + rep='' + snpset, meta = snpload.load(fname, ['Y']) + if 'Y' not in snpset: + return "No Y data found\n" + + rep += print_data(False) + + rep += "%s: Total SNPs: %d\n"%(fname, meta['total']) + + b3x='b36' + if meta['build']==37: + b3x='b37' + if meta['build']==38: + b3x='b38' + best_trees = yfind2(snpset, n, filt, force, b3x) + + for bt in best_trees: + if do_all: + rep += print_all(snpset, bt, False) + + if do_uptree: + rep += print_uptree(snpset, bt['ut'], False, b3x) + + leaf_mut = bt['ut'][len(bt['ut'])-1] + #rep += "Result (%-8s %5.1f%% -%d +%d): %-8s (ISOGG: %s)\n"%(leaf_mut['raw'], bt['score'], bt['neg'], len(bt['extras']), leaf_mut['g'], leaf_mut['isog']) + rep += "Result (%-8s %5.1f%% -%d +%d): %-8s\n"%(leaf_mut['raw'], bt['score'], bt['neg'], len(bt['extras']), leaf_mut['g']) + #rep += "%s (ISOGG: %s)\n"%(path_str(bt['ut'], 15), leaf_mut['isog']) + rep += "%s\n"%(path_str(bt['ut'], 20)) + + if do_extra: + rep += print_extras(snpset, bt, False) + return rep + + +#Create a list of mutations on a path upwards from a mutation +def yfind_uptree(snpset, find_g): + found = 0 + uptree = [] + sames = [] + sameg = '' + for mut in reversed(haplo_muts_list): + if found == 0: + if mut['g'] == sameg: + sames.append(mut) + else: + sameg = mut['g'] + sames = [] + sames.append(mut) + if mut['g'] == find_g: #mut_leaf['g']: #TODO: g? + found = 1 + depth = mut['l'] + g = mut['g'] + uptree.extend(reversed(sames)) + else: + if mut['g'] == g: + uptree.insert(0, mut) + continue + if mut['l'] < depth: + depth = mut['l'] + g = mut['g'] + uptree.insert(0, mut) + return uptree + +def ymatch(snpset, mut, b3x): + y=snpset['Y']; + if mut['t'] == y[mut[b3x]]['gen']: + return True + else: + return False + +# Finds n best Y-DNA matches by tracing all possible mutation paths in database +def yfind2(snpset, nbest=5, filt='', force='', b3x='b37'): + mt=snpset['Y']; + all_mutations = [] + uptrees=[] + allmuts=[] + #TODO: support more exotic mutations? + + #find the route uptree for each mutation in snpset + for mut in haplo_muts_list: + pos = mut[b3x] + if pos in mt: + #print(mt[pos], mut) + if mt[pos]['gen'] == mut['t']: + #print(mt[pos], mut) + #if mut['!']==0: + all_mutations.append(mut) + uptree = yfind_uptree(snpset, mut['g']) + #print(uptree) + #print_uptree(snpset, uptree, do_print=True, b3x=b3x) + uptrees.append(uptree) + allmuts.append(mut) + + #if there is a forced path set, find a path for it too + if force: + uptree = yfind_uptree(snpset, force) + uptrees.append(uptree) + + #find the uptree route that is most consistent + #random mutations can have many negative matches above them until the path reaches the common + #common segment with the correct ones + best_trees=[] + for ut in uptrees: + verbose=0 + pos=0 + neg=0 + tot=0 + bm=[] + dbm=[] + tbm=[] + #prepare list of back mutations that need special handling + #for mut in ut: + # #TODO triple bm + # if mut['!'] == 1: + # bm.append(mut) + # if mut['!'] == 2: + # dbm.append(mut) + # if mut['!'] == 3: + # tbm.append(mut) + tag='' + ut_copy=[] + for i, mut in enumerate(ut): + if mut[b3x] in mt: + bm_matched = 0 #bm_match(snpset, i, ut, bm) + dbm_matched = 0 #bm_match(snpset, i, ut, dbm) + if ymatch(snpset, mut, b3x): + if bm_matched: + ##Double back mutation + if verbose: print("!!%-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw'])) + tag='!!' + else: + #TODO: should not match double without a single back mutation + if verbose: print("+ %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw'])) + tag='+' + pos+=1 + tot+=1 + else: + if bm_matched: + if verbose: print("! %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw'])) + tag='!' + elif dbm_matched: + if verbose: print("!!%-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw'])) + tag='!!' + else: + if verbose: print("- %-12s %s %s"%(mut['g'], mt[mut['p']]['gen'], mut['raw'])) + tag='-' + neg+=1 + tot+=1 + else: + tag='' + #if verbose: print(" (%s)"%mut['g']) + pass + mut_copy = mut.copy() + mut_copy['tag']=tag + ut_copy.append(mut_copy) + + #find extras: mutations that are found in snpset but not in uptree + extras=allmuts.copy() + toremove={} + for mut in ut_copy: + toremove[mut['raw']]=1 + extras = filter(lambda e: not e['raw'] in toremove, extras) + extras = sorted(extras, key=lambda i: int(i[b3x])) + last = '' + de_duplicate=[] + for e in extras: + if last != e['raw']: # and e['!'] == 0: + de_duplicate.append(e) + last = e['raw'] + extras = de_duplicate + nextras = len(extras) + + #extract unique all mutations + last = '' + de_duplicate=[] + #for e in sorted(all_mutations, key=lambda x: int(re.compile(r'[^\d.]+').sub('', x['raw']))): + for e in sorted(all_mutations, key=lambda x: x['raw']): + if last != e['raw']: + de_duplicate.append(e) + last = e['raw'] + all_mutations = de_duplicate + + #calculate score giving penalty from negative matches, favoring longest matches + score = 100.0*(pos - 2.0*neg + 0.00*tot - 0.0*nextras)/(tot + nextras) + + bt={ + 'ut': ut_copy, + 'score': score, + 'extras': extras, + 'all': all_mutations, + 'pos': pos, + 'neg': neg, + 'tot': tot, + } + best_trees.append(bt) + + if filt: + if filt[0] == '=': + best_trees = filter(lambda bt: filt[1:] == bt['ut'][-1]['g'], best_trees) + else: + best_trees = filter(lambda bt: filt in bt['ut'][-1]['g'], best_trees) + best_trees=sorted(best_trees, key=lambda i: -i['score']) + + return best_trees[:nbest] def yfind(snpset): y=snpset['Y']; @@ -46,9 +339,16 @@ def _mk_sort_key(mut): pr = g[0]+'5' return pr + g + + + + + haplo_muts_by_b38 = {} haplo_muts_by_b37 = {} haplo_muts_by_b36 = {} +haplo_muts_by_name = {} +haplo_muts_list = [] #Imports database from ISOGG spreadsheet def load_snp(): @@ -68,38 +368,39 @@ def load_snp(): if len(sline) < 7: continue if len(sline[6]) < 3: - print(line) #TODO: fix other bugs in db + print('TODO1:', line) #TODO: fix other bugs in db continue try: #int(sline[4]) #may be missing int(sline[5]) except: - print(line) #TODO: ranges are not SNPs + print('TODO2:', line) #TODO: ranges are not SNPs #try: # sline[5] = re.findall(r'\d+', sline[5])[0] #except: continue - #if sline[0] == "PF6503": - # continue #TODO!! b37=sline[4] b38=sline[5] + mname=sline[0].split('.')[0] ##TODO test multiple hg with same mut #p=sline[6].replace('-.', '->',1).split('->') p=sline[6].split('->') if len(p) < 2: - print(line) #TODO: fix other bugs in db + print('TODO:', line) #TODO: fix other bugs in db continue else: p=p[1][0].upper() - mut = { - 'm': sline[0], + 'm': mname, 'mall': sline[0], 'g': sline[1], 'rs': sline[3], 'b37': b37, 'b38': b38, - 'p': sline[6][3], + 'p': p, } + #print(mname) + if mname not in haplo_muts_by_name: + haplo_muts_by_name[mname] = mut if b38 not in haplo_muts_by_b38: haplo_muts_by_b38[b38] = mut else: @@ -123,7 +424,7 @@ def convert_build38to36(): for line in f: con=line.split('->') if len(con) < 2: - print(line) + print('TODO:', line) continue b36 = con[1].split()[1] #print("Conv %s to %s"%(con[0].split()[1], con[1].split()[1])) @@ -150,7 +451,173 @@ def load_db(): haplo_muts_by_b37[mut['b37']] = mut haplo_muts_by_b38[mut['b38']] = mut +def save_db2(): + with open('haploy_map2.txt', 'w') as f: + for mut in haplo_muts_list: + print(mut, file = f) + +def load_db2(): + with open('haploy_map2.txt', 'r') as f: + for line in f: + mut = eval(line) + haplo_muts_list.append(mut) + +def show_db2(): + for m in sorted(haplo_muts_list, key=lambda e: int(e['b37'])): + print(m) + + +def decode_entry(e): + global haplo_muts_by_name + m={} + #print('decode_entry: ', e) + for e1 in e.split('/'): + #print(e1) + if e1 in haplo_muts_by_name: + #print(haplo_muts_by_name[e1]) + mut = haplo_muts_by_name[e1] + m['f']='' + if not 'isogg' in m: + m['isog']='' + m['isog']+=mut['g'] + m['t']=mut['p'] + m['b38']=mut['b38'] + m['b37']=mut['b37'] + if 'b36' in mut: + m['b36']=mut['b36'] + else: + continue + break + return m + +def yfull_fname(group): + if group: + return 'yfull/yfull-ytree-'+group+'.html' + else: + return 'yfull/yfull-ytree.html' + +def yfull_url(group): + if group: + return 'https://www.yfull.com/tree/' + group + '/' + else: + return 'https://www.yfull.com/tree/' + +# YFull mtree import (experimental) +def download_yfull_file(group): + try: + os.mkdir('yfull') + except OSError: + pass + fname = yfull_fname(group) + url = yfull_url(group) + print('Downloading ' + url + 'to file: ' + fname) + urllib.request.urlretrieve("https://www.yfull.com/tree/"+group+"/", fname); + +def yfull_parse_muts(li): + s='' + snpforhg=li.find('span', class_='yf-snpforhg', recursive=False) + if snpforhg: + s+=snpforhg.text + plussnps=li.find('span', class_='yf-plus-snps', recursive=False) + if plussnps: + s += ' * ' + plussnps['title'] + o=[] + if len(s) > 0: + for m in s.split('*'): + o.append(m.strip()) + return o + +def yfull_parse_age(li): + s='' + agespan=li.find('span', class_='yf-age', recursive=False) + if agespan: + s+=agespan.text + return s + +def yfull_recurse_list(ul_in, level, fileroot): + lis = ul_in.find_all('li', recursive=False) + for li in lis: + muts={} + muts['l']=level + g=li.find('a', recursive=False) + if g: + muts['g']=g.text + l=li.find('a', href=True, recursive=False) + if l: + muts['link']=l['href'] + #print(g.text, g['href']) + + mlist = yfull_parse_muts(li) + age = yfull_parse_age(li) + + #if 'g' in muts and not muts['g'].endswith('*') and not fileroot: + if 'g' in muts and not muts['g'].endswith('*'): + #add separate entries for each mut in same hg + #print(mlist) + for m in mlist: + mutse=dict(muts) + if 'link' in mutse: + del mutse['link'] + dec = decode_entry(m) + mutse['raw']=m + if not dec: + continue + if not dec['b37']: #TODO + print('TODO b37:', mutse, dec) + dec['b37'] = '0' + if not dec['b38']: #TODO + print('TODO b37:', mutse, dec) + dec['b38'] = '0' + if not 'b36' in dec: + print('TODO b36:', mutse, dec) + dec['b36'] = '0' + #muts['f']=dec['f'] + mutse['t']=dec['t'] + mutse['isog']=dec['isog'] + mutse['b36']=dec['b36'] + mutse['b37']=dec['b37'] + mutse['b38']=dec['b38'] + mutse['raw']=m + if age: + mutse['txt']=age + #print(mutse) + haplo_muts_list.append(mutse) + + ul = li.find('ul', recursive=False) + if ul: + #print('->') + yfull_recurse_list(ul, level+1, None) + #print('<-') + else: + if 'g' in muts and muts['g'].endswith('*'): + continue + if 'link' in muts: + group=muts['link'].split('/')[-2] + #print('FILE: ' +fname) + yfull_recurse_file(group, level) + #print('END: ' +fname) + return 0 + +def yfull_recurse_file(group, level): + fname = yfull_fname(group) + try: + with open(fname) as f: + pass + except OSError: + print('File not found: ' +fname) + download_yfull_file(group) + + with open(fname) as f: + print('Importing file: ' +fname) + soup = BeautifulSoup(f.read(), features="html.parser") + ul = soup.find('ul', id='tree') + yfull_recurse_list(ul, level, None) +def import_yfull_snp(): + #TODO complex muts + #yfull_recurse_file('', 0) #cannot use top level: muts missing + yfull_recurse_file('A00', 0) + yfull_recurse_file('A0-T', 0) diff --git a/haploy_db_import.py b/haploy_db_import.py index 8755698..3bbfee3 100755 --- a/haploy_db_import.py +++ b/haploy_db_import.py @@ -3,13 +3,20 @@ import os import haploy -print("Loading mutation DB from csv...") +print("Loading mutation DB from ISOGG csv...") haploy.load_snp() print("Running conversion to support build 36 matching...") haploy.convert_build38to36() +print("Importing mutation DB from YFull...") +haploy.import_yfull_snp() +#haploy.show_db2() + print("Writing local mutation DB...") haploy.save_db() +print("Writing local mutation DB2...") +haploy.save_db2() + print("Database import done!") diff --git a/haploy_find.py b/haploy_find.py index 2d95b8b..474e006 100755 --- a/haploy_find.py +++ b/haploy_find.py @@ -3,52 +3,120 @@ import os import snpload import haploy +import argparse +#configure how many candidates are shown: +n_single = 1 +n_multi = 5 +force='' +filt='' +all=False +new_yfind=1 -if len(sys.argv) < 2: - print(sys.argv[0]+" ") - print(sys.argv[0]+" ..") -elif len(sys.argv) < 3: - print("Loading DB...") - haploy.load_db() - print("DB loaded!") - print("Loading chr data...") - fname=sys.argv[1] - snpset, meta = snpload.load(fname, ['Y']) +parser = argparse.ArgumentParser() +parser.add_argument('-s', '--single', help='Analyse a path for single group') +parser.add_argument('-a', '--all', action='store_true', help='Show listing of all found mutations') +parser.add_argument('-n', '--num', help='Show num best matches') +parser.add_argument('file', nargs='+') + +args = parser.parse_args() +if args.single: + force = args.single + filt = '='+args.single +if args.num: + n_single = int(args.num) + n_multi = int(args.num) +if args.all: + all=True + +if len(args.file) < 2: - if 'Y' not in snpset: - print("No Y data found") - sys.exit() - - print("%s: Total SNPs: %d"%(fname,meta['total'])) - - found_mutations=haploy.yfind(snpset) - for m in found_mutations: - print("%-24s %-50s b38:%s %s"%(m['g'], m['mall'], m['b38'], m['rs'])) -else: - print("Loading DB...") - haploy.load_db() - print("DB loaded!") - lookfor = sys.argv[1].split(',') - for fname in sys.argv[2:]: + if new_yfind: + print("Loading DB2...") + haploy.load_db2() + print("DB loaded!") + rep = haploy.report(args.file[0], n_single, do_all=all, filt=filt, force=force) + print(rep) + else: + # keep old one available + print("Loading DB...") + haploy.load_db() + print("DB loaded!") + print("Loading chr data...") + fname=args.file[0] snpset, meta = snpload.load(fname, ['Y']) + if 'Y' not in snpset: - print('%s: no Y data'%fname) - continue - found_mutations = haploy.yfind(snpset) - found_mutations.reverse() - found=0 + print("No Y data found") + sys.exit() + + print("%s: Total SNPs: %d"%(fname,meta['total'])) + + found_mutations=haploy.yfind(snpset) for m in found_mutations: - for cmp in lookfor: - if m['g'].startswith(cmp) or cmp == '0': - found=1 - if cmp in m['mall']: - found=1 - if found: - print('%s: match found'%fname) - for m in found_mutations[0:9]: - print("%-24s %-50s b38:%s %s"%(m['g'], m['mall'], m['b38'], m['rs'])) - else: - print('%s: no match'%fname) - - + print("%-24s %-50s b38:%s %s"%(m['g'], m['mall'], m['b38'], m['rs'])) +else: + lookfor = args.file[0].split(',') + if new_yfind: + print("Loading DB...") + haploy.load_db2() + print("DB loaded!") + for fname in args.file[1:]: + snpset, meta = snpload.load(fname, ['Y']) + + if 'Y' not in snpset: + print('%s: no Y data'%fname) + continue + + print('Build: %d'%meta['build']) + b3x='b36' + if meta['build']==37: + b3x='b37' + if meta['build']==38: + b3x='b38' + best_trees = haploy.yfind2(snpset, n_multi, filt, force, b3x) + + found=0 + for bt in best_trees: + for cmp in lookfor: + leaf_mut = bt['ut'][len(bt['ut'])-1] + path = haploy.path_str(bt['ut'], 15) + if leaf_mut['g'].startswith(cmp) or cmp == '0': + found=1 + if cmp in leaf_mut['raw']: ##todo + found=1 + if cmp in path: + found=1 + if found: + print('%s: match found'%fname) + for bt in best_trees: + leaf_mut = bt['ut'][len(bt['ut'])-1] + #print("Result (%-8s %5.1f%% -%d +%d): %-8s (ISOGG: %s)"%(leaf_mut['raw'], bt['score'], bt['neg'], len(bt['extras']), haploy.path_str(bt['ut'], 15), leaf_mut['isog'])) + print("Result (%-8s %5.1f%% -%d +%d): %-8s"%(leaf_mut['raw'], bt['score'], bt['neg'], len(bt['extras']), leaf_mut['g'])) + print(" %s"%(haploy.path_str(bt['ut'], 20))) + else: + print('%s: no match'%fname) + else: + print("Loading DB...") + haploy.load_db() + print("DB loaded!") + for fname in args.file[1:]: + snpset, meta = snpload.load(fname, ['Y']) + if 'Y' not in snpset: + print('%s: no Y data'%fname) + continue + found_mutations = haploy.yfind(snpset) + found_mutations.reverse() + found=0 + for m in found_mutations: + for cmp in lookfor: + if m['g'].startswith(cmp) or cmp == '0': + found=1 + if cmp in m['mall']: + found=1 + if found: + print('%s: match found'%fname) + for m in found_mutations[0:9]: + print("%-24s %-50s b38:%s %s"%(m['g'], m['mall'], m['b38'], m['rs'])) + else: + print('%s: no match'%fname) diff --git a/snipsa-gui.py b/snipsa-gui.py index f136fac..e69c300 100755 --- a/snipsa-gui.py +++ b/snipsa-gui.py @@ -5,6 +5,7 @@ import tkinter.filedialog from tkinter import scrolledtext import haplomt +import haploy # # Win install: @@ -19,7 +20,14 @@ def handle_file_select(): fname = tkinter.filedialog.askopenfile().name fnamevar.set(fname) +dbmt_loaded = 0 +dby_loaded = 0 + def handle_findmt(): + global dbmt_loaded + if not dbmt_loaded: + dbmt_loaded = 1 + haplomt.load_db() do_uptree = report_snps.get() do_all = report_all.get() force = pathvar.get() @@ -34,13 +42,28 @@ def handle_findmt(): scr.config(state=tk.DISABLED) scr.see("end") -print("Loading DB...") -haplomt.load_db() -print("DB loaded!") +def handle_findy(): + global dby_loaded + if not dby_loaded: + dby_loaded = 1 + haploy.load_db2() + do_uptree = report_snps.get() + do_all = report_all.get() + force = pathvar.get() + fname = fnamevar.get() + print("Reporting file: "+fname) + num = int(numbox.get()) + rep = haploy.report(fname, num, do_uptree=do_uptree, do_extra=do_uptree, do_all=do_all, filt=force, force=force) + print("Done") + scr.config(state=tk.NORMAL) + scr.delete('1.0', tk.END) + scr.insert('1.0', rep) + scr.config(state=tk.DISABLED) + scr.see("end") window = tk.Tk() window.title("Snipsa GUI") -window.geometry("600x700") +window.geometry("900x700") window.minsize(600, 300) hdrframe=tk.Frame(window) @@ -83,7 +106,9 @@ def handle_findmt(): cframe=tk.Frame(hdrframe) cframe.pack() cbutton1 = tk.Button(cframe, text="Find MT", command=handle_findmt) -cbutton1.pack(fill='x', expand=True) +cbutton1.pack(side=tk.LEFT, fill='x', expand=True) +cbutton2 = tk.Button(cframe, text="Find Y", command=handle_findy) +cbutton2.pack(side=tk.LEFT, fill='x', expand=True) # Result area scr=scrolledtext.ScrolledText(window, wrap=tk.WORD)