123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395 |
- class SearchDoublePoints:
- def __init__(self):
- self.BASE_DICT={'A':'T','G':'C','T':'A','C':'G','N':'N'}
-
- def GetMapInfo(self,info_str):
- #info_str:99M51S
- #return {0:'M99',1:'S51'}
- num_list=[]
- map_dict={}
- count=0
- for item in info_str:
- if item.isalpha():
- num=len(num_list)
- index=0
- for i in range(num):
- index += int(num_list[i])*10**(num-i-1)
- map_dict[count]=item+":"+str(index)
- count +=1
- num_list=[]
- elif item.isdigit():
- num_list.append(item)
- return map_dict
- #adjust D or I
- def AdjInfo(self,seq,map_info):
- flag = True
- for key in map_info.keys():
- item = map_info[key]
- if 'D' in item or 'I' in item:
- flag = False
- if 'H' in map_info[0]:
- flag = False
- #不需要调整
- if flag:
- return seq,map_info
- else:
- new_seq_list=[]
- new_info1={}
- start=0
- old_item=''
- for key in map_info.keys():
- item = map_info[key]
- temp_len=int(item.split(":")[1])
- end = start+temp_len
- if item.startswith('M') or item.startswith('S'):
- for index in range(start,end):
- new_seq_list.append(seq[index])
- old_item=item
- elif item.startswith('D') and old_item.startswith('M'):
- end = end-temp_len
- for index in range(temp_len):
- new_seq_list.append('-')
- elif item.startswith('H'):
- end = end-temp_len
- start = end
- #
- num=len(map_info)
- index =0
- new_index= 0
- while index+3 <= num:
- item1=map_info[index]
- item2=map_info[index+1]
- item3=map_info[index+2]
- if item1.startswith("M") and item2.startswith("D") and item3.startswith("M"):
- pos1=int(item1.split(":")[1])
- pos2=int(item2.split(":")[1])
- pos3=int(item3.split(":")[1])
- new_info1[new_index] = 'M:'+str(pos1+pos2+pos3)
- new_index +=1
- index +=3
- elif item1.startswith("M") and item2.startswith("I") and item3.startswith("M"):
- pos1=int(item1.split(":")[1])
- pos3=int(item3.split(":")[1])
- new_info1[new_index] = 'M:'+str(pos1+pos3)
- new_index +=1
- index +=3
- elif item1.startswith("H") and item2.startswith("M"):
- new_info1[new_index] = item2
- new_index +=1
- index +=2
- else:
- new_info1[new_index] = item1
- new_index += 1
- index += 1
-
- while index < num:
- new_info1[new_index]=map_info[index]
- new_index += 1
- index += 1
- return "".join(new_seq_list),new_info1
-
- def GetPoints(self,soft_info,hard_info,OVERLAP=10):
- #不分析有缺省值的
- if len(soft_info)<7 or len(hard_info)<7:
- return False,1
- #map_info:{0:'M:90',1:'S:60'}
- #seq:ATGC.....
- map_info1=self.GetMapInfo(soft_info[5])
- map_info2=self.GetMapInfo(hard_info[5])
- seq1=soft_info[6]
- seq2=hard_info[6]
- #矫正map info中有D,I的
- if 'D' in soft_info[5] or 'I' in soft_info[5]:
- info1_temp=self.AdjInfo(seq1,map_info1)
- seq1=info1_temp[0]
- map_info1=info1_temp[1]
-
- if 'D' in hard_info[5] or 'I' in hard_info[5]:
- info2_temp=self.AdjInfo(seq2,map_info2)
- seq2=info2_temp[0]
- map_info2=info2_temp[1]
- chr1=soft_info[2]
- pos1=int(soft_info[3])
- chr2=hard_info[2]
- pos2=int(hard_info[3])
- #point1
- soft_stat=""
- match_ms=0
- point_ms=0
- match_sm=0
- point_sm=0
- match_sms=0
- skip_sms_f=0
- skip_sms_e=0
- point_sms_f=0
- point_sms_e=0
- if len(map_info1)==2:
- #{0: 'M:131', 1: 'S:19'}
- if 'S' in map_info1[1]:
- soft_stat="MS"
- match_ms=int(map_info1[0].split(":")[1])
- point_ms=pos1+match_ms-1
- #{0:'S:19',1:'M:131'}
- elif 'S' in map_info1[0]:
- soft_stat="SM"
- match_sm=int(map_info1[0].split(":")[1])
- point_sm=pos1
- else:
- return False,2
-
- elif len(map_info1)==3:
- #{0: 'S:51', 1: 'M:99', 2:'S:5'}
- if 'S' in map_info1[0] and 'M' in map_info1[1] and 'S' in map_info1[2]:
- soft_stat="SMS"
- match_sms=int(map_info1[1].split(":")[1])
- skip_sms_f=int(map_info1[0].split(":")[1])
- skip_sms_e=int(map_info1[2].split(":")[1])
- point_sms_f=pos1
- point_sms_e=pos1+match_sms-1
- else:
- return False,3
- else:
- return False,4
- #point2
- hard_stat=""
- match_mh=0
- skip_mh=0
- point_mh=0
- skip_hm=0
- point_hm=0
- match_hmh=0
- skip_hmh_f=0
- skip_hmh_e=0
- point_hmh_f=0
- point_hmh_e=0
- if len(map_info2)==2:
- #{0:'M:31',1:'H:50'}
- if 'H' in map_info2[1]:
- hard_stat="MH"
- match_mh=int(map_info2[0].split(":")[1])
- skip_mh=int(map_info2[1].split(":")[1])
- point_mh=pos2+match_mh-1
- #{0:'H:50',1:'M:31'}
- elif 'H' in map_info2[1]:
- hard_stat="HM"
- skip_hm=int(map_info2[0].split(":")[1])
- point_hm=pos2
- else:
- return False,5
-
- elif len(map_info2)==3:
- #{0:'H:30',1:"M:100",2:"H:15"}
- if 'H' in map_info2[0] and 'M' in map_info2[1] and 'H' in map_info2[2]:
- hard_stat="HMH"
- match_hmh=int(map_info2[1].split(":")[1])
- skip_hmh_f=int(map_info2[0].split(":")[1])
- skip_hmh_e=int(map_info2[2].split(":")[1])
- point_hmh_f=pos2
- point_hmh_e=pos2+match_hmh-1
- else:
- return False,6
- else:
- return False,7
- seq1_rev="".join([self.BASE_DICT[base] for base in seq1][::-1])
- #确定points和fusion_seq
- point1=0
- point1_e=0
- point2=0
- point2_e=0
- overlap=0
- fusion_seq=""
- if soft_stat =="MS":
- point1=point_ms
- point1_e=point1-30
- fusion_seq=seq1[match_ms-30:match_ms+30]
- if hard_stat == "MH":
- if match_ms >= skip_mh:
- overlap = match_ms - skip_mh
- else:
- return False,8
- point2=point_mh-overlap
- point2_e=point2-30
- elif hard_stat == "HM":
- if match_ms >= skip_hm:
- overlap = match_ms - skip_hm
- else:
- return False,9
- point2=point_hm+overlap
- point2_e=point2+30
- elif hard_stat == "HMH":
- seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
- seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
- if seq2 == seq1_temp:
- if match_ms >= skip_hmh_f:
- overlap = match_ms - skip_hmh_f
- else:
- return False,10
- point2=point_hmh_f+overlap
- point2_e=point2+30
- elif seq2 == seq1_rev_temp:
- if match_ms >= skip_hmh_e:
- overlap = match_ms -skip_hmh_e
- else:
- return False,11
- point2=point_hmh_e-overlap
- point2_e=point2-30
- else:
- return False,12
- else:
- return False,13
- elif soft_stat == "SM":
- fusion_seq=seq1[match_sm-30:match_sm+30]
- point1=point_sm
- point1_e=point1+30
- if hard_stat == "HM":
- if match_sm >=skip_hm:
- overlap = match_sm - skip_hm
- else:
- return False,14
- point2=point_hm+overlap
- point2_e=point2+30
- elif hard_stat == "MH":
- if match_sm >= skip_mh:
- overlap = match_sm - skip_mh
- else:
- return False,15
- point2=point_mh-overlap
- point2_e=point2-30
- elif hard_stat == "HMH":
- seq1_temp=seq1[skip_hmh_f:skip_hmh_f+match_hmh]
- seq1_rev_temp=seq1_rev[skip_hmh_f:skip_hmh_f+match_hmh]
- if seq2 == seq1_temp:
- if match_ms >= skip_hmh_e:
- overlap = match_ms - skip_hmh_e
- else:
- return False,16
- point2=point_hmh_e-overlap
- point2_e=point2-30
- elif seq2 == seq1_rev_temp:
- if match_ms >= skip_hmh_f:
- overlap = match_ms - skip_hmh_f
- else:
- return False,17
- point2=point_hmh_f+overlap
- point2_e=point2+30
- else:
- return False,18
- elif soft_stat == "SMS":
- if hard_stat == "HM":
- seq1_temp=seq1[skip_hm:]
- seq1_rev_temp=seq1_rev[skip_hm:]
- if seq2 == seq1_temp:
- point1=point_sms_e
- point1_e=point1-30
- fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
- if skip_sms_f+match_sms >= skip_hm:
- overlap = skip_sms_f+match_sms - skip_hm
- else:
- return False,19
- elif seq2 == seq1_rev_temp:
- point1=point_sms_f
- point1_e=point1+30
- fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
- if skip_sms_e+match_sms >= skip_hm:
- overlap = skip_sms_e+match_sms - skip_hm
- else:
- return False,20
- else:
- return False,21
- point2=point_hm+overlap
- point2_e=point2+30
- elif hard_stat == "MH":
- seq1_temp=seq1[:match_mh]
- seq1_rev_temp=seq1_rev[:match_mh]
- if seq2 == seq1_temp:
- point1=point_sms_f
- point1_e=point1+30
- fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
- if skip_sms_e+match_sms >= skip_mh:
- overlap = skip_sms_e+match_sms - skip_mh
- else:
- return False,22
- elif seq2 == seq1_rev_temp:
- point1=point_sms_e
- point1_e=point1-30
- fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
- if skip_sms_f+match_sms >= skip_hm:
- overlap = skip_sms_f+match_sms - skip_hm
- else:
- return False,23
- else:
- return False,24
- point2=point_hm-overlap
- point2_e=point2-30
- elif hard_stat == "HMH":
- if skip_hmh_f < skip_sms_f and skip_sms_f <= skip_hmh_f + match_hmh:
- seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
- if seq2 == seq_temp:
- point1 = point_sms_f
- point1_e = point1 + 30
- fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
- overlap = skip_hmh_f + match_hmh - skip_sms_f
- point2 = point_hmh_e - overlap
- point2_e = point2 -30
- else:
- return False,25
- elif skip_hmh_f >= skip_sms_f and skip_hmh_f <= skip_sms_f + match_sms:
- seq_temp = seq1[skip_hmh_f:skip_hmh_f + match_hmh]
- if seq2 == seq_temp:
- point1 = point_sms_e
- point1_e = point1 - 30
- fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
- overlap = skip_sms_f + match_sms - skip_hmh_f
- point2 = point_hmh_f + overlap
- point2_e = point2 +30
- return False,26
- elif skip_sms_e > skip_hmh_e and skip_sms_e <= skip_hmh_f+match_hmh:
- seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
- if seq2 == seq_temp:
- point1 = point_sms_e
- point1_e = point1 - 30
- fusion_seq=seq1[skip_sms_f+match_sms-30:skip_sms_f+match_sms+30]
- overlap = skip_hmh_f+match_hmh - skip_sms_e
- point2 = point_hmh_e - overlap
- point2_e = point2 - 30
- else:
- return False,27
- elif skip_hmh_f > skip_sms_e and skip_hmh_f <= skip_sms_e+match_sms:
- seq_temp = seq1_rev[skip_hmh_f:skip_hmh_f + match_hmh]
- if seq2 == seq_temp:
- point1 = point_sms_f
- point1_e = point1 + 30
- fusion_seq=seq1[skip_sms_f-30:skip_sms_f+30]
- overlap = skip_sms_e+match_sms - skip_hmh_f
- point2 = point_hmh_e + overlap
- point2_e = point2 + 30
- else:
- return False,28
- return False,29
- else:
- return False,30
- else:
- return False,31
-
-
- if abs(overlap) <= OVERLAP and point1 !=0:
- return True,chr1+":"+str(point1),chr2+":"+str(point2),chr1+":"+str(point1_e),fusion_seq,str(overlap),chr1+":"+str(point1_e),chr2+":"+str(point2_e)
- else:
- return False,32
|