StatReadsFusion.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. class StatReadsFusion:
  2. def __init__(self):
  3. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  4. def GetSitePos(self,site):
  5. f_chr=site.split("-")[0].split(':')[0]
  6. f_pos=site.split("-")[0].split(':')[1]
  7. e_chr=site.split("-")[1].split(':')[0]
  8. e_pos=site.split("-")[1].split(':')[1]
  9. return f_chr,f_pos,e_chr,e_pos
  10. def CheckSite(self,key_site,select_site,FLAG=False):
  11. key_site_info=self.GetSitePos(key_site)
  12. select_site_info=self.GetSitePos(select_site)
  13. if key_site_info[0] == select_site_info[0] and abs(int(key_site_info[1])-int(select_site_info[1])) <=5:
  14. if FLAG or key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5:
  15. return True
  16. elif key_site_info[0] == select_site_info[2] and abs(int(key_site_info[1])-int(select_site_info[3])) <=5:
  17. if FLAG or key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5:
  18. return True
  19. else:
  20. return False
  21. def UniqSiteID(self,site_list):
  22. site_dict_num={}
  23. total_num = len(site_list)
  24. for item in site_list:
  25. if item not in site_dict_num.keys():
  26. site_dict_num[item]=1
  27. else:
  28. site_dict_num[item]+=1
  29. seq_site_sort= sorted(site_dict_num.items(), key=lambda d:d[1], reverse = True)
  30. site_num=len(seq_site_sort)
  31. key_site =seq_site_sort[0]
  32. site_list_new=[key_site]
  33. count = int(key_site[1])
  34. for index in range(1,site_num):
  35. select_site=seq_site_sort[index]
  36. if not self.CheckSite(key_site[0],select_site[0]):
  37. site_list_new.append(select_site)
  38. else:
  39. count += int(select_site[1])
  40. #不超过1半,则不合并
  41. if count*2 >= total_num:
  42. return site_list_new
  43. else:
  44. return seq_site_sort
  45. def GetCombineSeq(self,key_seq,check_index,seq_count_sort):
  46. total_seq_num=len(seq_count_sort)
  47. key_seq_list=[]
  48. for index in range(total_seq_num):
  49. if index not in check_index:
  50. select_seq=seq_count_sort[index][0]
  51. if self.IsSameSeq(key_seq,select_seq):
  52. key_seq_list.append(select_seq)
  53. check_index.append(index)
  54. return key_seq_list
  55. def IsSameSeq(self,key_seq,select_seq):
  56. select_seq_rev_comp="".join([self.BASE_DICT[base] for base in select_seq][::-1])
  57. #flag = True if key_seq == "AGTGGTTTTCTACTCTAAATATAGTAACTAGTATCCTCCTGGCTGATCAGGGGGTGGGGA" and select_seq == "GTCTCTGAGTGGTTTTCTACTCTAAATATATTAAATAGTATCCTCCTGGCTGATCAGGGG" else False
  58. flag =False
  59. #select seq former
  60. select_seq_num = len(select_seq)
  61. select_seq_num_half = int(select_seq_num/2)
  62. for index in range(select_seq_num_half):
  63. select_seq_temp=select_seq[index:]
  64. temp_len=len(select_seq_temp)
  65. key_seq_temp=key_seq[:temp_len]
  66. if self.SeqOverlap(select_seq_temp,key_seq_temp,8):
  67. return True
  68. #select seq latter
  69. key_seq_num = len(key_seq)
  70. key_seq_num_half = int(key_seq_num/2)
  71. for index in range(key_seq_num_half):
  72. key_seq_temp=key_seq[index:]
  73. temp_len=len(key_seq_temp)
  74. select_seq_temp=select_seq[:temp_len]
  75. if self.SeqOverlap(select_seq_temp,key_seq_temp):
  76. return True
  77. #rev_comp select seq former
  78. for index in range(select_seq_num_half):
  79. select_seq_rev_comp_temp=select_seq_rev_comp[index:]
  80. temp_len=len(select_seq_rev_comp_temp)
  81. key_seq_temp=key_seq[:temp_len]
  82. if self.SeqOverlap(select_seq_rev_comp_temp,key_seq_temp):
  83. return True
  84. #rev_comp select seq latter
  85. for index in range(key_seq_num_half):
  86. key_seq_temp=key_seq[index:]
  87. temp_len=len(key_seq_temp)
  88. select_seq_rev_comp_temp=select_seq_rev_comp[:temp_len]
  89. if self.SeqOverlap(select_seq_rev_comp_temp,key_seq_temp):
  90. return True
  91. return False
  92. def SeqOverlap(self,seq1,seq2,MISMATCH=4):
  93. seq_len=min(len(seq1),len(seq2))
  94. count =0
  95. for index in range(seq_len):
  96. if seq1[index] != seq2[index]:
  97. count +=1
  98. if count >MISMATCH:
  99. return False
  100. return True
  101. def SinglePoint(self,primer_list,mismatch_seq,match_seq):
  102. primer_down_seq = primer_list[9]
  103. primer_pos_start = primer_list[3]
  104. primer_pos_end = primer_list[4]
  105. primer_chr = primer_list[2]
  106. primer_strand = primer_list[5]
  107. primer_seq = primer_list[1]
  108. primer_len = len(primer_seq)
  109. mismatch_len = len(mismatch_seq)
  110. temp_len = mismatch_len - primer_len
  111. if temp_len<5:
  112. return False,""
  113. for index in range(primer_len,10,-1):
  114. for i_index in range(temp_len,5,-1):
  115. primer_seq_temp = primer_seq[-1*index:]
  116. mismatch_seq_temp = mismatch_seq[-1*(i_index+index):-1*i_index]
  117. if self.SeqOverlap(primer_seq_temp,mismatch_seq_temp,2):
  118. primer_down_seq_temp=primer_down_seq[:i_index]
  119. mismatch_seq_temp1 = mismatch_seq[-1*i_index:]
  120. if self.SeqOverlap(primer_down_seq_temp,mismatch_seq_temp1,0):
  121. if primer_strand =="-1":
  122. point=int(primer_pos_start)-i_index
  123. point_e=point+30
  124. else:
  125. point=int(primer_pos_end)+i_index
  126. point_e=point-30
  127. down_seq = match_seq[:30]
  128. return True,primer_seq_temp+mismatch_seq_temp1+down_seq,primer_chr+":"+str(point),primer_chr+":"+str(point_e)
  129. return False,""
  130. def SingleCombine(slef,info_dict,seq):
  131. seq_len=len(seq)
  132. for index in range(seq_len):
  133. base = seq[index]
  134. if index in info_dict.keys():
  135. info_dict[index][base]+=1
  136. else:
  137. BASE_DICT={'A':0,'T':0,'G':0,'C':0,'N':0}
  138. BASE_DICT[base] +=1
  139. info_dict[index]=BASE_DICT
  140. def SingleCombineSeq(self,info_dict):
  141. new_seq=""
  142. total_count =0
  143. flag = True
  144. for index in info_dict.keys():
  145. temp_dict=info_dict[index]
  146. max_count =0
  147. max_base=""
  148. for base in temp_dict.keys():
  149. total_count = total_count+temp_dict[base] if flag else total_count
  150. if temp_dict[base] > max_count:
  151. max_base = base
  152. max_count=temp_dict[base]
  153. flag = False
  154. if max_count > 0.5 * total_count:
  155. new_seq=new_seq+max_base
  156. else:
  157. return new_seq
  158. return new_seq
  159. def CheckSingleSite(slef,site,double_site):
  160. single_info=site.split(":")
  161. single_chr = single_info[0]
  162. single_pos = int(single_info[1])
  163. double_info_temp = double_site.split("-")
  164. double_info1 = double_info_temp[0]
  165. double_info2 = double_info_temp[1]
  166. double_info1_temp = double_info1.split(":")
  167. double_info2_temp = double_info2.split(":")
  168. double_info1_chr = double_info1_temp[0]
  169. double_info1_pos = int(double_info1_temp[1])
  170. double_info2_chr = double_info2_temp[0]
  171. double_info2_pos = int(double_info2_temp[1])
  172. if single_chr == double_info1_chr and abs(double_info1_pos-single_pos) <7:
  173. return True
  174. if single_chr == double_info2_chr and abs(double_info2_pos-single_pos) < 7:
  175. return True
  176. return False
  177. def CheckSingleFusionSeq(self,new_seq,fusion_seq):
  178. new_seq_rev="".join([self.BASE_DICT[base] for base in new_seq])[::-1]
  179. new_seq_len=len(new_seq)
  180. fusion_seq_len=len(fusion_seq)
  181. #new seq---fusion seq
  182. for index in range(new_seq_len,35,-1):
  183. new_seq_temp = new_seq[-1*index:]
  184. new_seq_temp_len = len(new_seq_temp)
  185. if new_seq_temp_len > fusion_seq_len:
  186. continue
  187. fusion_seq_temp = fusion_seq[:new_seq_temp_len]
  188. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  189. return True
  190. #new seq rev comp--fusion seq
  191. for index in range(new_seq_len,35,-1):
  192. new_seq_temp = new_seq_rev[-1*index:]
  193. new_seq_temp_len = len(new_seq_temp)
  194. if new_seq_temp_len > fusion_seq_len:
  195. continue
  196. fusion_seq_temp = fusion_seq[:new_seq_temp_len]
  197. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  198. return True
  199. #fusion seq -- new seq
  200. for index in range(fusion_seq_len,35,-1):
  201. fusion_seq_temp = fusion_seq[-1*index:]
  202. fusion_seq_temp_len = len(fusion_seq_temp)
  203. if fusion_seq_temp_len>new_seq_len:
  204. continue
  205. new_seq_temp = new_seq[:fusion_seq_temp_len]
  206. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  207. return True
  208. #fusion seq -- new seq rev
  209. for index in range(fusion_seq_len,35,-1):
  210. fusion_seq_temp = fusion_seq[-1*index:]
  211. fusion_seq_temp_len = len(fusion_seq_temp)
  212. if fusion_seq_temp_len>new_seq_len:
  213. continue
  214. new_seq_temp = new_seq_rev[:fusion_seq_temp_len]
  215. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  216. return True
  217. return False
  218. def GetSinglePrimer(self,seq,primer_seq_dict):
  219. primer=""
  220. if seq in primer_seq_dict.keys():
  221. primer = primer_seq_dict[seq][0]
  222. else:
  223. for item in primer_seq_dict.keys():
  224. if len(item) != len(seq):
  225. continue
  226. if self.SeqOverlap(seq,item):
  227. primer=primer_seq_dict[item][0]
  228. return primer
  229. def StatFPPrimer(self,readsIDFile,readsPrimer_dict):
  230. with open(readsIDFile,'r') as rf:
  231. header=rf.readline().strip("\n").split("\t")
  232. ReadsID_index=header.index("ReadsID")
  233. FP_index=header.index("FP_name")
  234. for line in rf:
  235. info = line.strip("\n").split("\t")
  236. readsPrimer_dict[info[ReadsID_index]] = info[FP_index]
  237. class StatDoubleReadsFusion(StatReadsFusion):
  238. def __init__(self):
  239. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  240. def StatPrimer(self,readsIDFile,readsPrimer_dict):
  241. self.StatFPPrimer(readsIDFile,readsPrimer_dict)
  242. def StatDouble(self,infile,readsID_dict,site_dict):
  243. seq_site_dict={}
  244. seq_readsID_dict={}
  245. seq_site_info={}
  246. with open(infile,'r') as inputF:
  247. header=inputF.readline().strip("\n").split("\t")
  248. seq_index=header.index("Seq")
  249. ReadsID_index=header.index("ReadsID")
  250. site_index=header.index("Site")
  251. Site1_Down_index=header.index("Site1_Down")
  252. Site2_Down_index=header.index("Site2_Down")
  253. Overlap_index=header.index("Overlap")
  254. for line in inputF:
  255. info = line.strip("\n").split("\t")
  256. seq = info[seq_index]
  257. ID = info[ReadsID_index]
  258. site = info[site_index]
  259. Site1_Down = info[Site1_Down_index]
  260. Site2_Down = info[Site2_Down_index]
  261. Overlap = info[Overlap_index]
  262. seq_rev_comp="".join([self.BASE_DICT[base] for base in seq][::-1])
  263. if seq == "":
  264. continue
  265. seq_site_info[site]=[Site1_Down,Site2_Down,Overlap]
  266. if seq not in seq_site_dict.keys() and seq_rev_comp not in seq_site_dict.keys():
  267. seq_site_dict[seq]=[site]
  268. seq_readsID_dict[seq]=[ID]
  269. elif seq_rev_comp in seq_site_dict.keys():
  270. site_temp=site.split("-")
  271. site=site_temp[1]+'-'+site_temp[0]
  272. seq_site_dict[seq_rev_comp].append(site)
  273. seq_readsID_dict[seq_rev_comp].append(ID)
  274. elif seq in seq_site_dict.keys():
  275. seq_site_dict[seq].append(site)
  276. seq_readsID_dict[seq].append(ID)
  277. #sort by count
  278. seq_count_dict={}
  279. for seq in seq_site_dict.keys():
  280. count = len(seq_site_dict[seq])
  281. seq_count_dict[seq]=count
  282. seq_count_sort= sorted(seq_count_dict.items(), key=lambda d:d[1], reverse = True)
  283. #combine seq
  284. total_seq_num=len(seq_count_sort)
  285. check_index=[]
  286. site_dict_temp={}
  287. for index in range(total_seq_num):
  288. if index not in check_index:
  289. key_seq=seq_count_sort[index][0]
  290. check_index.append(index)
  291. combine_seq_list=self.GetCombineSeq(key_seq,check_index,seq_count_sort)
  292. site_dict_temp[key_seq]=seq_site_dict[key_seq]
  293. seq_ID_list=seq_readsID_dict[key_seq]
  294. for seq in combine_seq_list:
  295. seq_ID_list.extend(seq_readsID_dict[seq])
  296. site_list=seq_site_dict[seq]
  297. for site_item in site_list:
  298. site_dict_temp[key_seq].append(site_item)
  299. readsID_dict[key_seq]=seq_ID_list
  300. #combine site
  301. for key_seq in site_dict_temp.keys():
  302. site_list=site_dict_temp[key_seq]
  303. temp = self.UniqSiteID(site_list)
  304. site=temp[0][0]
  305. site_rev_temp=site.split("-")
  306. site_rev=site_rev_temp[1]+'-'+site_rev_temp[0]
  307. site_temp=seq_site_info[site] if site in seq_site_info else seq_site_info[site_rev]
  308. site_dict[key_seq]=[site]
  309. for item in site_temp:
  310. site_dict[key_seq].append(item)
  311. class SingleReadsFusion(StatReadsFusion):
  312. def __init__(self,home_dict,info_dict,primer_seq_dict):
  313. self.home_dict=home_dict
  314. self.info_dict=info_dict
  315. self.primer_seq_dict=primer_seq_dict
  316. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  317. def StatPrimer(self,readsIDFile,readsPrimer_dict):
  318. self.StatFPPrimer(readsIDFile,readsPrimer_dict)
  319. def CheckSite(self,site,double_site):
  320. return self.CheckSingleSite(site,double_site)
  321. def CheckFusionSeq(self,new_seq,fusion_seq):
  322. return self.CheckSingleFusionSeq(new_seq,fusion_seq)
  323. def GetSingleCombine(self,info_list):
  324. mismatch_dict={}
  325. match_dict={}
  326. for item in info_list:
  327. mismatch_seq=item[1][::-1]
  328. self.SingleCombine(mismatch_dict,mismatch_seq)
  329. match_seq=item[2]
  330. self.SingleCombine(match_dict,match_seq)
  331. mismatch_seq=self.SingleCombineSeq(mismatch_dict)[::-1]
  332. match_seq=self.SingleCombineSeq(match_dict)
  333. return mismatch_seq,match_seq
  334. def GetSinglePoint(self,readsID,mismatch_seq,match_seq):
  335. readsID_info = readsID.split("_")
  336. if len(readsID_info)<5:
  337. return False,[]
  338. FP_primer = readsID_info[3]
  339. RP_primer = readsID_info[4]
  340. FP_primer_rev = "".join([self.BASE_DICT[base] for base in FP_primer])[::-1]
  341. #RP_primer_rev = "".join([self.BASE_DICT[base] for base in RP_primer])[::-1]
  342. if FP_primer in mismatch_seq or FP_primer_rev in mismatch_seq:
  343. primer_seq = FP_primer
  344. #elif RP_primer in mismatch_seq or RP_primer_rev in mismatch_seq:
  345. # primer_seq = RP_primer
  346. else:
  347. return False,[]
  348. primer_ID = self.GetSinglePrimer(primer_seq,self.primer_seq_dict)
  349. if primer_ID == "":
  350. return False,[]
  351. primer_list = self.info_dict[primer_ID]
  352. results=self.SinglePoint(primer_list,mismatch_seq,match_seq)
  353. final_point_list=[]
  354. if results[0]:
  355. temp=list(results)[1:]
  356. final_point_list.append(temp)
  357. #final_point_list.append(primer_ID)
  358. if primer_ID in self.home_dict.keys():
  359. primer_homo_list = self.home_dict[primer_ID]
  360. for ID in primer_homo_list:
  361. ID_list = self.info_dict[ID]
  362. results=self.SinglePoint(ID_list,mismatch_seq,match_seq)
  363. if results[0]:
  364. temp=list(results)[1:]
  365. final_point_list.append(temp)
  366. #final_point_list.append(ID)
  367. return True,final_point_list