StatReadsFusion.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. class StatReadsFusion:
  2. def __init__(self):
  3. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  4. def GetSitePos(self,site):
  5. f_chr=site.split("-")[0].split(':')[0]
  6. f_pos=site.split("-")[0].split(':')[1]
  7. e_chr=site.split("-")[1].split(':')[0]
  8. e_pos=site.split("-")[1].split(':')[1]
  9. return f_chr,f_pos,e_chr,e_pos
  10. def CheckSite(self,key_site,select_site,FLAG=False):
  11. key_site_info=self.GetSitePos(key_site)
  12. select_site_info=self.GetSitePos(select_site)
  13. if key_site_info[0] == select_site_info[0] and abs(int(key_site_info[1])-int(select_site_info[1])) <=5:
  14. if FLAG or key_site_info[2] == select_site_info[2] and abs(int(key_site_info[3])-int(select_site_info[3])) <=5:
  15. return True
  16. elif key_site_info[0] == select_site_info[2] and abs(int(key_site_info[1])-int(select_site_info[3])) <=5:
  17. if FLAG or key_site_info[2] == select_site_info[0] and abs(int(key_site_info[3])-int(select_site_info[1])) <=5:
  18. return True
  19. else:
  20. return False
  21. def UniqSiteID(self,site_list):
  22. site_dict_num={}
  23. total_num = len(site_list)
  24. for item in site_list:
  25. if item not in site_dict_num.keys():
  26. site_dict_num[item]=1
  27. else:
  28. site_dict_num[item]+=1
  29. seq_site_sort= sorted(site_dict_num.items(), key=lambda d:d[1], reverse = True)
  30. site_num=len(seq_site_sort)
  31. key_site =seq_site_sort[0]
  32. site_list_new=[key_site]
  33. count = int(key_site[1])
  34. for index in range(1,site_num):
  35. select_site=seq_site_sort[index]
  36. if not self.CheckSite(key_site[0],select_site[0]):
  37. site_list_new.append(select_site)
  38. else:
  39. count += int(select_site[1])
  40. #不超过1半,则不合并
  41. if count*2 >= total_num:
  42. return site_list_new
  43. else:
  44. return seq_site_sort
  45. def GetCombineSeq(self,key_seq,check_index,seq_count_sort):
  46. total_seq_num=len(seq_count_sort)
  47. key_seq_list=[]
  48. for index in range(total_seq_num):
  49. if index not in check_index:
  50. select_seq=seq_count_sort[index][0]
  51. if self.IsSameSeq(key_seq,select_seq):
  52. key_seq_list.append(select_seq)
  53. check_index.append(index)
  54. return key_seq_list
  55. def IsSameSeq(self,key_seq,select_seq):
  56. select_seq_rev_comp="".join([self.BASE_DICT[base] for base in select_seq][::-1])
  57. #flag = True if key_seq == "AGTGGTTTTCTACTCTAAATATAGTAACTAGTATCCTCCTGGCTGATCAGGGGGTGGGGA" and select_seq == "GTCTCTGAGTGGTTTTCTACTCTAAATATATTAAATAGTATCCTCCTGGCTGATCAGGGG" else False
  58. flag =False
  59. #select seq former
  60. for index in range(8):
  61. select_seq_temp=select_seq[index:]
  62. temp_len=len(select_seq_temp)
  63. key_seq_temp=key_seq[:temp_len]
  64. if self.SeqOverlap(select_seq_temp,key_seq_temp,8):
  65. return True
  66. #select seq latter
  67. for index in range(8):
  68. key_seq_temp=key_seq[index:]
  69. temp_len=len(key_seq_temp)
  70. select_seq_temp=select_seq[:temp_len]
  71. if self.SeqOverlap(select_seq_temp,key_seq_temp):
  72. return True
  73. #rev_comp select seq former
  74. for index in range(6):
  75. select_seq_rev_comp_temp=select_seq_rev_comp[index:]
  76. temp_len=len(select_seq_rev_comp_temp)
  77. key_seq_temp=key_seq[:temp_len]
  78. if self.SeqOverlap(select_seq_rev_comp_temp,key_seq_temp):
  79. return True
  80. #rev_comp select seq latter
  81. for index in range(5):
  82. key_seq_temp=key_seq[index:]
  83. temp_len=len(key_seq_temp)
  84. select_seq_rev_comp_temp=select_seq_rev_comp[:temp_len]
  85. if self.SeqOverlap(select_seq_rev_comp_temp,key_seq_temp):
  86. return True
  87. return False
  88. def SeqOverlap(self,seq1,seq2,MISMATCH=4):
  89. seq_len=min(len(seq1),len(seq2))
  90. count =0
  91. for index in range(seq_len):
  92. if seq1[index] != seq2[index]:
  93. count +=1
  94. if count >MISMATCH:
  95. return False
  96. return True
  97. def SinglePoint(self,primer_list,mismatch_seq,match_seq):
  98. primer_down_seq = primer_list[9]
  99. primer_pos_start = primer_list[3]
  100. primer_pos_end = primer_list[4]
  101. primer_chr = primer_list[2]
  102. primer_strand = primer_list[5]
  103. primer_seq = primer_list[1]
  104. primer_len = len(primer_seq)
  105. mismatch_len = len(mismatch_seq)
  106. temp_len = mismatch_len - primer_len
  107. if temp_len<5:
  108. return False,""
  109. for index in range(primer_len,10,-1):
  110. for i_index in range(temp_len,5,-1):
  111. primer_seq_temp = primer_seq[-1*index:]
  112. mismatch_seq_temp = mismatch_seq[-1*(i_index+index):-1*i_index]
  113. if self.SeqOverlap(primer_seq_temp,mismatch_seq_temp,2):
  114. primer_down_seq_temp=primer_down_seq[:i_index]
  115. mismatch_seq_temp1 = mismatch_seq[-1*i_index:]
  116. if self.SeqOverlap(primer_down_seq_temp,mismatch_seq_temp1,0):
  117. if primer_strand =="-1":
  118. point=int(primer_pos_start)-i_index
  119. point_e=point+30
  120. else:
  121. point=int(primer_pos_end)+i_index
  122. point_e=point-30
  123. down_seq = match_seq[:30]
  124. return True,primer_seq_temp+mismatch_seq_temp1+down_seq,primer_chr+":"+str(point),primer_chr+":"+str(point_e)
  125. return False,""
  126. def SingleCombine(slef,info_dict,seq):
  127. seq_len=len(seq)
  128. for index in range(seq_len):
  129. base = seq[index]
  130. if index in info_dict.keys():
  131. info_dict[index][base]+=1
  132. else:
  133. BASE_DICT={'A':0,'T':0,'G':0,'C':0,'N':0}
  134. BASE_DICT[base] +=1
  135. info_dict[index]=BASE_DICT
  136. def SingleCombineSeq(self,info_dict):
  137. new_seq=""
  138. total_count =0
  139. flag = True
  140. for index in info_dict.keys():
  141. temp_dict=info_dict[index]
  142. max_count =0
  143. max_base=""
  144. for base in temp_dict.keys():
  145. total_count = total_count+temp_dict[base] if flag else total_count
  146. if temp_dict[base] > max_count:
  147. max_base = base
  148. max_count=temp_dict[base]
  149. flag = False
  150. if max_count > 0.5 * total_count:
  151. new_seq=new_seq+max_base
  152. else:
  153. return new_seq
  154. return new_seq
  155. def CheckSingleSite(slef,site,double_site):
  156. single_info=site.split(":")
  157. single_chr = single_info[0]
  158. single_pos = int(single_info[1])
  159. double_info_temp = double_site.split("-")
  160. double_info1 = double_info_temp[0]
  161. double_info2 = double_info_temp[1]
  162. double_info1_temp = double_info1.split(":")
  163. double_info2_temp = double_info2.split(":")
  164. double_info1_chr = double_info1_temp[0]
  165. double_info1_pos = int(double_info1_temp[1])
  166. double_info2_chr = double_info2_temp[0]
  167. double_info2_pos = int(double_info2_temp[1])
  168. if single_chr == double_info1_chr and abs(double_info1_pos-single_pos) <7:
  169. return True
  170. if single_chr == double_info2_chr and abs(double_info2_pos-single_pos) < 7:
  171. return True
  172. return False
  173. def CheckSingleFusionSeq(self,new_seq,fusion_seq):
  174. new_seq_rev="".join([self.BASE_DICT[base] for base in new_seq])[::-1]
  175. new_seq_len=len(new_seq)
  176. fusion_seq_len=len(fusion_seq)
  177. #new seq---fusion seq
  178. for index in range(new_seq_len,35,-1):
  179. new_seq_temp = new_seq[-1*index:]
  180. new_seq_temp_len = len(new_seq_temp)
  181. if new_seq_temp_len > fusion_seq_len:
  182. continue
  183. fusion_seq_temp = fusion_seq[:new_seq_temp_len]
  184. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  185. return True
  186. #new seq rev comp--fusion seq
  187. for index in range(new_seq_len,35,-1):
  188. new_seq_temp = new_seq_rev[-1*index:]
  189. new_seq_temp_len = len(new_seq_temp)
  190. if new_seq_temp_len > fusion_seq_len:
  191. continue
  192. fusion_seq_temp = fusion_seq[:new_seq_temp_len]
  193. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  194. return True
  195. #fusion seq -- new seq
  196. for index in range(fusion_seq_len,35,-1):
  197. fusion_seq_temp = fusion_seq[-1*index:]
  198. fusion_seq_temp_len = len(fusion_seq_temp)
  199. if fusion_seq_temp_len>new_seq_len:
  200. continue
  201. new_seq_temp = new_seq[:fusion_seq_temp_len]
  202. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  203. return True
  204. #fusion seq -- new seq rev
  205. for index in range(fusion_seq_len,35,-1):
  206. fusion_seq_temp = fusion_seq[-1*index:]
  207. fusion_seq_temp_len = len(fusion_seq_temp)
  208. if fusion_seq_temp_len>new_seq_len:
  209. continue
  210. new_seq_temp = new_seq_rev[:fusion_seq_temp_len]
  211. if self.SeqOverlap(fusion_seq_temp,new_seq_temp):
  212. return True
  213. return False
  214. def GetSinglePrimer(self,seq,primer_seq_dict):
  215. primer=""
  216. if seq in primer_seq_dict.keys():
  217. primer = primer_seq_dict[seq][0]
  218. else:
  219. for item in primer_seq_dict.keys():
  220. if len(item) != len(seq):
  221. continue
  222. if self.SeqOverlap(seq,item):
  223. primer=primer_seq_dict[item][0]
  224. return primer
  225. def StatFPPrimer(self,readsIDFile,readsPrimer_dict):
  226. with open(readsIDFile,'r') as rf:
  227. header=rf.readline().strip("\n").split("\t")
  228. ReadsID_index=header.index("ReadsID")
  229. FP_index=header.index("FP_name")
  230. for line in rf:
  231. info = line.strip("\n").split("\t")
  232. readsPrimer_dict[info[ReadsID_index]] = info[FP_index]
  233. class StatDoubleReadsFusion(StatReadsFusion):
  234. def __init__(self):
  235. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  236. def StatPrimer(self,readsIDFile,readsPrimer_dict):
  237. self.StatFPPrimer(readsIDFile,readsPrimer_dict)
  238. def StatDouble(self,infile,readsID_dict,site_dict):
  239. seq_site_dict={}
  240. seq_readsID_dict={}
  241. seq_site_info={}
  242. with open(infile,'r') as inputF:
  243. header=inputF.readline().strip("\n").split("\t")
  244. seq_index=header.index("Seq")
  245. ReadsID_index=header.index("ReadsID")
  246. site_index=header.index("Site")
  247. Site1_Down_index=header.index("Site1_Down")
  248. Site2_Down_index=header.index("Site2_Down")
  249. Overlap_index=header.index("Overlap")
  250. for line in inputF:
  251. info = line.strip("\n").split("\t")
  252. seq = info[seq_index]
  253. ID = info[ReadsID_index]
  254. site = info[site_index]
  255. Site1_Down = info[Site1_Down_index]
  256. Site2_Down = info[Site2_Down_index]
  257. Overlap = info[Overlap_index]
  258. seq_rev_comp="".join([self.BASE_DICT[base] for base in seq][::-1])
  259. if seq == "":
  260. continue
  261. seq_site_info[site]=[Site1_Down,Site2_Down,Overlap]
  262. if seq not in seq_site_dict.keys() and seq_rev_comp not in seq_site_dict.keys():
  263. seq_site_dict[seq]=[site]
  264. seq_readsID_dict[seq]=[ID]
  265. elif seq_rev_comp in seq_site_dict.keys():
  266. site_temp=site.split("-")
  267. site=site_temp[1]+'-'+site_temp[0]
  268. seq_site_dict[seq_rev_comp].append(site)
  269. seq_readsID_dict[seq_rev_comp].append(ID)
  270. elif seq in seq_site_dict.keys():
  271. seq_site_dict[seq].append(site)
  272. seq_readsID_dict[seq].append(ID)
  273. #sort by count
  274. seq_count_dict={}
  275. for seq in seq_site_dict.keys():
  276. count = len(seq_site_dict[seq])
  277. seq_count_dict[seq]=count
  278. seq_count_sort= sorted(seq_count_dict.items(), key=lambda d:d[1], reverse = True)
  279. #combine seq
  280. total_seq_num=len(seq_count_sort)
  281. check_index=[]
  282. site_dict_temp={}
  283. for index in range(total_seq_num):
  284. if index not in check_index:
  285. key_seq=seq_count_sort[index][0]
  286. check_index.append(index)
  287. combine_seq_list=self.GetCombineSeq(key_seq,check_index,seq_count_sort)
  288. site_dict_temp[key_seq]=seq_site_dict[key_seq]
  289. seq_ID_list=seq_readsID_dict[key_seq]
  290. for seq in combine_seq_list:
  291. seq_ID_list.extend(seq_readsID_dict[seq])
  292. site_list=seq_site_dict[seq]
  293. for site_item in site_list:
  294. site_dict_temp[key_seq].append(site_item)
  295. readsID_dict[key_seq]=seq_ID_list
  296. #combine site
  297. for key_seq in site_dict_temp.keys():
  298. site_list=site_dict_temp[key_seq]
  299. temp = self.UniqSiteID(site_list)
  300. site=temp[0][0]
  301. site_rev_temp=site.split("-")
  302. site_rev=site_rev_temp[1]+'-'+site_rev_temp[0]
  303. site_temp=seq_site_info[site] if site in seq_site_info else seq_site_info[site_rev]
  304. site_dict[key_seq]=[site]
  305. for item in site_temp:
  306. site_dict[key_seq].append(item)
  307. class SingleReadsFusion(StatReadsFusion):
  308. def __init__(self,home_dict,info_dict,primer_seq_dict):
  309. self.home_dict=home_dict
  310. self.info_dict=info_dict
  311. self.primer_seq_dict=primer_seq_dict
  312. self.BASE_DICT={'A':'T','T':'A','G':'C','C':'G','N':'N','-':'-'}
  313. def StatPrimer(self,readsIDFile,readsPrimer_dict):
  314. self.StatFPPrimer(readsIDFile,readsPrimer_dict)
  315. def CheckSite(self,site,double_site):
  316. return self.CheckSingleSite(site,double_site)
  317. def CheckFusionSeq(self,new_seq,fusion_seq):
  318. return self.CheckSingleFusionSeq(new_seq,fusion_seq)
  319. def GetSingleCombine(self,info_list):
  320. mismatch_dict={}
  321. match_dict={}
  322. for item in info_list:
  323. mismatch_seq=item[1][::-1]
  324. self.SingleCombine(mismatch_dict,mismatch_seq)
  325. match_seq=item[2]
  326. self.SingleCombine(match_dict,match_seq)
  327. mismatch_seq=self.SingleCombineSeq(mismatch_dict)[::-1]
  328. match_seq=self.SingleCombineSeq(match_dict)
  329. return mismatch_seq,match_seq
  330. def GetSinglePoint(self,readsID,mismatch_seq,match_seq):
  331. readsID_info = readsID.split("_")
  332. if len(readsID_info)<5:
  333. return False,[]
  334. FP_primer = readsID_info[3]
  335. RP_primer = readsID_info[4]
  336. FP_primer_rev = "".join([self.BASE_DICT[base] for base in FP_primer])[::-1]
  337. #RP_primer_rev = "".join([self.BASE_DICT[base] for base in RP_primer])[::-1]
  338. if FP_primer in mismatch_seq or FP_primer_rev in mismatch_seq:
  339. primer_seq = FP_primer
  340. #elif RP_primer in mismatch_seq or RP_primer_rev in mismatch_seq:
  341. # primer_seq = RP_primer
  342. else:
  343. return False,[]
  344. primer_ID = self.GetSinglePrimer(primer_seq,self.primer_seq_dict)
  345. if primer_ID == "":
  346. return False,[]
  347. primer_list = self.info_dict[primer_ID]
  348. results=self.SinglePoint(primer_list,mismatch_seq,match_seq)
  349. final_point_list=[]
  350. if results[0]:
  351. temp=list(results)[1:]
  352. final_point_list.append(temp)
  353. #final_point_list.append(primer_ID)
  354. if primer_ID in self.home_dict.keys():
  355. primer_homo_list = self.home_dict[primer_ID]
  356. for ID in primer_homo_list:
  357. ID_list = self.info_dict[ID]
  358. results=self.SinglePoint(ID_list,mismatch_seq,match_seq)
  359. if results[0]:
  360. temp=list(results)[1:]
  361. final_point_list.append(temp)
  362. #final_point_list.append(ID)
  363. return True,final_point_list