Primer_Check_big_v6.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. #!/usr/bin/python
  2. # -*- coding:utf-8 -*-
  3. import argparse,os,re,gzip,time,itertools,sys
  4. primer_dict_seq={}
  5. primer_dict_seq_check={}
  6. primer_dict1={}
  7. primer_dict2={}
  8. primer_dict3={}
  9. def get_primer_dict(primerFile):
  10. global primer_dict_seq
  11. global primer_dict1
  12. global primer_dict2
  13. global primer_dict3
  14. with open(primerFile,'r') as pf:
  15. header= pf.readline()
  16. for line in pf:
  17. seq=line.strip("\n").split("\t")[2].upper()
  18. primer_name = line.strip("\n").split("\t")[0]
  19. seq_check=line.strip("\n").split("\t")[1]
  20. seq_len = len(seq)
  21. primer_dict_seq[primer_name] = seq
  22. primer_dict_seq_check[primer_name] = seq_check[-1*(seq_len+8):]
  23. for i in [0,4,8]:
  24. tmp_seq = seq[i:i+4]
  25. if i %4 == 0:
  26. if tmp_seq not in primer_dict1.keys():
  27. primer_dict1[tmp_seq]={}
  28. primer_dict1[tmp_seq][primer_name]=seq
  29. else:
  30. if primer_name not in primer_dict1[tmp_seq].keys():
  31. primer_dict1[tmp_seq][primer_name]=seq
  32. elif i % 4 == 1:
  33. if tmp_seq not in primer_dict2.keys():
  34. previous_seq=seq[i-3:i]
  35. primer_dict2[tmp_seq]={}
  36. primer_dict2[tmp_seq][primer_name]=seq
  37. else:
  38. if primer_name not in primer_dict2[tmp_seq].keys():
  39. primer_dict2[tmp_seq][primer_name]=seq
  40. elif i %4 ==2:
  41. if tmp_seq not in primer_dict3.keys():
  42. primer_dict3[tmp_seq]={}
  43. primer_dict3[tmp_seq][primer_name]=seq
  44. else:
  45. if primer_name not in primer_dict3[tmp_seq].keys():
  46. primer_dict3[tmp_seq][primer_name]=seq
  47. def mis_match(seq1,seq2,num=0):
  48. count = 0
  49. len1=len(seq1)
  50. len2=len(seq2)
  51. if len1 != len2:
  52. return num+1
  53. for i in range(len1):
  54. if seq1[i] != seq2[i] and seq1[i] != "H":
  55. count +=1
  56. if count > num:
  57. return count
  58. return count
  59. def get_location(sequence):
  60. #4-base seq
  61. seq1=sequence[:4]
  62. seq2=sequence[4:8]
  63. seq3=sequence[8:12]
  64. primer_dict1_tmp={}
  65. if seq1 in primer_dict1.keys():
  66. primer_dict1_tmp=primer_dict1[seq1]
  67. primer_dict2_tmp={}
  68. if seq2 in primer_dict2.keys():
  69. primer_dict2_tmp=primer_dict2[seq2]
  70. primer_dict3_tmp={}
  71. for seq3 in primer_dict3.keys():
  72. primer_dict3_tmp=primer_dict3[seq3]
  73. max_len=0
  74. final_primer=""
  75. mismatch=0
  76. #seq1 correct
  77. for primer in primer_dict1_tmp.keys():
  78. primer_seq=primer_dict1_tmp[primer]
  79. candidate_len=len(primer_seq)
  80. if max_len <= candidate_len:
  81. sequence_seq=sequence[:candidate_len]
  82. mis_num=mis_match(sequence_seq,primer_seq,2)
  83. if mis_num <=2:
  84. if max_len < candidate_len:
  85. final_primer=primer
  86. max_len = candidate_len
  87. mismatch = mis_num
  88. elif max_len == candidate_len:
  89. if mismatch>mis_num:
  90. final_primer=primer
  91. max_len = candidate_len
  92. mismatch = mis_num
  93. #seq1 incorrect,seq2 correct
  94. primer_dict2_key=[item for item in primer_dict2_tmp.keys() if item not in primer_dict2_tmp.keys()]
  95. for primer in primer_dict2_key:
  96. primer_seq=primer_dict2_tmp[primer]
  97. candidate_len=len(primer_seq)
  98. if max_len <= candidate_len:
  99. sequence_seq=sequence[:candidate_len]
  100. mis_num=mis_match(sequence_seq,primer_seq,2)
  101. if mis_num <=2:
  102. if max_len < candidate_len:
  103. final_primer=primer
  104. max_len = candidate_len
  105. mismatch = mis_num
  106. elif max_len == candidate_len:
  107. if mismatch>mis_num:
  108. final_primer=primer
  109. max_len = candidate_len
  110. mismatch = mis_num
  111. #seq1 incorrect,seq2 incorrect,seq3 correct
  112. primer_dict3_key=[item for item in primer_dict3_tmp.keys() if (item not in primer_dict2_tmp.keys() and item not in primer_dict1_tmp.keys())]
  113. for primer in primer_dict3_key:
  114. primer_seq=primer_dict3_tmp[primer]
  115. candidate_len=len(primer_seq)
  116. if max_len <= candidate_len:
  117. sequence_seq=sequence[:candidate_len]
  118. mis_num=mis_match(sequence_seq,primer_seq,2)
  119. if mis_num <=2:
  120. if max_len < candidate_len:
  121. final_primer=primer
  122. max_len = candidate_len
  123. mismatch = mis_num
  124. elif max_len == candidate_len:
  125. if mismatch>mis_num:
  126. final_primer=primer
  127. max_len = candidate_len
  128. mismatch = mis_num
  129. return max_len,final_primer,sequence[:max_len]
  130. def reverse(seq):
  131. base_dict={'A':'T','G':'C','T':'A','C':'G','N':'N'}
  132. new_seq=[base_dict[base] for base in seq][::-1]
  133. return "".join(new_seq)
  134. def get_Insert_Size(primer,sequence):
  135. primer_length = len(primer)
  136. sequence_length = len(sequence)
  137. if primer_length == 0:
  138. return 0
  139. for i in range(sequence_length-primer_length):
  140. tmp=sequence[i:i+primer_length]
  141. if mis_match(tmp,primer,1)<=1:
  142. return i
  143. for i in range(primer_length-3,primer_length):
  144. tmp=sequence[:i]
  145. primer_tmp=primer[-1*i:]
  146. if mis_match(tmp,primer,1)<=1:
  147. return 0
  148. return sequence_length
  149. def main(primerFile,FileR1,FileR2,outfile):
  150. get_primer_dict(primerFile)
  151. outf=open(outfile,'w')
  152. outf.write("ReadsID\tRP_Start\tRP_End\tRP_Name\tRP_Seq\tFP_Start\tFP_End\tFP_name\tFP_seq\tDimer\n")
  153. R1outf_insert=open(outfile.replace('.txt','_primer_check_')+"R1.fastq",'w')
  154. R2outf_insert=open(outfile.replace('.txt','_primer_check_')+"R2.fastq",'w')
  155. with gzip.open(FileR1,'r') as R1, gzip.open(FileR2,'r') as R2:
  156. while True:
  157. try:
  158. linesR1 = itertools.islice(R1,4)
  159. IDR1 = next(linesR1).decode('utf-8').strip("\n")
  160. IDR1tmp=IDR1.split(" ")
  161. ID=IDR1tmp[0][1:]
  162. sequenceR1 = next(linesR1).decode('utf-8').strip("\n")
  163. tmp1R1=next(linesR1).decode('utf-8').strip("\n")
  164. tmp2R1=next(linesR1).decode('utf-8').strip("\n")
  165. linesR2 = itertools.islice(R2,4)
  166. IDR2 = next(linesR2).decode('utf-8').strip("\n")
  167. IDR2tmp=IDR2.split(" ")
  168. sequenceR2 = next(linesR2).decode('utf-8').strip("\n")
  169. tmp1R2=next(linesR2).decode('utf-8').strip("\n")
  170. tmp2R2=next(linesR2).decode('utf-8').strip("\n")
  171. UMI = sequenceR2[4:19]
  172. RP_location=get_location(sequenceR1)
  173. FP_location=get_location(sequenceR2[19:])
  174. FP_start=20
  175. RP_start=1
  176. #check R2 primer
  177. if FP_location[1] != "":
  178. sequence_seq=sequenceR2[FP_start-9:FP_start-1+FP_location[0]]
  179. primer_seq=primer_dict_seq_check[FP_location[1]]
  180. if mis_match(primer_seq,sequence_seq,2)>2:
  181. FP_location=0,"",""
  182. out_str = ID+"\t"+str(RP_start)+"\t"+str(RP_start-1+int(RP_location[0]))+"\t"+RP_location[1]+"\t"+RP_location[2]+"\t"+str(FP_start)+"\t"+str(FP_start-1+int(FP_location[0]))+"\t"+FP_location[1]+"\t"+FP_location[2]
  183. if "G" not in UMI:
  184. trim_R1="@"+ID+"_UMI_"+UMI+" "+IDR1tmp[-1]+"\n"+sequenceR1[RP_start-1:]+"\n"+tmp1R1+"\n"+tmp2R1[RP_start-1:]
  185. trim_R2="@"+ID+"_UMI_"+UMI+" "+IDR2tmp[-1]+"\n"+sequenceR2[FP_start-1:]+"\n"+tmp1R2+"\n"+tmp2R2[FP_start-1:]
  186. else:
  187. trim_R1="@"+ID+" "+IDR1tmp[-1]+"\n"+sequenceR1[RP_start-1:]+"\n"+tmp1R1+"\n"+tmp2R1[RP_start-1:]
  188. trim_R2="@"+ID+" "+IDR2tmp[-1]+"\n"+sequenceR2[FP_start-1:]+"\n"+tmp1R2+"\n"+tmp2R2[FP_start-1:]
  189. #no primer
  190. if RP_location[1] == "" and FP_location[1] =="":
  191. R1outf_insert.write(trim_R1+"\n")
  192. R2outf_insert.write(trim_R2+"\n")
  193. outf.write(out_str+"\tno_primer"+"\n")
  194. else:
  195. #two primer
  196. if RP_location[1] !="" and FP_location[1] !="":
  197. rev_seq1=reverse(FP_location[2])
  198. Insert_Size = get_Insert_Size(rev_seq1,sequenceR1)
  199. if Insert_Size > int(RP_location[0]):
  200. RP_primer=RP_location[-1]
  201. FP_primer=FP_location[-1]
  202. trim_R1="@"+ID+"_UMI_"+UMI+"_"+RP_primer+"_"+FP_primer+" "+IDR1tmp[-1]+"\n"+sequenceR1[RP_start-1:]+"\n"+tmp1R1+"\n"+tmp2R1[RP_start-1:]
  203. trim_R2="@"+ID+"_UMI_"+UMI+"_"+RP_primer+"_"+FP_primer+" "+IDR2tmp[-1]+"\n"+sequenceR2[FP_start-1:]+"\n"+tmp1R2+"\n"+tmp2R2[FP_start-1:]
  204. R1outf_insert.write(trim_R1+"\n")
  205. R2outf_insert.write(trim_R2+"\n")
  206. outf.write(out_str+"\tothers"+"\n")
  207. else:
  208. outf.write(out_str+"\tdimer"+"\n")
  209. #only one primer
  210. else:
  211. R1outf_insert.write(trim_R1+"\n")
  212. R2outf_insert.write(trim_R2+"\n")
  213. outf.write(out_str+"\tone_primer"+"\n")
  214. except:
  215. err = sys.exc_info()
  216. for e in err:
  217. print(e)
  218. break
  219. outf.close()
  220. R1outf_insert.close()
  221. R2outf_insert.close()
  222. if __name__ == '__main__':
  223. parser = argparse.ArgumentParser(description='get primer location')
  224. parser.add_argument('-p', required=True, type=str, help="primer file")
  225. parser.add_argument('-R1', required=True, type=str, help="FileR1")
  226. parser.add_argument('-R2', required=True, type=str, help="FileR2")
  227. parser.add_argument('-o', required=True, type=str, help="outfile")
  228. args = parser.parse_args()
  229. main(args.p, args.R1, args.R2, args.o)