#-------------------------------------------------------------------------------
# Name:
caculate the number of reads
mapped or unmapped
# Purpose:
#
# Author:
XX
#
# Created:
13/08/2014
# Copyright: (c) lianhe
2014
# Licence:
#-------------------------------------------------------------------------------
import os,sys
import re
import string
import math
import random
_reExpr1=re.compile(r'\s+')
_reExpr2=re.compile(r'^\s*$')
_splicedHit_pat =
re.compile(r'(\d+)[M|N]',re.IGNORECASE)
_monoHit_pat =
re.compile(r'^(\d+)M$',re.IGNORECASE)
_insertionHit_pat
=re.compile(r'\d+I',re.IGNORECASE)
_deletionHit_pat
=re.compile(r'\d+D',re.IGNORECASE)
_softClipHi_pat
=re.compile(r'\d+S',re.IGNORECASE)
_hardClipHi_pat
=re.compile(r'\d+H',re.IGNORECASE)
_padHit_pat
=re.compile(r'\d+P',re.IGNORECASE)
_uniqueHit_pat =
re.compile(r'[NI]H:i:1\b')
def stat (self):
#
total_read=0
D_total_read =
{}
pcr_duplicate
=0
D_pcr_duplicate=
{}
low_qual
=0
D_low_qual={}
secondary_hit
=0
D_secondary_hit
={}
D_total_unmapped={}
D_total_mapped
={}
# when I try to caculate the number of total_mapped and
total_unmapped ,I correct them by using dictionary not by adding
mapped_read1 and mapped# _reads2 together
unmapped_read1=0
D_unmapped_read1={}
mapped_read1=0
D_mapped_read1={}
reverse_read1=0
D_reverse_read1={}
forward_read1=0
D_forward_read1={}
unmapped_read2=0
D_unmapped_read2={}
mapped_read2=0
D_mapped_read2={}
reverse_read2=0
D_reverse_read2={}
forward_read2=0
D_forward_read2={}
_numSplitHit
=0
D_numSplitHit={}
_numMonoHit
=0
D_numMonoHit={}
_numInsertion
=0
D_numInsertion={}
_numDeletion
=0
D_numDeletion={}
minus_minus=0
D_minus_minus={}
minus_plus
=0
D_minus_plus={}
plus_minus=0
D_plus_minus={}
plus_plus=0
D_plus_plus={}
paired=True
unmap_SE=0
D_unmap_SE={}
map_SE=0
D_map_SE={}
reverse_SE=0
D_reverse_SE={}
forward_SE=0
D_forward_SE={}
temp_count_1 =
0
temp_count_2 =
0
temp_count_3 =
0
for line in
self:
line=line.rstrip()
if
line.startswith('@'):continue
if
_reExpr2.match(line):continue
field = line.split()
flagCode=string.atoi(field[1])
if (not
D_total_read.has_key(field[0])):
D_total_read.setdefault(field[0])
if (flagCode & 0x0400 !=0
and (not
D_pcr_duplicate.has_key(field[0]))):
D_pcr_duplicate.setdefault(field[0])
continue
if(flagCode & 0x0200 !=0 and (not
D_low_qual.has_key(field[0]))):
D_low_qual.setdefault(field[0])
continue
if(flagCode & 0x0200 !=0 and (not
D_secondary_hit.has_key(field[0]))):
D_secondary_hit.setdefault(field[0])
continue
if(len(_splicedHit_pat.findall(field[5]))>1
and (not D_numSplitHit.has_key(field[0]))):
D_numSplitHit.setdefault(field[0])
if(len(_splicedHit_pat.findall(field[5]))==1 and
(not D_numMonoHit.has_key(field[0]))):
D_numMonoHit.setdefault(field[0])
if(_insertionHit_pat.search(field[5]) and (not
D_numInsertion.has_key(field[0]))):
D_numInsertion.setdefault(field[0])
if(_deletionHit_pat.search(field[5]) and (not
D_numDeletion.has_key(field[0]))):
D_numDeletion.setdefault(field[0])
if(flagCode & 0x0001
!=0):
temp_count_3 += 1
if
(flagCode & 0x0040 != 0):
temp_count_1 +=
1
if (flagCode & 0x0004 !=
0 and (not
D_unmapped_read1.has_key(field[0]))):
D_unmapped_read1.setdefault(field[0])
D_total_unmapped.setdefault(field[0])
else:pass
if (flagCode & 0x0004 ==
0 and (not D_mapped_read1.has_key(field[0]))):
D_mapped_read1.setdefault(field[0])
D_total_mapped.setdefault(field[0])
else:pass
# print "D_mapped_read1 add something in
"+field[0]
# else
:pass
if (flagCode & 0x0010 !=
0 and not D_reverse_read1.has_key(field[0])):
D_reverse_read1.setdefault(field[0])
#
print "D_reverse_read1
add"+field[0]
#
else:pass
if (flagCode & 0x0010 ==
0 and not D_forward_read1.has_key(field[0])):
D_forward_read1.setdefault(field[0])
#
print "D_forward_read1 add
"+field[0]
#
else:pass
if(flagCode & 0x0080 != 0):
temp_count_2 +=
1
if (flagCode & 0x0004 !=
0 and not D_unmapped_read2.has_key(field[0])):
D_unmapped_read2.setdefault(field[0])
D_total_unmapped.setdefault(field[0])
if (flagCode & 0x0004 ==
0 and not D_mapped_read2.has_key(field[0])):
D_mapped_read2.setdefault(field[0])
D_total_mapped.setdefault(field[0])
else:pass
if (flagCode & 0x0010 !=
0 and not D_reverse_read2.has_key(field[0])):
D_reverse_read2.setdefault(field[0])
if (flagCode & 0x0010 ==
0 and not D_forward_read2.has_key(field[0])):
D_forward_read2.setdefault(field[0])
if
(flagCode & 0x0010 != 0 and flagCode & 0x0020 != 0 and not
D_minus_minus.has_key(field[0])):
D_minus_minus.setdefault(field[0])
if
(flagCode & 0x0010 != 0 and flagCode & 0x0020 == 0 and not
D_minus_plus.has_key(field[0])):
D_minus_plus.setdefault(field[0])
if
(flagCode & 0x0010 == 0 and flagCode & 0x0020 != 0 and not
D_plus_minus.has_key(field[0])):
D_plus_minus.setdefault(field[0])
if
(flagCode & 0x0010 == 0 and flagCode & 0x0020 == 0 and not
D_plus_plus.has_key(field[0])):
D_plus_plus.setdefault(field[0])
if (flagCode & 0x0001
==0):
paired=False
if
(flagCode & 0x0004 != 0 and not
D_unmap_SE.has_key(field[0])):
D_unmap_SE.setdefault(field[0])
if
(flagCode & 0x0004 == 0 and not
D_map_SE.has_key(field[0])):
D_map_SE.setdefault(field[0])
if
(flagCode & 0x0010 != 0 and not
D_reverse_SE.has_key(field[0])):
D_reverse_SE.setdefault(field[0])
if
(flagCode & 0x0010 == 0 and not
D_forward_SE.has_key(field[0])):
D_forward_SE.setdefault(field[0])
if
paired:
print
>>sys.stderr,"\n#=================================================="
print
>>sys.stderr,"#================Report
(pair-end)================="
print >>sys.stderr, "%-25s%d" % ("Total
Reads:",len(D_total_read))
print >>sys.stderr, "%-25s%d" % ("Total
Mapped Reads:", (len(D_total_mapped)))
print >>sys.stderr, "%-25s%d" % ("Total
Unmapped Reads:",(len(D_total_unmapped)))
print >>sys.stderr, "%-25s%d" % ("PCR
duplicate:",len(D_pcr_duplicate))
print >>sys.stderr, "%-25s%d" %
("QC-failed:",len(D_low_qual))
print >>sys.stderr, "%-25s%d" % ("Not
primary mapping:",len(D_secondary_hit))
print >>sys.stderr,
"\n",
print >>sys.stderr, "%-25s%d" % ("Unmapped
Read-1:",len(D_unmapped_read1))
print >>sys.stderr, "%-25s%d" % ("Mapped
Read-1:",len(D_mapped_read1))
print >>sys.stderr, "%-25s%d" % ("
Forward
(+)",len(D_forward_read1))
print >>sys.stderr, "%-25s%d" % ("
Reverse
(-):",len(D_reverse_read1))
print >>sys.stderr,
"\n",
print >>sys.stderr, "%-25s%d" % ("Unmapped
Read-2:",len(D_unmapped_read2))
print >>sys.stderr, "%-25s%d" % ("Mapped
Read-2:",len(D_mapped_read2))
print >>sys.stderr, "%-25s%d" % ("
Forward
(+):",len(D_forward_read2))
print >>sys.stderr, "%-25s%d" % ("
Reverse
(-):",len(D_reverse_read2))
print >>sys.stderr,
"\n",
print >>sys.stderr, "%-25s%d" % ("Mapped
to (+/-):",len(D_plus_minus))
print >>sys.stderr, "%-25s%d" % ("Mapped
to (-/+):",len(D_minus_plus))
print >>sys.stderr, "%-25s%d" % ("Mapped
to (+/+):",len(D_plus_plus))
print >>sys.stderr, "%-25s%d" % ("Mapped
to (-/-):",len(D_minus_minus))
print >>sys.stderr,
"\n",
print >>sys.stderr, "%-25s%d" % ("Spliced
Hits:",len(D_numSplitHit))
print >>sys.stderr, "%-25s%d" %
("Non-spliced Hits:",len(D_numMonoHit))
print >>sys.stderr, "%-25s%d" % ("Reads
have insertion:",len(D_numInsertion))
print >>sys.stderr, "%-25s%d" % ("Reads
have deletion:",len(D_numDeletion))
else:
print
>>sys.stderr,"\n#===================================================="
print
>>sys.stderr,"#================Report
(single-end)================="
print >>sys.stderr, "%-25s%d" % ("Total
Reads:",len(D_total_read))
print >>sys.stderr, "%-25s%d" % ("Total
Mapped Reads:", len(D_map_SE))
print >>sys.stderr, "%-25s%d" % ("Total
Unmapped Reads:",len(D_unmap_SE))
print >>sys.stderr, "%-25s%d" % ("PCR
duplicate:",len(D_pcr_duplicate))
print >>sys.stderr, "%-25s%d" %
("QC-failed:",len(D_low_qual))
print >>sys.stderr, "%-25s%d" % ("Not
primary mapping:",len(D_secondary_hit))
print >>sys.stderr, "%-25s%d" % ("froward
(+):",len(D_forward_SE))
print >>sys.stderr, "%-25s%d" % ("reverse
(-):",len(D_reverse_SE))
print >>sys.stderr,
"\n",
print >>sys.stderr, "%-25s%d" % ("Spliced
Hits:",len(D_numSplitHit))
print >>sys.stderr, "%-25s%d" %
("Non-spliced Hits:",len(D_numMonoHit))
print >>sys.stderr, "%-25s%d" % ("Reads
have insertion:",len(D_numInsertion))
print >>sys.stderr, "%-25s%d" % ("Reads
have deletion:",len(D_numDeletion))
# print "one:"+str(temp_count_1)
# print
"two:"+str(temp_count_2)
# print
"three:"+str(temp_count_3)
filename=sys.argv[1]
stat(open(filename,'r'))
例子:
$ python /leofs/noncode/dechao/python/bam_stat.py ERR1880440.sam
2> ERR1880440.sam.stat
http://image.sciencenet.cn/album/201410/16/211811ucyrwnra0k0hh7tz.jpg
其中,total reads/total mapped reads/total unmapped
reads统计过程如下:
total reads = Total
Reads * 2 = 27256165 * 2 =54512330
#由于是双端测序的reads,所以结果要乘以2。
Total Mapped Reads即包含有一端mapping上的reads,也含有两端都mapping上的reads,所以不能简单的乘以2来统计total
mapped reads。
Total Unmapped Reads为两端都没mapping上的reads,但仅有一端mapping上,另一端没有mapping上的情况没统计,所以不能用于统计total
unmapped reads。
Unmapped Read-1/Unmapped
Read-2为read-1/read-2分别没有mapping上的数目。
Mapped Read-1/Mapped
Read-2为read-1/read-2分别mapping上的数目。
因此,根据以上信息可以统计出total
mapped reads/total unmapped reads的数目:
total mapped reads = Mapped
Read-1 + Mapped
Read-2 =
24337121 + 24115846 =48452967
total unmapped reads = Unmapped
Read-1 + Unmapped
Read-2 =
2919044 + 3140319 =
6059363