加载中…
个人资料
  • 博客等级:
  • 博客积分:
  • 博客访问:
  • 关注人气:
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
正文 字体大小:

对BWA的Mapping输出结果进行统计分析

(2014-11-08 21:11:12)
标签:

股票

分类: 【命令与脚本】
bwa的mapping输出结果*.sai经转化为*.sam后的文件进行统计分析的python脚本:
命令行: $ python /leofs/noncode/dechao/python/bam_stat.py sam_file 2> out_file       #注意: "2"和">"号之间没有空格,如果输入的时候留有空格会使输出结果重定向到屏幕上,而不会重定向入out_file中。【注意:该脚本接受的输入为sam格式的文件!!!!】
bam_stat.py脚本】
#-------------------------------------------------------------------------------
# Name:        caculate the number of reads mapped or unmapped 
# Purpose: 
#
# Author:      XX
#
# Created:     13/08/2014
# Copyright:   (c) lianhe 2014
# Licence:    
#-------------------------------------------------------------------------------
import os,sys
import re
import string
import math
import random

_reExpr1=re.compile(r'\s+')
_reExpr2=re.compile(r'^\s*$')
_splicedHit_pat = re.compile(r'(\d+)[M|N]',re.IGNORECASE)
_monoHit_pat = re.compile(r'^(\d+)M$',re.IGNORECASE)
_insertionHit_pat =re.compile(r'\d+I',re.IGNORECASE)
_deletionHit_pat =re.compile(r'\d+D',re.IGNORECASE)
_softClipHi_pat =re.compile(r'\d+S',re.IGNORECASE)
_hardClipHi_pat =re.compile(r'\d+H',re.IGNORECASE)
_padHit_pat =re.compile(r'\d+P',re.IGNORECASE)
_uniqueHit_pat = re.compile(r'[NI]H:i:1\b')

def stat (self):
  #  total_read=0
    D_total_read = {}
    pcr_duplicate =0
    D_pcr_duplicate= {}
    low_qual =0
    D_low_qual={}
    secondary_hit =0
    D_secondary_hit ={}


    D_total_unmapped={}
    D_total_mapped ={}
# when I try to caculate the number of total_mapped and total_unmapped ,I correct them by using dictionary not by adding mapped_read1 and mapped# _reads2 together

    unmapped_read1=0
    D_unmapped_read1={}
    mapped_read1=0
    D_mapped_read1={}
    reverse_read1=0
    D_reverse_read1={}
    forward_read1=0
    D_forward_read1={}

    unmapped_read2=0
    D_unmapped_read2={}
    mapped_read2=0
    D_mapped_read2={}
    reverse_read2=0
    D_reverse_read2={}
    forward_read2=0
    D_forward_read2={}

    _numSplitHit =0
    D_numSplitHit={}
    _numMonoHit =0
    D_numMonoHit={}
    _numInsertion =0
    D_numInsertion={}
    _numDeletion =0
    D_numDeletion={}

    minus_minus=0
    D_minus_minus={}
    minus_plus =0
    D_minus_plus={}
    plus_minus=0
    D_plus_minus={}
    plus_plus=0
    D_plus_plus={}

    paired=True

    unmap_SE=0
    D_unmap_SE={}
    map_SE=0
    D_map_SE={}
    reverse_SE=0
    D_reverse_SE={}
    forward_SE=0
    D_forward_SE={}

    temp_count_1 = 0
    temp_count_2 = 0
    temp_count_3 = 0

    for line in self:
        line=line.rstrip()
        if line.startswith('@'):continue
        if _reExpr2.match(line):continue


        field = line.split()
        flagCode=string.atoi(field[1])
        if (not D_total_read.has_key(field[0])):
            D_total_read.setdefault(field[0])

        if (flagCode & 0x0400 !=0  and (not D_pcr_duplicate.has_key(field[0]))):
            D_pcr_duplicate.setdefault(field[0])
            continue
        if(flagCode & 0x0200 !=0 and (not D_low_qual.has_key(field[0]))):
            D_low_qual.setdefault(field[0])
            continue
        if(flagCode & 0x0200 !=0 and (not D_secondary_hit.has_key(field[0]))):
            D_secondary_hit.setdefault(field[0])
            continue
        if(len(_splicedHit_pat.findall(field[5]))>1 and (not D_numSplitHit.has_key(field[0]))):
            D_numSplitHit.setdefault(field[0])
        if(len(_splicedHit_pat.findall(field[5]))==1 and (not D_numMonoHit.has_key(field[0]))):
            D_numMonoHit.setdefault(field[0])
        if(_insertionHit_pat.search(field[5]) and (not D_numInsertion.has_key(field[0]))):
            D_numInsertion.setdefault(field[0])
        if(_deletionHit_pat.search(field[5]) and (not D_numDeletion.has_key(field[0]))):
            D_numDeletion.setdefault(field[0])



        if(flagCode & 0x0001 !=0):
            temp_count_3 += 1
            if (flagCode & 0x0040 != 0):
                temp_count_1 += 1
                if (flagCode & 0x0004 != 0 and (not D_unmapped_read1.has_key(field[0]))):
                    D_unmapped_read1.setdefault(field[0])
                    D_total_unmapped.setdefault(field[0])
                else:pass
                if (flagCode & 0x0004 == 0 and (not D_mapped_read1.has_key(field[0]))):
                    D_mapped_read1.setdefault(field[0])
                    D_total_mapped.setdefault(field[0])
                else:pass
                   # print "D_mapped_read1 add something in "+field[0]
             #   else :pass
                if (flagCode & 0x0010 != 0 and not D_reverse_read1.has_key(field[0])):
                    D_reverse_read1.setdefault(field[0])
                  #  print "D_reverse_read1 add"+field[0]
              #  else:pass
                if (flagCode & 0x0010 == 0 and not D_forward_read1.has_key(field[0])):
                    D_forward_read1.setdefault(field[0])
                  #  print "D_forward_read1 add "+field[0]
              #  else:pass

            if(flagCode & 0x0080 != 0):
                temp_count_2 += 1
                if (flagCode & 0x0004 != 0 and not D_unmapped_read2.has_key(field[0])):
                    D_unmapped_read2.setdefault(field[0])
                    D_total_unmapped.setdefault(field[0])
                if (flagCode & 0x0004 == 0 and not D_mapped_read2.has_key(field[0])):
                    D_mapped_read2.setdefault(field[0])
                    D_total_mapped.setdefault(field[0])
                else:pass 
                if (flagCode & 0x0010 != 0 and not D_reverse_read2.has_key(field[0])):
                    D_reverse_read2.setdefault(field[0])
                if (flagCode & 0x0010 == 0 and not D_forward_read2.has_key(field[0])):
                    D_forward_read2.setdefault(field[0])
            if (flagCode & 0x0010 != 0 and flagCode & 0x0020 != 0 and not D_minus_minus.has_key(field[0])):
                D_minus_minus.setdefault(field[0])
            if (flagCode & 0x0010 != 0 and flagCode & 0x0020 == 0 and not D_minus_plus.has_key(field[0])):
                D_minus_plus.setdefault(field[0])
            if (flagCode & 0x0010 == 0 and flagCode & 0x0020 != 0 and not D_plus_minus.has_key(field[0])):
                D_plus_minus.setdefault(field[0])
            if (flagCode & 0x0010 == 0 and flagCode & 0x0020 == 0 and not D_plus_plus.has_key(field[0])):
                D_plus_plus.setdefault(field[0])

        if (flagCode & 0x0001 ==0):
            paired=False
            if (flagCode & 0x0004 != 0 and not D_unmap_SE.has_key(field[0])):
                D_unmap_SE.setdefault(field[0])
            if (flagCode & 0x0004 == 0 and not D_map_SE.has_key(field[0])):
                D_map_SE.setdefault(field[0])
            if (flagCode & 0x0010 != 0 and not D_reverse_SE.has_key(field[0])):
                D_reverse_SE.setdefault(field[0])
            if (flagCode & 0x0010 == 0 and not D_forward_SE.has_key(field[0])):
                D_forward_SE.setdefault(field[0])





    if paired:
        print >>sys.stderr,"\n#=================================================="
        print >>sys.stderr,"#================Report (pair-end)================="
        print >>sys.stderr, "%-25s%d" % ("Total Reads:",len(D_total_read))
        print >>sys.stderr, "%-25s%d" % ("Total Mapped Reads:", (len(D_total_mapped)))
        print >>sys.stderr, "%-25s%d" % ("Total Unmapped Reads:",(len(D_total_unmapped)))

        print >>sys.stderr, "%-25s%d" % ("PCR duplicate:",len(D_pcr_duplicate))
        print >>sys.stderr, "%-25s%d" % ("QC-failed:",len(D_low_qual))
        print >>sys.stderr, "%-25s%d" % ("Not primary mapping:",len(D_secondary_hit))
        print >>sys.stderr, "\n",
        print >>sys.stderr, "%-25s%d" % ("Unmapped Read-1:",len(D_unmapped_read1))
        print >>sys.stderr, "%-25s%d" % ("Mapped Read-1:",len(D_mapped_read1))
        print >>sys.stderr, "%-25s%d" % ("  Forward (+)",len(D_forward_read1))
        print >>sys.stderr, "%-25s%d" % ("  Reverse (-):",len(D_reverse_read1))

        print >>sys.stderr, "\n",
        print >>sys.stderr, "%-25s%d" % ("Unmapped Read-2:",len(D_unmapped_read2))
        print >>sys.stderr, "%-25s%d" % ("Mapped Read-2:",len(D_mapped_read2))
        print >>sys.stderr, "%-25s%d" % ("  Forward (+):",len(D_forward_read2))
        print >>sys.stderr, "%-25s%d" % ("  Reverse (-):",len(D_reverse_read2))

        print >>sys.stderr, "\n",
        print >>sys.stderr, "%-25s%d" % ("Mapped to (+/-):",len(D_plus_minus))
        print >>sys.stderr, "%-25s%d" % ("Mapped to (-/+):",len(D_minus_plus))
        print >>sys.stderr, "%-25s%d" % ("Mapped to (+/+):",len(D_plus_plus))
        print >>sys.stderr, "%-25s%d" % ("Mapped to (-/-):",len(D_minus_minus))
        print >>sys.stderr, "\n",
        print >>sys.stderr, "%-25s%d" % ("Spliced Hits:",len(D_numSplitHit))
        print >>sys.stderr, "%-25s%d" % ("Non-spliced Hits:",len(D_numMonoHit))
        print >>sys.stderr, "%-25s%d" % ("Reads have insertion:",len(D_numInsertion))
        print >>sys.stderr, "%-25s%d" % ("Reads have deletion:",len(D_numDeletion))

    else:
        print >>sys.stderr,"\n#===================================================="
        print >>sys.stderr,"#================Report (single-end)================="
        print >>sys.stderr, "%-25s%d" % ("Total Reads:",len(D_total_read))
        print >>sys.stderr, "%-25s%d" % ("Total Mapped Reads:", len(D_map_SE))
        print >>sys.stderr, "%-25s%d" % ("Total Unmapped Reads:",len(D_unmap_SE))
        print >>sys.stderr, "%-25s%d" % ("PCR duplicate:",len(D_pcr_duplicate))
        print >>sys.stderr, "%-25s%d" % ("QC-failed:",len(D_low_qual))
        print >>sys.stderr, "%-25s%d" % ("Not primary mapping:",len(D_secondary_hit))
        print >>sys.stderr, "%-25s%d" % ("froward (+):",len(D_forward_SE))
        print >>sys.stderr, "%-25s%d" % ("reverse (-):",len(D_reverse_SE))
        print >>sys.stderr, "\n",
        print >>sys.stderr, "%-25s%d" % ("Spliced Hits:",len(D_numSplitHit))
        print >>sys.stderr, "%-25s%d" % ("Non-spliced Hits:",len(D_numMonoHit))
        print >>sys.stderr, "%-25s%d" % ("Reads have insertion:",len(D_numInsertion))
        print >>sys.stderr, "%-25s%d" % ("Reads have deletion:",len(D_numDeletion))
# print "one:"+str(temp_count_1)
#   print "two:"+str(temp_count_2)
#   print "three:"+str(temp_count_3)

filename=sys.argv[1]
stat(open(filename,'r'))

例子:

$ python /leofs/noncode/dechao/python/bam_stat.py ERR1880440.sam 2> ERR1880440.sam.stat

http://image.sciencenet.cn/album/201410/16/211811ucyrwnra0k0hh7tz.jpg

其中,total reads/total mapped reads/total unmapped reads统计过程如下:

total reads = Total Reads * 2 = 27256165 * 2 =54512330    #由于是双端测序的reads,所以结果要乘以2。

Total Mapped Reads即包含有一端mapping上的reads,也含有两端都mapping上的reads,所以不能简单的乘以2来统计total mapped reads。

Total Unmapped Reads为两端都没mapping上的reads,但仅有一端mapping上,另一端没有mapping上的情况没统计,所以不能用于统计total unmapped reads。

Unmapped Read-1/Unmapped Read-2为read-1/read-2分别没有mapping上的数目。

Mapped Read-1/Mapped Read-2read-1/read-2分别mapping上的数目。

因此,根据以上信息可以统计出total mapped reads/total unmapped reads的数目:

total mapped reads = Mapped Read-1 Mapped Read-2 = 24337121 + 24115846 =48452967

total unmapped reads = Unmapped Read-1 Unmapped Read-2 = 2919044 + 3140319 = 6059363

0

阅读 收藏 喜欢 打印举报/Report
  

新浪BLOG意见反馈留言板 欢迎批评指正

新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 产品答疑

新浪公司 版权所有