加载中…
个人资料
马步水
马步水
  • 博客等级:
  • 博客积分:0
  • 博客访问:845,910
  • 关注人气:166
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
相关博文
推荐博文
谁看过这篇博文
加载中…
正文 字体大小:

【原创】利用python爬虫技术实现基于本福特定律的上市公司报表真假检验方案(二)

(2016-11-14 09:00:05)
标签:

python

爬虫

统计

本福特

财务造假

分类: Python

三、代码实现

#coding=utf-8
import tushare as ts
import talib as ta
import numpy as np
import pandas as pd
import os,time,sys,re,datetime
import csv
import scipy
import re,urllib2
import xlwt
from BeautifulSoup import BeautifulSoup

 

#获取股票列表
#code,代码 name,名称 industry,所属行业 area,地区 pe,市盈率 outstanding,流通股本 totals,总股本(万) totalAssets,总资产(万) liquidAssets,流动资产
# fixedAssets,固定资产 reserved,公积金 reservedPerShare,每股公积金 eps,每股收益 bvps,每股净资产 pb,市净率 timeToMarket,上市日期
def Get_Stock_List():
    """Return the first two rows of the tushare stock-basics table.

    The table comes from ``ts.get_stock_basics()``; callers in this file
    iterate its index as the stock code and read its ``'name'`` column.
    Only two rows are kept -- presumably a small sample for trial runs
    (TODO confirm before widening the slice).
    """
    basics = ts.get_stock_basics()
    sample = basics[:2]
    return sample


#取财务报表每个单元格的首位数字
def Get_First(inputtext):
    """Return the first significant digit of one financial-report cell.

    Args:
        inputtext: Cell text such as ``"1,234.56"``. Thousands separators
            (commas) and the sign are ignored.

    Returns:
        The leading non-zero digit (1-9) of the absolute value;
        ``0`` when the cell is empty, is the ``'--'`` placeholder, or
        parses to zero; ``None`` when the text is not a finite number
        (callers only count 1-9, so both 0 and None mean "skip").
    """
    try:
        if inputtext == '--' or len(inputtext) == 0:
            return 0
        value = abs(float(inputtext.replace(',', '')))
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not numeric text (e.g. a header/label cell).
        return None

    # NaN and infinity have no leading digit.
    if value != value or value == float('inf'):
        return None

    # repr() gives the shortest round-trip form ('0.05', '1e-09', '123.4'),
    # so the first non-zero character is the first significant digit and no
    # scaling of small values is needed.
    for ch in repr(value):
        if ch in '123456789':
            return int(ch)
    return 0


#计算每个首位数字出现的次数,计入统计数组
def Get_Count(numcount,inputtext):
    """Tally the leading digit of one cell into the running counters.

    Args:
        numcount: Mutable sequence of nine counters; slot ``d - 1`` holds
            the number of times digit ``d`` has been seen.
        inputtext: Raw cell text, passed to ``Get_First``.

    Returns:
        The same ``numcount`` object, incremented in place when
        ``Get_First`` yields a digit from 1 to 9 and untouched otherwise.
    """
    digit = Get_First(inputtext)

    # Anything other than 1..9 (0, None) is ignored.
    if digit in (1, 2, 3, 4, 5, 6, 7, 8, 9):
        numcount[digit - 1] += 1

    return numcount

 

#抓取网页数据,统计每位数字频率
def Get_Num(url,code,numcount):
    """Download one report page and tally the leading digits of its cells.

    Args:
        url: Address of the report page to fetch.
        code: Stock code; not used in the body, kept for the existing
            call signature.
        numcount: Nine-slot list of digit counters, updated through
            ``Get_Count`` for every ``<td>`` cell in the report table.

    Returns:
        A ``(numcount, lencell)`` tuple. ``lencell`` is the number of
        ``<td>`` cells in the last table row that had any (the original
        comment describes it as the number of report years); it is 0 when
        the download failed, the table was not found, or no row had cells.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"}
    req = urllib2.Request(url, headers = headers)
    lencell = 0
    try:
        content = urllib2.urlopen(req).read()
    except Exception as exc:
        # Best effort: one unreachable page must not abort the whole run.
        # Always hand back a tuple, because callers unpack the result.
        sys.stderr.write("Get_Num: failed to fetch %s: %s\n" % (url, exc))
        return (numcount, lencell)

    soup = BeautifulSoup(content)
    table = soup.find("table",{"class":"table_bg001 border_box limit_sale scr_table"})
    if table is None:
        # Page layout differs or the report is missing: nothing to count.
        return (numcount, lencell)

    for row in table.findAll("tr"):
        cells = row.findAll("td")
        if len(cells) == 0:
            continue
        lencell = len(cells)
        for cell in cells:
            numcount = Get_Count(numcount, cell.text)

    return (numcount, lencell)


def Benfude(df_Code,count):
    """Write leading-digit counts and frequencies for each stock to the sheet.

    For every stock code in ``df_Code.index`` the balance sheet, income
    statement and cash-flow statement pages are scanned with ``Get_Num``.
    One worksheet row is then filled: code, name, the nine digit counts
    (columns 2-10), their sum (11), the nine frequencies rounded to three
    decimals (12-20) and the cell count reported by ``Get_Num`` (21).

    Relies on the module-level ``ws`` (worksheet) and ``wb`` (workbook).

    Args:
        df_Code: Table indexed by stock code with a ``'name'`` column.
        count: Worksheet row to use for the first stock; later stocks use
            the following rows.
    """
    Number = [1,2,3,4,5,6,7,8,9]

    for Code in df_Code.index:

        print(u"股票代码:" + Code)
        Name = df_Code.loc[Code,'name']
        print(Name)
        ws.write(count, 0, Code)
        ws.write(count, 1, Name)

        # One counter per leading digit: 1,2,3,4,5,6,7,8,9
        NumCount = [0,0,0,0,0,0,0,0,0]
        LenCell = 0
        Urls = (
            # Balance sheet
            'http://quotes.money.163.com/f10/zcfzb_'+Code+'.html?type=year',
            # Income statement
            'http://quotes.money.163.com/f10/lrb_'+Code+'.html?type=year',
            # Cash-flow statement
            'http://quotes.money.163.com/f10/xjllb_'+Code+'.html?type=year',
        )
        for Url in Urls:
            result = Get_Num(Url,Code,NumCount)
            if result is None:
                # Page could not be processed; keep what was counted so far.
                continue
            (NumCount,LenCell) = result

        print(NumCount)
        for i in Number:
            ws.write(count, i+1, NumCount[i-1])
        SumCount = sum(NumCount)
        ws.write(count, 11, SumCount)

        # Frequency of each digit, kept in a separate list so the raw counts
        # are not overwritten; all zeros when nothing was counted.
        if SumCount:
            NumberCount = [round(n/float(SumCount),3) for n in NumCount]
        else:
            NumberCount = [0.0 for n in NumCount]
        for i in Number:
            ws.write(count, i+11, NumberCount[i-1])
        print(NumberCount)
        ws.write(count, 21, LenCell)
        # Saved inside the loop -- presumably so partial results survive a
        # failure on a later stock.
        wb.save('NumCount.xls')

        count = count +1
       

#主函数       
df = Get_Stock_List()
count = 1
if __name__ == '__main__':
    # Create the workbook and its single sheet.
    wb = xlwt.Workbook()
    ws = wb.add_sheet(u'统计表')

    # Header row, left to right: code, name, counts for digits 1-9, their
    # sum, frequencies for digits 1-9, and the year/cell count.
    digit_titles = [u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
    titles = (
        [u'股票代码', u'股票名称']
        + digit_titles
        + [u'求和']
        + digit_titles
        + [u'年数']
    )
    for col, title in enumerate(titles):
        ws.write(0, col, title)

    Benfude(df,count)

0

阅读 评论 收藏 转载 喜欢 打印举报/Report
  • 评论加载中,请稍候...
发评论

    发评论

    以上网友发言只代表其个人观点,不代表新浪网的观点或立场。

      

    新浪BLOG意见反馈留言板 电话:4000520066 提示音后按1键(按当地市话标准计费) 欢迎批评指正

    新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 会员注册 | 产品答疑

    新浪公司 版权所有