用python对pdf批量重命名_bluemonster

http://blog.sina.com.cn/u/1708890500

首页博文目录关于我

个人资料

微博

加好友发纸条

写留言加关注

博客等级：
博客积分：

博客访问：
关注人气：
获赠金笔：0支
赠出金笔：0支
荣誉徽章：

正文字体大小：大中小

用python对pdf批量重命名

(2011-07-08 20:44:41)

标签：

pdf批量重命名

python

pdfminer

pypdf

it

分类：实验室

这两天全组的人都被shadi抓着去下各种会议论文，有些会议还好，可以直接批量下载。像IEEE和ACM的会议，只能从学校图书馆里进数据库，然后一篇篇地打开、保存，而且保存下来的pdf文件名字都是数字。还有些是从google scholar里搜出来的，名字更加没有规律了。

大神jhonglei同学见状，自告奋勇地上网搜索了一番，给出了一段可以根据pdf文件的title属性对pdf文件进行批量重命名的python代码。

http://s10/middle/65db9984ga78e49bd09a9&690

#encoding:utf-8

'''

需要到：http://pybrary.net/pyPdf/ 下载安装pyPdf库

'''

import os
import operator
from pyPdf import PdfFileWriter, PdfFileReader
def format(filename):
    """Replace characters that are unsafe in a Windows file name with spaces.

    Args:
        filename: Candidate file name (no directory part, no extension).

    Returns:
        ``filename`` with every unsafe character replaced by one space, or
        the literal string ``'None'`` when ``filename`` is not a ``str``.
        Callers compare the result against ``'None'`` to detect a missing
        title.
    """
    if not isinstance(filename, str):
        return 'None'
    # The real backslash was missing from the original list (only the
    # look-alike U+2572 was present), so a title containing "\" produced a
    # path that pointed into a different directory.
    unsafe = ('?', '\\', '╲', '*', '/', ',', '"', '<', '>', '|', ':',
              '“', '”', '‘', '，')
    for ch in unsafe:
        filename = filename.replace(ch, " ")
    return filename
def VisitDir(path):
    """Recursively walk ``path`` and pass every PDF file found to ``rename``.

    Each matching path is printed before it is processed.

    Args:
        path: Directory to scan.
    """
    for entry in os.listdir(path):
        pathname = os.path.join(path, entry)
        if os.path.isdir(pathname):
            # Only descend into real directories; the original recursed into
            # anything that was not a regular file.
            VisitDir(pathname)
        elif os.path.isfile(pathname):
            # Windows file names are case-insensitive, so "X.PDF" has to be
            # treated like "x.pdf".
            if os.path.splitext(pathname)[1].lower() == '.pdf':
                print(pathname)
                rename(pathname)
def rename(pathname):
    """Rename one PDF file after the title stored in its document metadata.

    The new name is the sanitised title plus ".pdf", in the same directory
    as the original file. Encrypted files, files without a usable title and
    files whose rename fails are reported on stdout and left untouched.

    Written for Python 2 with pyPdf. Non-ASCII titles are encoded as GBK,
    which the original author chose for Chinese-locale Windows.

    Args:
        pathname: Path of the PDF file to rename.
    """
    # Read what is needed from the PDF and always release the handle: the
    # original leaked it for encrypted files, and the file cannot be renamed
    # while it is still open.
    stream = open(pathname, "rb")
    try:
        reader = PdfFileReader(stream)
        if reader.isEncrypted:
            print("This file is encrypted!")
            return
        info = reader.getDocumentInfo()
        # getDocumentInfo() may return None when the PDF has no info
        # dictionary; treat that like a missing title instead of crashing.
        raw_title = info.title if info is not None else None
    finally:
        stream.close()

    try:
        title = str(raw_title)
    except UnicodeEncodeError:
        # Python 2: str() uses the ASCII codec and rejects accented or CJK
        # titles. Encode for the file system instead; characters GBK cannot
        # express become "?", which format() blanks out below.
        title = raw_title.encode('GBK', 'replace')

    # Everything up to and including the last path separator.
    folder = pathname[:len(pathname) - len(os.path.basename(pathname))]
    if not isinstance(folder, str):
        # Python 2 unicode path: bring it to the same GBK byte form as the
        # title so the two can be concatenated.
        folder = folder.encode('GBK')
    print("string= %s" % folder)
    print("title = %s" % title)

    title = format(title)
    new = folder + title + ".pdf"
    print("old=%s " % pathname)
    print("new = %s " % new)

    # 'None' marks a missing title; a blank title would rename the file to
    # a bare ".pdf".
    if title == "None" or not title.strip():
        print("The file contian no title attribute!")
        return
    try:
        os.rename(pathname, new)
    except OSError as e:
        # WindowsError is a subclass of OSError, so this still covers it and
        # the name no longer has to exist on other platforms.
        print(e)
if __name__=="__main__":
    import sys
    # Directory to process: the first command-line argument if given,
    # otherwise the original hard-coded folder.
    path = sys.argv[1] if len(sys.argv) > 1 else r"F:\Papers\ICDE'09\Demos"
    VisitDir(path)

但是这段代码还是有问题的：

1、有些pdf文件的title属性为空，或者并不是真正的文件名

http://s4/middle/65db9984ga78e68d3f2d3&690

这样的重命名无意义

2、有些pdf文件的title属性里有非ASCII字符（比如下面报错信息中的é），

http://s2/middle/65db9984ga78e75b37da1&690
这会报错

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 4: ordinal not in range(128)

后来大神又找到了一个更牛逼的库，pdfminer http://www.unixuser.org/~euske/python/pdfminer/

主要的想法就是用pdfminer的pdf2txt模块将pdf文件的第一页转换成文本，然后从中读取第一行作为文件名

得益于pdfminer强大的功能，这样的命名准确率非常高。只要你的pdf文件不是扫描版的（图片格式），都可以正确获取文件名

下面的代码要正常运行，必须与pdfminer中的pdf2txt.py位于同一目录下

#encoding:utf-8

'''
目的：根据文章的标题重命名pdf文件

采用两种方式获取文章的title
方式一：
读取PDF的title属性，根据这个属性，更改此文档的名字！
也就是选中pdf文件右键后点击查看获取的
需要到：http://pybrary.net/pyPdf/
方式二：
根据pdf内容获取title
需要pdfminer
'''

import os
import operator
from pyPdf import PdfFileWriter, PdfFileReader
def format(filename):
    """Blank out characters that must not appear in a Windows file name.

    Args:
        filename: Proposed file name, without directory or extension.

    Returns:
        The name with each forbidden or troublesome character turned into a
        space. Input that is not a ``str`` yields the literal string
        ``'None'``, which callers use as the "no title" marker.
    """
    if not isinstance(filename, str):
        return 'None'
    # ASCII characters first, then the full-width / typographic ones. The
    # plain backslash is included: the original list only had the look-alike
    # U+2572, so a "\" in a title changed the target directory.
    forbidden = (
        '\\', '/', ':', '*', '?', '"', '<', '>', '|', ',',
        '╲', '“', '”', '‘', '，',
    )
    cleaned = filename
    for symbol in forbidden:
        cleaned = cleaned.replace(symbol, " ")
    return cleaned

# Garbled sequences produced by the PDF-to-text conversion, mapped to the
# plain text that should replace them. '\xef\xac\x81' is the UTF-8 byte
# sequence of U+FB01, the "fi" ligature.
key_value = {
    '\xef\xac\x81': 'fi',
}

def VisitDir(path):
    """Walk ``path`` recursively and call ``rename`` on every PDF file.

    The path of each PDF is printed before it is handed to ``rename``.

    Args:
        path: Directory whose contents are scanned.
    """
    for name in os.listdir(path):
        pathname = os.path.join(path, name)
        if os.path.isfile(pathname):
            extension = os.path.splitext(pathname)[1]
            # Compare case-insensitively so files saved as ".PDF" are not
            # silently skipped.
            if extension.lower() == '.pdf':
                print(pathname)
                rename(pathname)
        elif os.path.isdir(pathname):
            # Recurse only into directories; other non-file entries would
            # make os.listdir() fail.
            VisitDir(pathname)
def _first_text_block(text):
    """Return the lines of ``text`` before the first empty line, space-joined."""
    kept = []
    for line in text.splitlines():
        if not line:
            break
        kept.append(line)
    return " ".join(kept).strip()


def rename(pathname):
    """Rename one PDF file after the first text block of its first page.

    The first page is converted to text by running ``pdf2txt.py`` (from
    pdfminer, expected in the current working directory) and the lines up to
    the first empty line are taken as the title. The new name is the
    sanitised title plus ".pdf", in the same directory as the original.
    Encrypted files, files that yield no title and files whose rename fails
    are reported on stdout and left untouched.

    Written for Python 2. Titles are written as GBK, which the original
    author chose for Chinese-locale Windows. Unlike the original, the
    converter output is read from a pipe, so no scratch file ``c.txt`` is
    created.

    Args:
        pathname: Path of the PDF file to rename.
    """
    import subprocess  # local import: only this function starts a process

    # The reader is only needed for the encryption check. The original also
    # read the metadata title here and threw it away, which could crash on
    # PDFs without metadata or with non-ASCII titles.
    stream = open(pathname, "rb")
    try:
        encrypted = PdfFileReader(stream).isEncrypted
    finally:
        stream.close()
    if encrypted:
        print("This file is encrypted!")
        return

    # Everything up to and including the last path separator.
    folder = pathname[:len(pathname) - len(os.path.basename(pathname))]
    if not isinstance(folder, str):
        # Python 2 unicode path: use the same GBK byte form as the title.
        folder = folder.encode('GBK')
    print("string= %s" % folder)

    # An argument list avoids the quoting problems of a shell command built
    # by string concatenation (file names containing quotes, "&", ...).
    try:
        converter = subprocess.Popen(
            ["python", "pdf2txt.py", "-p", "1", pathname],
            stdout=subprocess.PIPE)
        text = converter.communicate()[0]
    except OSError as e:
        print(str(e))
        return

    # The original loop never ended when the output had no empty line (or
    # was empty); the helper simply stops at the end of the text.
    title = _first_text_block(text)
    title = format(title)
    for key, value in key_value.items():
        title = title.replace(key, value)
    # NOTE(review): assumes pdf2txt writes UTF-8 (the key_value table matches
    # UTF-8 byte sequences) - confirm. Re-encode for the file system and drop
    # what GBK cannot express, so no "?" is introduced after sanitising.
    title = title.decode("utf-8", "ignore").encode("GBK", "ignore")

    new = folder + title + ".pdf"
    print("old=%s " % pathname)
    print("new = %s " % new)

    # 'None' marks a missing title; a blank title would rename the file to
    # a bare ".pdf".
    if title == "None" or not title.strip():
        print("The file contian no title attribute!")
        return
    try:
        os.rename(pathname, new)
    except OSError as e:
        # WindowsError is a subclass of OSError, so it is still covered.
        print(str(e))
if __name__ == "__main__":
    # Process the directory the script is started from.
    path = os.curdir
    VisitDir(path)

阅读┊ 收藏 ┊ 喜欢 ▼ ┊打印┊举报/Report

前一篇：C++产生伪随机数

后一篇：[转载]使用ifttt背后的巨大风险

新浪BLOG意见反馈留言板　欢迎批评指正