下面使用 win32com 包(需要 Windows 系统并已安装 Microsoft Word)进行处理。
读取 doc/docx 文件并转换为 txt:
# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def word2txt(filePath, savePath=''):
    """Convert a Word document (.doc / .docx) to a text file through Word COM automation.

    Args:
        filePath: Path of the source document. It is made absolute before
            being handed to Word.
        savePath: Directory that receives the generated ``.txt`` file. The
            empty string (default) means "next to the source document".

    Returns:
        None. The text file is written as a side effect. If the file name
        does not end in ``.doc`` or ``.docx`` a message is printed and
        nothing is converted.
    """
    # Word runs as a separate process, so a relative path would not be
    # resolved against this script's working directory.
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    # Check the longer suffix first so ".docx" is never cut as ".doc".
    lowered = filename.lower()
    for suffix in ('.docx', '.doc'):
        if lowered.endswith(suffix):
            new_name = filename[:-len(suffix)] + '.txt'
            break
    else:
        # Without this guard the target name would be empty and SaveAs
        # would be pointed at a directory.
        print('格式不正确,仅支持doc/docx格式')
        return

    if savePath == '':
        savePath = dirs
    word2txtPath = os.path.abspath(os.path.join(savePath, new_name))
    print(word2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 is wdFormatDOSText in Word's WdSaveFormat enumeration
            # (plain text output).
            mytxt.SaveAs(word2txtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Do not leave a hidden WINWORD.EXE behind, but only quit when no
        # other document is open in case an existing Word session was reused.
        if wordappp.Documents.Count == 0:
            wordappp.Quit()
if __name__ == '__main__':
    # Demo run: convert the sample .docx that sits in the current working
    # directory; the resulting .txt is written next to it.
    filePath = os.path.abspath(r'./专业课.docx')
    word2txt(filePath)
读取 pdf 文件并转换为 txt(需要 Word 2013 及以上版本才能打开 PDF):
# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch
def pdf2txt(filePath, savePath=''):
    """Convert a PDF file to a text file by opening it in Word via COM automation.

    Args:
        filePath: Path of the source PDF. It is made absolute before being
            handed to Word.
        savePath: Directory that receives the generated ``.txt`` file. The
            empty string (default) means "next to the source PDF".

    Returns:
        None. The text file is written as a side effect. If the file name
        does not end in ``.pdf`` (any letter case) a message is printed and
        nothing is converted.
    """
    # Word runs as a separate process, so a relative path would not be
    # resolved against this script's working directory.
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    if not filename.lower().endswith('.pdf'):
        print('格式不正确,仅支持pdf格式')
        return
    new_name = filename[:-4] + '.txt'

    if savePath == '':
        savePath = dirs
    pdf2txtPath = os.path.abspath(os.path.join(savePath, new_name))
    print(pdf2txtPath)

    wordappp = wc.Dispatch('Word.Application')
    try:
        mytxt = wordappp.Documents.Open(filePath)
        try:
            # 4 is wdFormatDOSText in Word's WdSaveFormat enumeration
            # (plain text output).
            mytxt.SaveAs(pdf2txtPath, 4)
        finally:
            mytxt.Close()
    finally:
        # Do not leave a hidden WINWORD.EXE behind, but only quit when no
        # other document is open in case an existing Word session was reused.
        if wordappp.Documents.Count == 0:
            wordappp.Quit()
if __name__ == '__main__':
    # Demo run: convert the sample PDF that sits in the current working
    # directory; the resulting .txt is written next to it.
    filePath = os.path.abspath(r'./论文.pdf')
    pdf2txt(filePath)
本文介绍如何使用Python的win32com库将Word(.doc, .docx)及PDF文件转换为文本文件。针对不同文件类型,通过指定路径加载文件,并利用Word.Application对象打开文件再保存为文本格式。

3152

被折叠的 条评论
为什么被折叠?



