python利用win32com读取doc和pdf内容，并保存到文件

最新推荐文章于 2024-08-02 17:18:12 发布

原创 · 最新推荐文章于 2024-08-02 17:18:12 发布 · 3k 阅读

9 ·

本内容遵循CC 4.0 BY-SA版权协议

python 专栏收录该内容

55 篇文章

订阅专栏

本文介绍如何使用Python的win32com库将Word（.doc, .docx）及PDF文件转换为文本文件。针对不同文件类型，通过指定路径加载文件，并利用Word.Application对象打开文件再保存为文本格式。

Python3.8

Python 是一种高级、解释型、通用的编程语言，以其简洁易读的语法而闻名，适用于广泛的应用，包括Web开发、数据分析、人工智能和自动化脚本

下面使用 win32com 包（由 pywin32 提供）通过 COM 调用本机安装的 Microsoft Word 进行处理，因此代码只能在已安装 Word 的 Windows 系统上运行。

读取doc文件

# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch


def word2txt(filePath, savePath=''):
    """Convert a Word document (.doc / .docx) to a text file using Word itself.

    The document is opened through COM automation of ``Word.Application`` and
    re-saved with Word's text export, so this only works on a Windows machine
    with Microsoft Word and pywin32 installed.

    Args:
        filePath: Path of the Word document to convert.
        savePath: Directory that receives the generated ``.txt`` file. An
            empty string (the default) means the directory of ``filePath``.

    Returns:
        None. Files whose extension is not ``.doc``/``.docx`` are reported on
        stdout and skipped without starting Word.
    """
    # Word resolves relative paths against its own working directory, not the
    # script's, so only absolute paths are handed over to it.
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    stem, ext = os.path.splitext(filename)
    if ext.lower() not in ('.doc', '.docx'):
        # Without this guard the output name would be empty and Word would be
        # asked to save onto the directory path itself.
        print('格式不正确，仅支持doc/docx格式')
        return

    if savePath == '':
        savePath = dirs
    word2txtPath = os.path.abspath(os.path.join(savePath, stem + '.txt'))
    print(word2txtPath)

    # DispatchEx starts a private Word instance, so the Quit() below cannot
    # close a Word window the user already has open.
    wordappp = wc.DispatchEx('Word.Application')
    try:
        wordappp.Visible = False
        wordappp.DisplayAlerts = 0  # wdAlertsNone: no modal prompts
        # Positional args: FileName, ConfirmConversions=False, ReadOnly=True.
        mytxt = wordappp.Documents.Open(filePath, False, True)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText (plain-text export).
            mytxt.SaveAs(word2txtPath, 4)
        finally:
            mytxt.Close(0)  # 0 = wdDoNotSaveChanges
    finally:
        # Always shut Word down; otherwise each call leaves a hidden
        # WINWORD.EXE process running.
        wordappp.Quit()


if __name__ == '__main__':
    # Demo run: convert the sample document that sits in the current
    # working directory; the .txt file is written next to it.
    filePath = os.path.abspath(os.path.join('.', '专业课.docx'))
    word2txt(filePath)

读取pdf

# coding=utf-8
import os, fnmatch
from win32com import client as wc
from win32com.client import Dispatch

def pdf2txt(filePath, savePath=''):
    """Convert a PDF file to a text file by letting Word open and re-save it.

    The PDF is opened through COM automation of ``Word.Application`` and saved
    with Word's text export, so this only works on a Windows machine with
    pywin32 and a Word version that can open PDF files.

    Args:
        filePath: Path of the PDF file to convert.
        savePath: Directory that receives the generated ``.txt`` file. An
            empty string (the default) means the directory of ``filePath``.

    Returns:
        None. Files whose extension is not ``.pdf`` (case-insensitive) are
        reported on stdout and skipped without starting Word.
    """
    # Word resolves relative paths against its own working directory, not the
    # script's, so only absolute paths are handed over to it.
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)
    print(dirs, '\n', filename)

    stem, ext = os.path.splitext(filename)
    if ext.lower() != '.pdf':
        print('格式不正确，仅支持pdf格式')
        return

    if savePath == '':
        savePath = dirs
    pdf2txtPath = os.path.abspath(os.path.join(savePath, stem + '.txt'))
    print(pdf2txtPath)

    # DispatchEx starts a private Word instance, so the Quit() below cannot
    # close a Word window the user already has open.
    wordappp = wc.DispatchEx('Word.Application')
    try:
        wordappp.Visible = False
        wordappp.DisplayAlerts = 0  # wdAlertsNone: no modal prompts
        # Positional args: FileName, ConfirmConversions=False, ReadOnly=True.
        # ConfirmConversions=False is meant to skip the converter-selection
        # dialog for non-Word input so an unattended run does not block.
        mytxt = wordappp.Documents.Open(filePath, False, True)
        try:
            # 4 is WdSaveFormat.wdFormatDOSText (plain-text export).
            mytxt.SaveAs(pdf2txtPath, 4)
        finally:
            mytxt.Close(0)  # 0 = wdDoNotSaveChanges
    finally:
        # Always shut Word down; otherwise each call leaves a hidden
        # WINWORD.EXE process running.
        wordappp.Quit()


if __name__ == '__main__':
    # Demo run: convert the sample PDF that sits in the current working
    # directory; the .txt file is written next to it.
    filePath = os.path.abspath(os.path.join('.', '论文.pdf'))
    pdf2txt(filePath)