赞
踩
1、将Word转化成txt
# -*- coding=utf-8 -*-
import sys
import os
from win32com import client
# Batch-convert Word documents to plain text through Word COM automation.
#
# Every file found exactly one directory level below ``baseDir`` is opened in
# a hidden Word instance and saved as ``<n>.txt`` (n = 1, 2, 3, ...) under
# ``outputDir``.  Word documents can also be read directly without converting.
#
# References (from the original author):
#   Docs.py
#   http://new.galalaly.me/2011/09/use-python-to-parse-microsoft-word-documents-using-pywin32-library/
#   https://msdn.microsoft.com/EN-US/library/office/ff837519.aspx
#
# NOTE: as far as observed, the default encoding on Windows is GBK, so file
# names containing Chinese must be handled as decoded (unicode) text.  Passing
# a unicode ``baseDir`` to os.listdir makes it return unicode names (Python 2).

# Value passed as the FileFormat argument of Document.SaveAs; 2 is
# wdFormatText in Word's WdSaveFormat enumeration.
WD_FORMAT_TEXT = 2

word = client.Dispatch('Word.Application')
word.Visible = 0  # keep the Word window hidden while converting

baseDir = "F:\\work\\".decode("utf-8")
outputDir = 'E:\\pythonP\\docs\\'
# print baseDir,type(baseDir)

dirList = []
if os.path.exists(baseDir):
    dirList = os.listdir(baseDir)
else:
    # Nothing to convert; the loop below is skipped and Word is still shut down.
    print("not exist")

fileCount = 1
try:
    for name in dirList:
        baseDir2 = os.path.join(baseDir, name)
        # Only sub-directories are scanned; a plain file sitting directly in
        # baseDir would make os.listdir raise.
        if not os.path.isdir(baseDir2):
            continue
        print(name)
        for dir2 in os.listdir(baseDir2):
            # Build the absolute path explicitly instead of chdir + abspath,
            # so the result does not depend on the process working directory
            # or on a GBK round-trip of the directory name.
            filepath = os.path.join(baseDir2, dir2)
            print(filepath)
            # "Documents" is spelled with its real case: early-bound (makepy /
            # gencache) wrappers are case-sensitive, unlike dynamic dispatch.
            doc = word.Documents.Open(filepath)
            try:
                doc.SaveAs(outputDir + str(fileCount) + '.txt', WD_FORMAT_TEXT)
            finally:
                # Always release the document, even when saving fails.
                doc.Close()
            fileCount += 1
finally:
    # Always shut Word down; otherwise a hidden WINWORD.EXE process is left
    # running whenever a document fails to open or save.
    word.Quit()
print('over')
2、直接从Word中读取数据(.doc),从docx中读取数据可参考Python-docx
# -*- coding=utf-8 -*-
import win32com.client as win32
# Read the text of a .doc file directly through Word COM automation and print
# it paragraph by paragraph.

# EnsureDispatch builds (or reuses) the early-bound wrapper for Word.
word = win32.gencache.EnsureDispatch('Word.Application')
word.Visible = False  # keep the Word window hidden

try:
    # Documents.Open opens the specified document, adds it to the Documents
    # collection and returns a Document object.
    # Every backslash is escaped explicitly; the original literal relied on
    # the invalid escape "\p" being passed through unchanged.
    cont = word.Documents.Open('E:\\pythonP\\test.doc')

    # One way: iterate word by word.
    # for aword in cont.Content.Words:
    #     print type(aword.Text)

    # Another way: iterate paragraph by paragraph.
    for paragraph in cont.Paragraphs:
        print(paragraph.Range.Text)
finally:
    # Always shut Word down, even if opening or reading the document fails.
    # -1 is wdSaveChanges in Word's WdSaveOptions enumeration.
    # NOTE(review): this script only reads; wdDoNotSaveChanges (0) may be the
    # intended value -- confirm before changing.
    word.Quit(-1)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。