当前位置:   article > 正文

python实现word内容替换_python 替换word内容

python 替换word内容

doc文件与docx文件不同

  • 存储方式不同:doc 是二进制存储,docx 是 ZIP 打包文件(解压后可以看到内部的文件结构,主要由 XML 等文件组成);
  • docx易于跨平台,docx更小;
  • docx 处理公式、表格、图片等复杂对象更得心应手,因为这些对象都可以通过 XML 配置来描述和修改。

1. python-docx

#pip install python-docx
import docx
# Create the document object by loading the Word file.
# NOTE: `path` is not defined in this snippet; set it to the .docx file
# location before running.
doc = docx.Document(path)
# Fetch the paragraph list once and reuse it for both loops below.
paragraphs = doc.paragraphs

# Print the text of every paragraph.
for para in paragraphs:
    print(para.text)

# Print the index and the text of every paragraph.
for i, para in enumerate(paragraphs):
    print(i, para.text)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
# Replace a target string in every body paragraph of test.docx, merging the
# text of consecutive runs so that a target spanning several runs can still
# be matched.
# NOTE: this snippet only modifies the in-memory document; it does not call
# doc.save(), so nothing is written back to disk here.
doc = docx.Document('test.docx')
for paragraph in doc.paragraphs:
    # Text accumulated from the runs seen since the last successful match.
    pending = ''
    runs = paragraph.runs
    last_index = len(runs) - 1
    for i, run in enumerate(runs):
        pending += run.text  # merge the run strings
        if '需要替换的字符串' in pending:
            # Match found: this run receives the merged text with the
            # replacement applied, and accumulation starts over.
            run.text = pending.replace('需要替换的字符串', '我是替换后的字符串')
            pending = ''
        elif i == last_index:
            # Last run without a (further) match: flush the accumulated text
            # here so nothing is lost. This must be exclusive with the match
            # branch above, otherwise a match completed on the last run would
            # be overwritten with the (just reset) empty accumulator.
            run.text = pending
        else:
            # No match yet: the text now lives in `pending`, so empty this run.
            run.text = ''
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17

def _mask_sensitive_text(text):
    """Return *text* with sensitive substrings masked, or None if none found.

    Whitespace runs are collapsed to a single space before detection. The
    substrings to mask come from ``re_tmp`` and ``name_identify`` (helpers
    defined elsewhere; presumably they return lists of strings -- confirm).
    Each detected substring is replaced by as many ``X`` as it has characters.
    """
    normalized = re.sub(r"\s+", " ", text)
    sensitive_datas = re_tmp(normalized)
    names = name_identify([normalized])
    if len(sensitive_datas) > 0:
        names = names + sensitive_datas
    if len(names) == 0:
        return None
    for name in names:
        normalized = normalized.replace(name, 'X' * len(name))
    return normalized


def _mask_paragraph_runs(paragraphs):
    """Mask sensitive text run by run in every paragraph of *paragraphs*."""
    for paragraph in paragraphs:
        for run in paragraph.runs:
            masked = _mask_sensitive_text(run.text)
            # Only write back when something was actually masked: assigning
            # run.text replaces the run's content, so untouched runs are left
            # exactly as they were.
            if masked is not None:
                run.text = masked


def docx_inplace_replace(file):
    """Mask sensitive data in a .docx file and save the result as a new file.

    Body paragraphs and the paragraphs of every table cell are processed run
    by run; matching is therefore limited to text contained in a single run.

    Args:
        file: Path of the source .docx file.

    Returns:
        Path of the written file: *file* with '.docx' removed via
        ``rreplace`` and '_.docx' appended.
    """
    file_ = rreplace(file, '.docx', '', 1)
    new_file = file_ + '_.docx'
    doc = docx.Document(file)

    _mask_paragraph_runs(doc.paragraphs)

    # Walk every cell of every top-level table.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                _mask_paragraph_runs(cell.paragraphs)

    # Save the document, then post-process it in place.
    doc.save(new_file)
    # NOTE(review): remove_header_footer is defined elsewhere; presumably it
    # strips headers/footers from the saved file -- confirm.
    remove_header_footer(new_file, new_file)
    return new_file
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40

2. 解压处理xml 数据

import zipfile
import os
import re
import tempfile
import shutil
from functools import reduce
#1. 获取xml 字符串
def getXml(docxFilename):
    """Return the main document XML of a .docx file as a string.

    Args:
        docxFilename: Path of the .docx (zip) file to read.

    Returns:
        The content of the archive member ``word/document.xml`` decoded as
        UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive and its underlying file handle.
    with zipfile.ZipFile(docxFilename) as archive:
        xmlString = archive.read("word/document.xml")
    return xmlString.decode('utf-8')
#2.  TODO 对xml 字符串进行替换处理

#3. 封装回docx 文件
def createNewDocx(originalDocx, xmlContent, newFilename):
    """Write a copy of a .docx file with ``word/document.xml`` replaced.

    The original archive is expanded into a temporary directory, the modified
    XML is written over ``word/document.xml``, and every member listed in the
    original archive is zipped into the new file under its original name.

    Args:
        originalDocx: Path of the source .docx file.
        xmlContent: New content for ``word/document.xml`` as a string; it is
            stored encoded as UTF-8.
        newFilename: Path of the .docx file to create.
    """
    # TemporaryDirectory removes the directory even if a step below raises.
    with tempfile.TemporaryDirectory() as tmpDir:
        # Close the source archive before writing the output so the two never
        # hold the same file open at once.
        with zipfile.ZipFile(originalDocx) as source:
            source.extractall(tmpDir)
            # Members of the original archive, in their original order.
            filenames = source.namelist()

        # Write bytes so the XML is stored exactly as given, without any
        # platform newline translation.
        with open(os.path.join(tmpDir, "word/document.xml"), "wb") as f:
            f.write(xmlContent.encode('utf-8'))

        # Create the new archive and add all the files, compressed.
        with zipfile.ZipFile(newFilename, "w", zipfile.ZIP_DEFLATED) as target:
            for filename in filenames:
                target.write(os.path.join(tmpDir, filename), filename)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/不正经/article/detail/549189
推荐阅读
相关标签
  

闽ICP备14008679号