赞
踩
用 Python 处理文本是一项非常常见的需求。本文整理了多种文本提取及 NLP 相关的案例,内容十分用心。
文章很长,高低要忍一下,如果忍不了,那就收藏吧,总会用到的!
# Install the dependency first: pip install PyPDF2
# NOTE: PdfFileReader, numPages, getPage() and extractText() were removed in
# PyPDF2 3.0; PdfReader, len(reader.pages), reader.pages[i] and extract_text()
# are their replacements and are what this snippet uses.
import PyPDF2
from PyPDF2 import PdfReader

# Zero-based index of the page whose text should be printed.
PAGE_INDEX = 200

# Open the PDF in binary mode. The context manager closes the file even if
# reading or text extraction raises.
with open("test.pdf", "rb") as pdf:
    # Create the PDF reader object.
    pdf_reader = PyPDF2.PdfReader(pdf)

    # Report the total number of pages in the PDF file.
    total_pages = len(pdf_reader.pages)
    print("Total number of Pages:", total_pages)

    if PAGE_INDEX < total_pages:
        # Extract and print the text of the requested page.
        page = pdf_reader.pages[PAGE_INDEX]
        print(page.extract_text())
    else:
        # Indexing past the last page would raise, so report it instead.
        print("Page index", PAGE_INDEX, "is out of range for this document.")
# Install the dependency first: pip install python-docx


import docx
from docx.opc.exceptions import PackageNotFoundError


def main():
    """Print the text of every paragraph in ``test.docx``, one per line.

    When the document cannot be opened, an error message is printed
    instead and the function returns without printing any content.
    """
    try:
        # Create the Word document reader object.
        doc = docx.Document('test.docx')
    except (IOError, PackageNotFoundError):
        # python-docx signals a missing or non-docx path with
        # PackageNotFoundError, which is not an IOError subclass, so it has
        # to be listed explicitly for the message below to be shown.
        print('There was an error opening the file!')
        return

    # Join all paragraph texts once, separated by newlines.
    data = '\n'.join(para.text for para in doc.paragraphs)
    print(data)


if __name__ == '__main__':
    main()
# Install the dependency first: pip install bs4

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Send a browser-like User-Agent header with the request.
req = Request('http://www.cmegroup.com/trading/products/#sortField=oi&sortAsc=false&venues=3&page=1&cleared=1&group=1',
              headers={'User-Agent': 'Mozilla/5.0'})

# Download the page body. The context manager closes the connection, and the
# timeout keeps a stalled server from blocking the script indefinitely.
with urlopen(req, timeout=30) as response:
    webpage = response.read()

# Parse the downloaded HTML.
soup = BeautifulSoup(webpage, 'html.parser')

# Re-format the parsed HTML as an indented string.
strhtm = soup.prettify()

# Print the first 500 characters of the formatted HTML.
print(strhtm[:500])

# Extract the title and the og:description meta tag.
# soup.title is None when the page has no <title>; print None in that case
# rather than failing on the attribute access.
title = soup.title
print(title.string if title is not None else None)
print(soup.find('meta', attrs={'property': 'og:description'}))

# Extract the value of every anchor tag.
for x in soup.find_all('a'):
    print(x.string)

# Extract the text of every paragraph tag.
for x in soup.find_all('p'):
    print(x.text)
import requests
import json

# Bound the request so an unresponsive server cannot hang the script forever.
r = requests.get(
    "https://support.oneskyapp.com/hc/en-us/article_attachments/202761727/example_2.json",
    timeout=30,
)
# Stop on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()
res = r.json()

# Extract spec (the extraction step itself is not included in this snippet)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。