当前位置:   article > 正文

利用LangChain实现网页内容爬取并总结_langchain获取网页数据

langchain获取网页数据

背景

利用LangChain中load_summarize_chain实现网页内容爬取并总结。

亮点:

网页内容过长会超过LLM的token限制,因此使用LangChain中的load_summarize_chain(map-reduce方式)来实现长文本总结。

Map-reduce思想:

  • 先对长文本进行切分
  • map阶段-对每段进行summary
  • reduce-对每个map再进行总结
  • 实现长文本内容总结

案例实现:

背景:想查找某个产品的生产厂商,需要先去网页查找相关链接,然后分别总结每个链接的内容,最后对内容进行汇总。以下为代码:

# Use Google to search for the product and return a list of manufacturers.
import os
from autogen import config_list_from_json
import autogen
import requests
from bs4 import BeautifulSoup
import json
# from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate
import openai
from dotenv import load_dotenv
# Get API key
load_dotenv()
# NOTE(review): API keys are hard-coded placeholders ("sk-xxxxx") — load them
# from environment variables / .env instead of committing them to source.
config_list3 = {"model": "gpt-3.5-turbo","api_key": "sk-xxxxx", "cache_seed": 42}
os.environ["OPENAI_API_KEY"] = "sk-xxxxx"
  19. # summary chain:对每个url输出进行总结
  20. def summary(product,content):
  21. llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
  22. text_splitter = RecursiveCharacterTextSplitter(
  23. separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
  24. docs = text_splitter.create_documents([content])
  25. map_prompt = """
  26. content is :{text}
  27. Please summarize the Chinese manufacturers of """+ product +""" ,based on the above content and return them in list format.
  28. The returned results should be in the following format(Return strictly in list format.): ["manu1","manu2","manu3"...]
  29. The manufacturers should be from the Chinese market, and it's preferred to use the full name of the manufacturers rather than abbreviations.
  30. """
  31. combine_prompt = """
  32. content is :{text}
  33. Please summarize the Chinese manufacturers of """+ product +""" ,based on the above content and return them in list format.
  34. The returned results should be in the following format(Return strictly in list format.): ["manu1","manu2","manu3"...]
  35. The manufacturers should be from the Chinese market, and it's preferred to use the full name of the manufacturers rather than abbreviations.
  36. """
  37. map_prompt_template = PromptTemplate(
  38. template=map_prompt, input_variables=["text"])
  39. combine_prompt_template = PromptTemplate(
  40. template=combine_prompt, input_variables=["text"])
  41. summary_chain = load_summarize_chain(
  42. llm=llm,
  43. chain_type='map_reduce',
  44. map_prompt=map_prompt_template,
  45. combine_prompt=combine_prompt_template,
  46. verbose=False
  47. )
  48. output = summary_chain.run(input_documents=docs, )
  49. # print(output)
  50. return output
  51. # print(summary("GPU","GPU的生产厂家有:七彩虹厂商,技嘉厂商."))
  52. # print(type(summary("GPU","GPU的生产厂家有:七彩虹厂商,技嘉厂商.")))
  53. # 抓取内容:
  54. def scrape(product:str,url: str):
  55. # scrape and summary
  56. print("Scraping website...")
  57. # Define the headers for the request
  58. headers = {
  59. 'Cache-Control': 'no-cache',
  60. 'Content-Type': 'application/json',
  61. }
  62. # Define the data to be sent in the request
  63. data = {
  64. "url": url
  65. }
  66. # 转json
  67. data_json = json.dumps(data)
  68. # Send the POST request
  69. response = requests.post(
  70. "https://chrome.browserless.io/content?token=2db344e9-a08a-4179-8f48-195a2f7ea6ee", headers=headers, data=data_json)
  71. # Check the response status code
  72. if response.status_code == 200:
  73. soup = BeautifulSoup(response.content, "html.parser")
  74. text = soup.get_text()
  75. # print("CONTENTTTTTT:", text)
  76. # 不论长短都做总结 -> 生成厂商list
  77. # text超长问题
  78. if len(text) > 8000:
  79. text = text[:8000]
  80. output = summary(product,text)
  81. try:
  82. result_list = eval(output)
  83. except Exception as e:
  84. print("生成结果格式转化为list失败,返回为[]")
  85. result_list = []
  86. return result_list
  87. else:
  88. print(f"HTTP request failed with status code {response.status_code}")
  89. # 查找
  90. def search(query):
  91. url = "https://google.serper.dev/search"
  92. payload = json.dumps({
  93. "q": query
  94. })
  95. headers = {
  96. 'X-API-KEY': 'do not use mine',
  97. 'Content-Type': 'application/json'
  98. }
  99. response = requests.request("POST", url, headers=headers, data=payload)
  100. results = response.json()['organic']
  101. # print(results)
  102. product_manu=[]
  103. for res in results[:10]:
  104. if res["link"]:
  105. res_manu = scrape(query, res["link"])
  106. # 增加判断,如果返回是列表再去扩展
  107. if isinstance(res_manu, list):
  108. product_manu.extend(res_manu)
  109. else:
  110. print("the result of scrape is not list ,pass ")
  111. else:
  112. continue
  113. print("****** product_manu is: \n",product_manu)
  114. return response.json()
  115. search("RTX3050显卡生产厂商")

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/2023面试高手/article/detail/664477
推荐阅读
相关标签
  

闽ICP备14008679号