赞
踩
C#获取HTML源码
2024年03月23日记录
以前从网上找到的那个方法(代码如下)在一些网站上用不了,例如 17K:取出来的要么是乱码,要么是一坨 JS,好像页面内容是用 JS 重新加载出来的。
- using System;
- using System.Collections.Generic;
- using System.Web;
- using System.Net;
- using System.IO;
- using System.Text;
- using System.Net.Security;
- using System.Security.Authentication;
- using System.Security.Cryptography.X509Certificates;
-
namespace Niunan.XiaoShuo.Util
{
    /// <summary>
    /// Low-level HTTP helper that issues blocking GET and POST requests through
    /// <see cref="HttpWebRequest"/> and returns the response body as a string.
    /// </summary>
    /// <remarks>
    /// The response body is always decoded as UTF-8, whatever charset the server
    /// declares, and the request does not enable automatic decompression. A page
    /// served in another encoding or with a compressed body will therefore come
    /// back as unreadable text.
    /// </remarks>
    public class HttpService
    {
        /// <summary>
        /// Server-certificate validation callback that accepts every certificate.
        /// </summary>
        /// <param name="sender">Unused.</param>
        /// <param name="certificate">Unused.</param>
        /// <param name="chain">Unused.</param>
        /// <param name="errors">Unused; policy errors are ignored.</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
        {
            // SECURITY NOTE(review): returning true unconditionally disables TLS server
            // authentication. The original author did this so that sites with
            // certificate problems could still be opened; do not use it for
            // sensitive traffic without replacing this check.
            return true;
        }

        /// <summary>
        /// Sends <paramref name="xml"/> as the UTF-8 encoded body of an HTTP POST
        /// and returns the trimmed response body.
        /// </summary>
        /// <param name="xml">Request body text; encoded as UTF-8 before sending.</param>
        /// <param name="url">Absolute URL to post to.</param>
        /// <param name="isUseCert">
        /// Accepted for interface compatibility. No client certificate is attached;
        /// the flag is currently ignored.
        /// </param>
        /// <param name="timeout">Request timeout in seconds.</param>
        /// <param name="contenttype">
        /// Value of the Content-Type header, e.g. <c>application/x-www-form-urlencoded</c>
        /// or <c>text/xml</c>.
        /// </param>
        /// <param name="Authorization">
        /// Value of the Authorization header; the header is omitted when this is
        /// null or empty.
        /// </param>
        /// <returns>The response body decoded as UTF-8, with surrounding whitespace trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="WebException">The request failed or timed out.</exception>
        public static string Post(string xml, string url, bool isUseCert, int timeout, string contenttype = "application/x-www-form-urlencoded", string Authorization = "")
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = null;
            try
            {
                ConfigureServicePoint(url);

                request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "POST";
                request.Timeout = timeout * 1000; // HttpWebRequest.Timeout is in milliseconds

                if (!string.IsNullOrEmpty(Authorization))
                {
                    request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
                }

                // Content type and length of the POST body.
                request.ContentType = contenttype;
                byte[] data = Encoding.UTF8.GetBytes(xml);
                request.ContentLength = data.Length;

                // The using block closes the request stream even if Write throws.
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                return ReadResponseBody(request);
            }
            finally
            {
                // Exceptions propagate unchanged (no catch), so the caller sees the
                // original stack trace. Abort releases whatever the request still holds.
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Performs an HTTP GET and returns the trimmed response body.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>The response body decoded as UTF-8, with surrounding whitespace trimmed.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string Get(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            HttpWebRequest request = null;
            try
            {
                ConfigureServicePoint(url);

                request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";

                return ReadResponseBody(request);
            }
            finally
            {
                if (request != null)
                {
                    request.Abort();
                }
            }
        }

        /// <summary>
        /// Applies the process-wide <see cref="ServicePointManager"/> settings used by
        /// both request methods: a connection limit of 200 and, for URLs starting
        /// with "https", the accept-all certificate callback.
        /// </summary>
        private static void ConfigureServicePoint(string url)
        {
            ServicePointManager.DefaultConnectionLimit = 200;

            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // This assignment is global: it affects every request in the process,
                // not only the one being built here.
                ServicePointManager.ServerCertificateValidationCallback =
                    new RemoteCertificateValidationCallback(CheckValidationResult);
            }
        }

        /// <summary>
        /// Executes <paramref name="request"/>, reads the whole response body as
        /// UTF-8 and returns it trimmed. The response, its stream and the reader are
        /// disposed before returning, including when reading throws.
        /// </summary>
        private static string ReadResponseBody(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
            {
                return reader.ReadToEnd().Trim();
            }
        }
    }
}

弄了一上午,到处问人、到处查,发现下面的代码可以用于 17K 网站:
// Fetch the page at `url` with HttpClient and let the handler decompress the body.
// NOTE: a real application should reuse one HttpClient instance rather than
// creating one per request; a fresh client is created here to keep the snippet
// self-contained.
var handler = new HttpClientHandler()
{
    AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
    UseCookies = false,
};

string content;
// Disposing the client also disposes the handler passed to its constructor.
using (var httpClient = new HttpClient(handler))
using (var requestMessage = new HttpRequestMessage(HttpMethod.Get, url))
{
    // Advertise only the encodings enabled in AutomaticDecompression above.
    // Offering "br" or "zstd" lets the server answer with a body the handler
    // cannot decompress, which then reads back as garbled text.
    requestMessage.Headers.Add("Accept-Encoding", "gzip, deflate");

    using (var message = await httpClient.SendAsync(requestMessage))
    {
        content = await message.Content.ReadAsStringAsync();
    }
}
// Author's note: this code fetched the page successfully the first few times and
// then stopped working, so the browser-automation approach below was used instead.
后来又来了个更狠的,用PuppeteerSharp, 相当于用代码来控制让系统中的chrome浏览器打开一个网页,然后再来获取这个网页的源代码
- using PuppeteerSharp; //nuget引入一下
-
namespace ConsoleApp2
{
    /// <summary>
    /// Console demo that opens a web page in a headless browser through
    /// PuppeteerSharp and prints the page's HTML to standard output.
    /// </summary>
    internal class Program
    {
        /// <summary>Page fetched when no URL is passed on the command line.</summary>
        private const string DefaultUrl = "https://www.17k.com/book/554720.html";

        /// <summary>
        /// Launches the browser, loads the target page, waits briefly and writes
        /// the page content to the console.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is the URL to fetch; <see cref="DefaultUrl"/> is
        /// used when it is absent.
        /// </param>
        static async Task Main(string[] args)
        {
            string url = (args != null && args.Length > 0) ? args[0] : DefaultUrl;

            // Per the original author's note: this downloads the headless browser
            // supplied through PuppeteerSharp. Without this line, a local browser
            // must be specified via ExecutablePath below.
            await new BrowserFetcher().DownloadAsync(BrowserTag.Stable);

            var browser = await Puppeteer.LaunchAsync(new LaunchOptions
            {
                //ExecutablePath= "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                Headless = true
            });

            try
            {
                var page = await browser.NewPageAsync();
                await page.GoToAsync(url);

                // Fixed 2-second pause; presumably gives the page's scripts time to
                // finish rendering before the HTML is read — TODO confirm.
                await page.WaitForTimeoutAsync(2000);

                string html = await page.GetContentAsync();
                Console.WriteLine(html);
            }
            finally
            {
                // Close the browser even when navigation or reading fails, so the
                // browser process is not left running.
                await browser.CloseAsync();
            }
        }
    }
}

然后还有一个playwright的也能实现操作浏览器打开网页的功能,用于自动化测试的,以前有记录过这个名字,不过一直没有时间看。。。主要是“懒”。。。。。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。