当前位置:   article > 正文

C#获取HTML源码

C#获取HTML源码

C#获取HTML源码

2024年03月23日记录

以前从网上找到的那个方法(下面的 HttpService 类)在一些网站上用不了,例如 17K:取回来的要么是乱码,要么是一堆 JS 代码,看起来页面内容是由 JS 在浏览器里重新加载出来的。

  1. using System;
  2. using System.Collections.Generic;
  3. using System.Web;
  4. using System.Net;
  5. using System.IO;
  6. using System.Text;
  7. using System.Net.Security;
  8. using System.Security.Authentication;
  9. using System.Security.Cryptography.X509Certificates;
  namespace Niunan.XiaoShuo.Util
  {
      /// <summary>
      /// Basic HTTP helper responsible for the low-level HTTP communication:
      /// sends blocking GET/POST requests and returns the response body as text.
      /// </summary>
      public class HttpService
      {
          /// <summary>
          /// Server-certificate validation callback that accepts every certificate.
          /// </summary>
          /// <param name="sender">Unused.</param>
          /// <param name="certificate">Unused.</param>
          /// <param name="chain">Unused.</param>
          /// <param name="errors">Unused; policy errors are ignored.</param>
          /// <returns>Always <c>true</c>.</returns>
          public static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)
          {
              // NOTE(review): accepting every certificate disables TLS server authentication.
              // The original author did this because some sites could not be opened otherwise;
              // do not reuse this callback for anything security-sensitive.
              return true;
          }

          /// <summary>
          /// Sends an HTTP POST request and returns the trimmed response body.
          /// </summary>
          /// <param name="xml">Request body text; it is sent UTF-8 encoded.</param>
          /// <param name="url">Target URL.</param>
          /// <param name="isUseCert">Currently ignored: no client certificate is ever attached.</param>
          /// <param name="timeout">Request timeout in seconds.</param>
          /// <param name="contenttype">Content-Type header value, e.g. application/x-www-form-urlencoded or text/xml.</param>
          /// <param name="Authorization">When not empty, sent as the Authorization header (added by the original author for the Ronglian Cloud Communication API).</param>
          /// <returns>The response body decoded as UTF-8, with leading/trailing whitespace removed.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="xml"/> or <paramref name="url"/> is null.</exception>
          /// <exception cref="WebException">The request failed.</exception>
          public static string Post(string xml, string url, bool isUseCert, int timeout, string contenttype = "application/x-www-form-urlencoded", string Authorization = "")
          {
              if (xml == null)
              {
                  throw new ArgumentNullException("xml");
              }
              if (url == null)
              {
                  throw new ArgumentNullException("url");
              }

              HttpWebRequest request = null;
              try
              {
                  request = CreateRequest(url, "POST");
                  request.Timeout = timeout * 1000; // HttpWebRequest.Timeout is in milliseconds
                  if (!string.IsNullOrEmpty(Authorization))
                  {
                      request.Headers.Add(HttpRequestHeader.Authorization, Authorization);
                  }

                  // Describe the payload: type and length.
                  request.ContentType = contenttype;
                  byte[] data = Encoding.UTF8.GetBytes(xml);
                  request.ContentLength = data.Length;

                  // Write the payload; the using block closes the stream even if Write throws.
                  using (Stream reqStream = request.GetRequestStream())
                  {
                      reqStream.Write(data, 0, data.Length);
                  }

                  return ReadResponseText(request);
              }
              finally
              {
                  // Release the connection on every path (exceptions propagate with their original stack trace).
                  if (request != null)
                  {
                      request.Abort();
                  }
              }
          }

          /// <summary>
          /// Sends an HTTP GET request and returns the trimmed response body.
          /// </summary>
          /// <param name="url">URL to request.</param>
          /// <returns>The response body decoded as UTF-8, with leading/trailing whitespace removed.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
          /// <exception cref="WebException">The request failed.</exception>
          public static string Get(string url)
          {
              if (url == null)
              {
                  throw new ArgumentNullException("url");
              }

              HttpWebRequest request = null;
              try
              {
                  request = CreateRequest(url, "GET");
                  return ReadResponseText(request);
              }
              finally
              {
                  if (request != null)
                  {
                      request.Abort();
                  }
              }
          }

          // Applies the settings shared by GET and POST and creates the request object.
          private static HttpWebRequest CreateRequest(string url, string method)
          {
              // Raise the process-wide connection limit.
              ServicePointManager.DefaultConnectionLimit = 200;

              // For https URLs install the accept-all certificate callback.
              // NOTE(review): this assignment is process-wide, not per request.
              if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
              {
                  ServicePointManager.ServerCertificateValidationCallback =
                      new RemoteCertificateValidationCallback(CheckValidationResult);
              }

              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.Method = method;
              // Advertise gzip/deflate and decode them transparently; without this a
              // compressed response body would be read as garbage text.
              request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
              return request;
          }

          // Executes the request and reads the whole body as UTF-8 text, disposing the response and reader.
          private static string ReadResponseText(HttpWebRequest request)
          {
              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
              {
                  // The body is always decoded as UTF-8, whatever charset the server declares.
                  return reader.ReadToEnd().Trim();
              }
          }
      }
  }

弄了一上午,到处问人、到处查资料,发现下面的代码可以用于 17K 网站:

  // Fetches `url` with HttpClient and reads the body as a string.
  // NOTE: in real code create the handler/HttpClient once and reuse them for every
  // request instead of constructing a new pair per call.
  var handler = new HttpClientHandler()
  {
      // The handler advertises these encodings in Accept-Encoding and decodes them transparently.
      AutomaticDecompression = System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate,
      UseCookies = false,
  };
  var httpClient = new HttpClient(handler);
  var requestMessage = new HttpRequestMessage(HttpMethod.Get, url);
  // Deliberately no manual "Accept-Encoding: gzip, deflate, br, zstd" header: the handler
  // above can only decode gzip and deflate, so advertising br/zstd lets the server answer
  // with a body that ReadAsStringAsync would return undecoded (garbled).
  var message = await httpClient.SendAsync(requestMessage);
  var content = await message.Content.ReadAsStringAsync();
  // Author's note: the original version of this snippet (which sent the br/zstd header)
  // fetched the page the first few times and then stopped working, so the author fell
  // back to driving a real browser to get the page source.
  // NOTE(review): the undecodable br/zstd responses are a likely cause — confirm against the site.

后来又找到一个更狠的办法:用 PuppeteerSharp。它相当于用代码控制一个 Chrome 浏览器(可以是它自动下载的无头浏览器,也可以是系统中已安装的 Chrome)打开网页,等页面加载完之后再获取这个网页的源代码。

  1. using PuppeteerSharp; //nuget引入一下
  namespace ConsoleApp2
  {
      internal class Program
      {
          /// <summary>
          /// Opens a page in a headless browser via PuppeteerSharp and prints its HTML to the console.
          /// </summary>
          /// <param name="args">Optional: args[0] is the URL to fetch; defaults to the 17K book page.</param>
          static async Task Main(string[] args)
          {
              string url = (args != null && args.Length > 0)
                  ? args[0]
                  : "https://www.17k.com/book/554720.html";

              // Download the browser build PuppeteerSharp provides. Without this line a local
              // browser has to be specified through LaunchOptions.ExecutablePath instead,
              // e.g. "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe".
              await new BrowserFetcher().DownloadAsync(BrowserTag.Stable);

              var browser = await Puppeteer.LaunchAsync(new LaunchOptions
              {
                  Headless = true
              });
              try
              {
                  var page = await browser.NewPageAsync();
                  await page.GoToAsync(url);
                  // Fixed 2-second pause after navigation before the content is read.
                  await page.WaitForTimeoutAsync(2000);
                  string html = await page.GetContentAsync();
                  Console.WriteLine(html);
              }
              finally
              {
                  // Close the browser even when navigation or reading fails,
                  // so no browser process is left running.
                  await browser.CloseAsync();
              }
          }
      }
  }

另外,Playwright 也能实现用代码操作浏览器打开网页的功能,它主要用于自动化测试。以前记录过这个名字,不过一直没有时间看……主要是“懒”……

Installation | Playwright .NET:https://playwright.dev/dotnet/docs/intro

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家小花儿/article/detail/300588
推荐阅读
相关标签
  

闽ICP备14008679号