赞
踩
登录的处理。因为有些网页数据需要登录后才能提取,这里要使用 ieHTTPHeaders 来提取登录时的提交信息。抓取网页:
// Load the page that holds the data. When a login URL is provided, the
// three-argument Load overload is called with the login URL, the submitted
// user credentials and the data page URL; otherwise the data page is loaded
// directly.
// NOTE(review): the three-argument Load overload is not visible here —
// confirm it exists on the HtmlWeb type used by this project.
HtmlAgilityPack.HtmlDocument htmlDoc = string.IsNullOrEmpty(登录URL)
    ? htmlWeb.Load(获取数据的网页URL)
    : htmlWeb.Load(登录URL, 提交的用户验证信息, 获取数据的网页URL);
// XPath expressions of the data cells to extract for each repeated node.
// NOTE(review): an XPath that starts with "//" is evaluated from the document
// root even when it is run against a row node, so every iteration below would
// select the same cell. If per-row cells are intended, use paths relative to
// the row (for example "./td[1]") — confirm against the target page.
string[] dataPaths = { "//table/tr[1]/td", "//table/tr[2]/td" };

// XPath of the repeated nodes to iterate over, for example: //table/tr
HtmlNodeCollection repeatNodes = htmlDoc.DocumentNode.SelectNodes("//table/tr");

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating.
if (repeatNodes != null)
{
    // Walk every repeated node.
    foreach (HtmlNode node in repeatNodes)
    {
        // Extract each configured data cell from the current node.
        foreach (string dataPath in dataPaths)
        {
            HtmlNode dataNode = node.SelectSingleNode(dataPath);
            if (dataNode != null)
            {
                string text = dataNode.InnerText;
            }
        }
    }
}
如果出现乱码,调整编码集为gb2312或者是utf-8
// Resolve the encoding by its name (strEncode, e.g. "gb2312" or "utf-8")
// and assign it as the loader's default encoding.
System.Text.Encoding pageEncoding = System.Text.Encoding.GetEncoding(strEncode);
htmlWeb.DefaultEncoding = pageEncoding;
完整例子:
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.VisualStudio.TestTools.WebTesting;
using HtmlAgilityPack;

/// <summary>
/// Coded web test that requests http://www.microsoft.com/ and validates the
/// response by parsing its body with HtmlAgilityPack.
/// </summary>
public class WebTest1Coded : WebTest
{
    /// <summary>
    /// Yields the single request of this test, with a response validation
    /// handler attached.
    /// </summary>
    /// <returns>An enumerator over the requests to execute.</returns>
    public override IEnumerator<WebTestRequest> GetRequestEnumerator()
    {
        WebTestRequest request1 = new WebTestRequest("http://www.microsoft.com/");
        request1.ValidateResponse += new EventHandler<ValidationEventArgs>(request1_ValidateResponse);
        yield return request1;
    }

    /// <summary>
    /// Validates that the first list item inside the element with id "Nav"
    /// has the text "Windows". A missing element marks the response invalid
    /// instead of throwing a NullReferenceException.
    /// </summary>
    /// <param name="sender">The object that raised the event.</param>
    /// <param name="e">Validation arguments carrying the response; IsValid and Message are set here.</param>
    void request1_ValidateResponse(object sender, ValidationEventArgs e)
    {
        // Load the response body string as an HtmlAgilityPack.HtmlDocument.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(e.Response.BodyString);

        // Locate the "Nav" element; GetElementbyId returns null when it is absent.
        HtmlNode navNode = doc.GetElementbyId("Nav");
        if (navNode == null)
        {
            e.IsValid = false;
            e.Message = "Element with id \"Nav\" was not found in the response.";
            return;
        }

        // Pick the first <li> element; SelectSingleNode returns null when nothing matches.
        HtmlNode firstNavItemNode = navNode.SelectSingleNode(".//li");
        if (firstNavItemNode == null)
        {
            e.IsValid = false;
            e.Message = "No <li> element was found inside the \"Nav\" element.";
            return;
        }

        // Validate that the first list item in the Nav element says "Windows".
        e.IsValid = firstNavItemNode.InnerText == "Windows";
    }
}
相关软件点击下载
摘自网络:http://www.cnblogs.com/chuncn/archive/2009/09/07/1561564.html