我使用htmlagility来获取网页数据,但我尝试了所有使用www.cloudflare.com保护网页的方法。重定向页面不可能在htmlagility中处理,因为它们不能用meta或js重定向,我想它们会检查你是否已经用cookie检查过了,我用c#模拟失败了。当我得到这个页面时,html代码来自登陆cloadflare页面。
发布于 2021-07-21 21:39:23
现在的答案应该包括Flaresolverr项目。它的目的是使用Docker将其部署为容器,因此您只需传递一个端口,它就会运行。
它不会影响您的项目,因为您不需要导入库。目前支持。我看到的唯一的缺点是,你需要安装Docker才能让它工作。
发布于 2015-09-07 00:57:35
使用WebClient
获取页面的html,
我写了下面的类来处理cookie,
只要在构造函数中传递CookieContainer
实例即可。
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Linq;
using System.Net;
using System.Text;
namespace NitinJS
{
public class SmsWebClient : WebClient
{
public SmsWebClient(CookieContainer container, Dictionary<string, string> Headers)
: this(container)
{
foreach (var keyVal in Headers)
{
this.Headers[keyVal.Key] = keyVal.Value;
}
}
public SmsWebClient(bool flgAddContentType = true)
: this(new CookieContainer(), flgAddContentType)
{
}
public SmsWebClient(CookieContainer container, bool flgAddContentType = true)
{
this.Encoding = Encoding.UTF8;
System.Net.ServicePointManager.Expect100Continue = false;
ServicePointManager.MaxServicePointIdleTime = 2000;
this.container = container;
if (flgAddContentType)
this.Headers["Content-Type"] = "application/json";//"application/x-www-form-urlencoded";
this.Headers["Accept"] = "application/json, text/javascript, */*; q=0.01";// "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
//this.Headers["Accept-Encoding"] = "gzip, deflate";
this.Headers["Accept-Language"] = "en-US,en;q=0.5";
this.Headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; rv:23.0) Gecko/20100101 Firefox/23.0";
this.Headers["X-Requested-With"] = "XMLHttpRequest";
//this.Headers["Connection"] = "keep-alive";
}
private readonly CookieContainer container = new CookieContainer();
protected override WebRequest GetWebRequest(Uri address)
{
WebRequest r = base.GetWebRequest(address);
var request = r as HttpWebRequest;
if (request != null)
{
request.CookieContainer = container;
request.Timeout = 3600000; //20 * 60 * 1000
}
return r;
}
protected override WebResponse GetWebResponse(WebRequest request, IAsyncResult result)
{
WebResponse response = base.GetWebResponse(request, result);
ReadCookies(response);
return response;
}
protected override WebResponse GetWebResponse(WebRequest request)
{
WebResponse response = base.GetWebResponse(request);
ReadCookies(response);
return response;
}
private void ReadCookies(WebResponse r)
{
var response = r as HttpWebResponse;
if (response != null)
{
CookieCollection cookies = response.Cookies;
container.Add(cookies);
}
}
}
}
用法:
CookieContainer cookies = new CookieContainer();
SmsWebClient client = new SmsWebClient(cookies);
string html = client.DownloadString("http://www.google.com");
https://stackoverflow.com/questions/32425973
复制相似问题