HtmlAgility - 如何处理中间的HTML?

内容来源于 Stack Overflow,并遵循CC BY-SA 3.0许可协议进行翻译与使用

  • 回答 (2)
  • 关注 (0)
  • 查看 (307)

我有一个网站“example.com”,并用下面的代码下载它的HTML:

// Pass html content of example.com to a string
string htmlCode = client.DownloadString("http://example.com");

HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(WebUtility.HtmlDecode(htmlCode));

对于某些网站,我能直接得到想要的HTML。但有些站点返回的HTML只包含一个表单,或者只有一个空的body和某种脚本。

返回脚本的情况示例:

<!DOCTYPE html>
<html><head>
<meta http-equiv="Pragma" content="no-cache">
<meta http-equiv="Expires" content="-1">
<meta http-equiv="CacheControl" content="no-cache">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="shortcut icon" href="data:;base64,iVBORw0KGgo=">

<script>

(function(){
    var securemsg;
    var dosl7_common;

window["bobcmn"] = "111110111110102000000022000000052000000002a4b927ad200000096300000000300000000300000006/TSPD/300000008TSPD_101300000005https200000000200000000";

window.hQv=!!window.hQv;try{(function(){try{var __,i_,j_=1,O_=1,z_=1,s_=1,S_=1,Ji=1,li=1,oi=1,si=1;for(var Si=0;Si<i_;++Si)j_+=2,O_+=2,z_+=2,s_+=2,S_+=2,Ji+=2,li+=2,oi+=2,si+=3;__=j_+O_+z_+s_+S_+Ji+li+oi+si;window.SL===__&&(window.SL=++__)}catch(JI){window.SL=__}var OI=!0;function SI(_){_&&(OI=!1,document.cookie="brav=ad");return OI}function _j(){}SI(window[_j.name]===_j);SI("function"!==typeof ie9rgb4);SI(/\x3c/.test(function(){return"\x3c"})&!/x3d/.test(function(){return"'x3'+'d';"}));
var ij=window.attachEvent||/mobi/i.test(window["\x6e\x61vi\x67a\x74\x6f\x72"]["\x75\x73e\x72A\x67\x65\x6et"]),Ij=+new Date+6E5,Jj,lj,Oj=setTimeout,zj=ij?3E4:6E3;function Zj(){if(!document.querySelector)return!0;var _=+new Date,l=_>Ij;if(l)return SI(!1);l=lj&&Jj+zj<_;l=SI(l);Jj=_;lj||(lj=!0,Oj(function(){lj=!1},1));return l}Zj();var sj=[17795081,27611931586,1558153217];
function Sj(_){_="string"===typeof _?_:_.toString(36);var l=window[_];if(!l.toString)return;var O=""+l;window[_]=function(_,O){lj=!1;return l(_,O)};window[_].toString=function(){return O}}for(var si=0;si<sj.length;++si)Sj(sj[si]);SI(!1!==window.hQv);
(function iJ(){if(!Zj())return;var l=!1;function O(l){for(var z=0;l--;)z+=Z(document.documentElement,null);return z}function Z(l,z){var O="vi";z=z||new s;return o_(l,function(l){l.setAttribute("data-"+O,z._s());return Z(l,z)},null)}function s(){this.Lz=1;this.Jz=0;this.il=this.Lz;this.c=null;this._s=function(){this.c=this.Jz+this.il;if(!isFinite(this.c))return this.reset(),this._s();this.Jz=this.il;this.il=this.c;this.c=null;return this.il};this.reset=function(){this.Lz++;this.Jz=0;this.il=this.Lz}}
var S=!1;function z(l,z){if(!Zj())return;var O=document.createElement(l);z=z||document.body;z.appendChild(O);O&&O.style&&(O.style.display="none");Zj()}function J_(z,O){if(!Zj())return;O=O||z;var Z="|";function s(l){l=l.split(Z);var z=[];for(var O=0;O<l.length;++O){var S="",I_=l[O].split(",");for(var J_=0;J_<I_.length;++J_)S+=I_[J_][J_];z.push(S)}return z}var J_=0,o_="datalist,details,embed,figure,hrimg,strong,article,formaddress|audio,blockquote,area,source,input|canvas,form,link,tbase,option,details,article";
o_.split(Z);o_=s(o_);o_=new RegExp(o_.join(Z),"g");while(o_.exec(z))o_=new RegExp((""+new Date)[8],"g"),l&&(S=Zj()),++J_;return Zj()?O(J_&&1):void 0}function o_(l,O,Z){if(!Zj())return;(Z=Z||S)&&z("div",l);l=l.children;var s=0;for(var J_ in l){Z=l[J_];try{Z instanceof HTMLElement&&(O(Z),++s)}catch(o_){}}return Zj()?s:void 0}J_(iJ,O);Zj()})();var IJ=82;window.oz={zz:"0820fdace1017800ebdf62cbc35cbeca5d8b435652ee3d253bb2e03195f77060a34ecc0424666f18abca1759ee2fa744800dfad86d4269514242d4fceed9d9c70b54e28c9b8c3fbf20a4971c6cf7cf3e60654d34ea06fc0747a30d8d8807f58873200a982d1d45fb8ed817474e167ab24b6ec97b833fc5141c0ef332e22dc753"};function I(_){return 396>_}
function J(_){var l=arguments.length,O=[];for(var Z=1;Z<l;++Z)O.push(arguments[Z]-_);return String.fromCharCode.apply(String,O)}function L(_,l){_+=l;return _.toString(36)}(function JJ(l){return l?0:JJ(l)*JJ(l)})(OI);})();}catch(x){document.cookie='brav=oex'+x;}finally{ie9rgb4=void(0);};function ie9rgb4(a,b){return a>>b>>0};

})();

</script>

<script type="text/javascript" src="/TSPD/084fc6184bab20009b43f88181dfc281050b986fbf5cd6e7067eeb760574cf33392dd93acd61a34b?type=8"></script>

<script>

(function(){
    var securemsg;
    var dosl7_common;

window["blobfp"] = "1111111110112000003e82ff5ac71e30000004a91d2b9750979230f005996dcd100001c20be2e63e7a47a6a80ea7aac3f26b85092554439d9300000020http://re.security.f5aas.com/re/";


})();

</script>

<script type="text/javascript" src="/TSPD/084fc6184bab20009b43f88181dfc281050b986fbf5cd6e7067eeb760574cf33392dd93acd61a34b?type=11"></script>
<noscript>Please enable JavaScript to view the page content.</noscript>
</head><body>
</body></html>

我该如何处理这种“encryption”(加密)机制,并获得我想要的最终HTML,也就是在浏览器中查看“example.com”的源代码时看到的HTML?

提问于
用户回答回答于

这是因为页面内容是通过Ajax调用加载的,使用PhantomJS WebDriver可以解决这一问题。

用户回答回答于

你的Web服务器前面有F5服务器吗?我遇到过类似的情况:我的页面代码中被注入了JavaScript。

扫码关注云+社区

领取腾讯云代金券