首页
学习
活动
专区
工具
TVP
发布
社区首页 >问答首页 >如何从URL中删除HTML、CSS和Javascript代码

如何从URL中删除HTML、CSS和Javascript代码
EN

Stack Overflow用户
提问于 2018-07-05 08:53:08
回答 2查看 485关注 0票数 0

我想去掉某个网址对应页面中的所有代码,只保留文本。我正在尝试用下面的代码来实现,但还没有得到想要的结果,因为输出中仍然包含JavaScript代码。

代码语言:python
复制
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
html = page.read()
soup = BeautifulSoup(html, "html.parser")
print(soup.html.string)
for string in soup.stripped_strings:
    print("string: " + repr(string))

我得到的输出示例如下:

代码语言:javascript
复制
string: 'Advertise with us'
string: 'Ad choices'
string: 'Copyright © 2018 BBC.'
string: 'The BBC is not responsible for the content of external sites.'
string: 'Read about our approach to external linking.'
string: '/*<![CDATA[*/ if (window.bbcdotcom && window.bbcdotcom.analytics && bbcdotcom.config && !bbcdotcom.config.isSportApp()) { bbcdotcom.analytics.page(); } /*]]>*/'
string: '/*<![CDATA[*/ if (window.bbcdotcom && bbcdotcom.currencyProviders) { bbcdotcom.currencyProviders.write(); } /*]]>*/'
string: '/*<![CDATA[*/ if (window.bbcdotcom && bbcdotcom.currencyProviders) { bbcdotcom.currencyProviders.postWrite(); } /*]]>*/'
string: '/*<![CDATA[*/ if (window.bbcdotcom && bbcdotcom.data && bbcdotcom.data.stats && bbcdotcom.data.stats === 1 && bbcdotcom.utils && window.location.pathname === \'/\' && window.bbccookies && bbccookies.readPolicy(\'performance\') ) { var wwhpEdition = bbcdotcom.utils.getMetaPropertyContent(\'wwhp-edition\'); var _sf_async_config={}; /** CONFIGURATION START **/ _sf_async_config.uid = 50924; _sf_async_config.domain = "bbc.co.uk"; _sf_async_config.title = "Homepage"+(wwhpEdition !== \'\' ? \' - \'+wwhpEdition : \'\'); _sf_async_config.sections = "Homepage"+(wwhpEdition !== \'\' ? \', Homepage - \'+wwhpEdition : \'\'); _sf_async_config.region = wwhpEdition; _sf_async_config.path = "/"+(wwhpEdition !== \'\' ? \'?\'+wwhpEdition : \'\'); /** CONFIGURATION END **/ (function(){ function loadChartbeat() { window._sf_endpt=(new Date()).getTime(); var e = document.createElement("script"); e.setAttribute("language", "javascript"); e.setAttribute("type", "text/javascript"); e.setAttribute(\'src\', \'//static.chartbeat.com/js/chartbeat.js\'); document.body.appendChild(e); } var oldonload = window.onload; window.onload = (typeof window.onload != "function") ? loadChartbeat : function() { oldonload(); loadChartbeat(); }; })(); } /*]]>*/'
string: '/*<![CDATA[*/ (function() { window.bbcdotcom.bodyLast = true; }()); /*]]>*/'
string: "(function() {\n    'use strict';\n\n    var promoManager = {\n        url: '',\n        promoLoaded: false,\n                makeUrl: function (theme, site, win) {\n            var loc = win? win.location : window.location,\n                proto = loc.protocol,\n                host = loc.host,\n                url = proto + '//' + ((proto.match(/s:/i) && !host.match(/^www\\.(int|test)\\./i))? 'ssl.' : 'www.'),\n                themes = ['light', 'dark'];\n\n            if ( host.match(/^(?:www|ssl|m)\\.(int|test|stage|live)\\.bbc\\./i) ) {\n                url += RegExp.$1 + '.';\n            }\n            else if ( host.match(/^pal\\.sandbox\\./i) ) {\n                url += 'test.';\n            }\n\n                        theme = themes[ +(theme === themes[0]) ];\n           \n           return url + 'bbc.co.uk/navpromo/card/' + site + '/' + theme;\n        },\n                init: function(node) {\n            var disabledByCookie = (document.cookie.indexOf('ckns_orb_nopromo=1') > -1),\n                that = this;\n            \n            if (window.promomanagerOverride) {\n                for (var p in promomanagerOverride) {\n                    that[p] = promomanagerOverride[p];\n                }\n            }\n                \n            if ( window.orb.fig('uk') && !disabledByCookie ) {\n                require(['orb/async/_footerpromo', 'istats-1'], function(promo, istats) {\n\n                    var virtualSite = istats.getSite() || 'default';\n                    that.url = (window.promomanagerOverride || that).makeUrl('light', virtualSite);\n\n                    if (that.url) { \n                        promo.load(that.url, node, {\n                                                          onSuccess: function(e) {\n                                if(e.status === 'success') {\n                                    node.parentNode.className = node.parentNode.className + ' orb-footer-promo-loaded';\n                                    
promoManager.promoLoaded = true;\n                                    promoManager.event('promo-loaded').fire(e);\n                                }\n                             },\n                             onError: function() {\n                                istats.log('error', 'orb-footer-promo-failed');\n                                bbccookies.set('ckns_orb_nopromo=1; expires=' + new Date(new Date().getTime() + 1000 * 60 * 10).toGMTString() + ';path=/;');\n                             }\n                        });   \n                    }\n                });\n            }\n        }\n    };\n    \n        \n    define('orb/promomanager', ['orb/lib/_event'], function (event) {\n        event.mixin(promoManager);\n        return promoManager;\n    });\n    \n    require(['orb/promomanager'], function (promoManager) {\n        promoManager.init(document.getElementById('navp-orb-footer-promo'));\n    })\n})();"
string: 'require.config({\n            paths: {\n                "mybbc/templates": \'//mybbc.files.bbci.co.uk/notification-ui/3.8.4/templates\',\n                "mybbc/notifications": \'//mybbc.files.bbci.co.uk/notification-ui/3.8.4/js\'\n            }\n        });\n\n        require([\'mybbc/notifications/NotificationsMain\', \'idcta/idcta-1\'], function (NotificationsMain, idcta) {\n            var loadNotifications = function (fig) {\n                if (fig.geo.isUK()) {\n                    NotificationsMain.run(idcta, \'//mybbc.files.bbci.co.uk/notification-ui/3.8.4/\');\n                }\n            };\n            var orbFig = window.orb.fig;\n            if (typeof orbFig.load === \'function\') {\n                // Use new async API from Orbit\n                orbFig.load(loadNotifications, loadNotifications);\n            } else {\n                // Use old sync-only API from PAL orbfig project\n                loadNotifications(orbFig);\n            }\n        });'
string: "if (typeof require !== 'undefined') { require(['istats-1'], function(istats){ istats.track('external', { region: document.getElementsByTagName('body')[0] }); istats.track('download', { region: document.getElementsByTagName('body')[0] }); }); }"
string: 'if (window.SEARCHBOX.suppress === false && window.SEARCHBOX.locale && /^en-?.*?/.test(window.SEARCHBOX.locale)) {\n    require.config({\n      paths: {\n        "search/searchbox": window.SEARCHBOX.searchboxAppStaticPrefix,\n        "disco-layer": "//nav.files.bbci.co.uk/discovery-layer/0.0.1-272.f146f82/app"\n      }\n    });\n\n    var orbFig = window.orb.fig;\n\n    var loadSearchSuggest = function (fig) {\n        if (fig.geo.isUK()) {\n            require([\'search/searchbox/searchboxDrawer\'], function (SearchboxDrawer) {\n              SearchboxDrawer.run(window.SEARCHBOX);\n            });\n        }\n    };\n    if (typeof orbFig.load === \'function\') {\n        // Use new async API from Orbit\n        // In event of fig failure provide search suggest functionality by default\n        orbFig.load(loadSearchSuggest, loadSearchSuggest);\n    } else {\n        // Use old sync-only API from PAL orbfig project\n        loadSearchSuggest(orbFig);\n    }\n\n        var loadDiscoveryLayer = function (fig) {\n            if (fig.geo.isUK()) {\n                require([\'disco-layer\'], function (discoLayer) {\n                  discoLayer.run("//nav.files.bbci.co.uk/discovery-layer/content/", "//nav.files.bbci.co.uk/discovery-layer/0.0.1-272.f146f82/main.css");\n                });\n            }\n        };\n        if (typeof orbFig.load === \'function\') {\n            orbFig.load(loadDiscoveryLayer);\n        } else {\n            loadDiscoveryLayer(orbFig);\n        }\n  }'
/*
* 提示:该行代码过长,系统自动注释不进行高亮。一键复制会移除系统注释 
* string: '"use strict";require(["orb/lib/_script"],function(r){var a="https://nav.files.bbci.co.uk/orbit-webmodules/0.0.1-92.87f4a89/cookie-banner/cookie-prompt/",s=a+"bbccookies.min.css",i="ckns_privacy",c="ckns_policy",l="ckns_explicit",d="1",u={},b={ads:!0,personalisation:!0,performance:!0,necessary:!0},p="111";var m={personalisation:"ckps_.+|X-AB-iplayer-.+|ACTVTYMKR|BBC_EXAMPLE_COOKIE|BBCIplayer|BBCiPlayerM|BBCIplayerSession|BBCMediaselector|BBCPostcoder|bbctravel|CGISESSID|ed|food-view|forceDesktop|h4|IMRID|locserv|MyLang|myloc|NTABS|ttduserPrefs|V5|WEATHER|BBCScienceDiscoveryPlaylist_.+|bitratePref|correctAnswerCount|genreCookie|highestQuestionScore|incorrectAnswerCount|longestStreak|MSCSProfile|programmes-oap-expanded|quickestAnswer|score|servicePanel|slowestAnswer|totalTimeForAllFormatted|v|BBCwords|score|correctAnswerCount|highestQuestionScore|hploc|BGUID|BBCWEACITY|mstouch|myway|BBCNewsCustomisation|cbbc_anim|cbeebies_snd|bbcsr_usersx|cbeebies_rd|BBC-Latest_Blogs|zh-enc|pref_loc|m|bbcEmp.+|recs-.+|_lvd2|_lvs2|tick|_fcap_CAM1|_rcc2",performance:"ckpf_.+|optimizely.*|BBCLiveStatsClick|id|_em_.+|cookies_enabled|mbox|mbox-admin|mc_.+|omniture_unique|s_.+|sc_.+|adpolicyAdDisplayFrequency|s1|ns_session|ns_cookietest|ns_ux|NO-SA|tr_pr1|gvsurvey|bbcsurvey|si_v|sa_labels|obuid|mm_.+|mmid|mmcore.+|mmpa.+|dtCookie|dtPC|rxVisitor|rxvt|dtSa|dtLatC",ads:"ckad_.+|rsi_segs|c",necessary:"ckns_.+|BBC-UID|blq\\\\.dPref|SSO2-UID|BBC-H2-User|rmRpDetectReal|bbcComSurvey|IDENTITY_ENV|IDENTITY|IDENTITY-HTTPS|IDENTITY_SESSION|BBCCOMMENTSMODULESESSID|bbcBump.+|IVOTE_VOTE_HISTORY|pulse|BBCPG|BBCPGstat|ecos\\\\.dt"};function n(e){var n=("; "+document.cookie).split("; "+e+"=");return 2===n.length?n.pop().split(";").shift():null}function v(e){return null!==n(e)}function f(e,n,o){var t=e+"="+(n=(n+"").replace(/[^!#$&-+\\--:<-\\[\\]-~]/g,encodeURIComponent));return 
t+=(o=o||{}).path?";path="+o.path:"",t+=o.domain?";domain="+o.domain:"",t+=o.expires?";expires="+o.expires.toUTCString():"",t+=o.secure?";secure":""}function w(e,n){if(void 0===e)return null;var o={};o.expires=new Date(0),n&&(o.domain=n.domain,o.path=n.path),document.cookie=f(e,"removed",o)}function t(e,n){var o=window.location.pathname.split("/");for(w(e,n);o.length;){var t=o.join("/");""===t&&(t="/"),w(e,{domain:n,path:t}),w(e,{domain:"."+n,path:t}),o.pop()}}function y(e){var n=window.location.hostname.split(".");for(w(e),t(e);n.length&&-1==="|co.uk|com|".indexOf("|"+n.join(".")+"|");){if(n.length){var o=n.join(".");w(e,{domain:o}),t(e,o)}n.shift()}}function o(e){var n,o,t=(n=new RegExp("(?:^|; ?)"+c+"=(\\\\d)(\\\\d)(\\\\d)[ie]?($|;)"),(o=document.cookie.match(n))?{ads:!!+o[1],personalisation:!!+o[2],performance:!!+o[3],necessary:!0}:b);return e?t[e]:t}function k(e){return function(e){var n=JSON.stringify(e);if(void 0!==u[n])return u[n];var o="";for(var t in e)e.hasOwnProperty(t)&&m[t]&&!0===e[t]&&(o+=(o?"|":"")+m[t]);return u[n]=new RegExp("^("+(o||".*")+")$","i"),u[n]}(o()).test(e)}function e(){y("");for(var e=document.cookie.split(";"),n=0;n<e.length;n++){var o=e[n].split("=")[0].trim();k(o)||y(o)}}function g(e,n){var o,t=((o=new Date).setYear(o.getFullYear()+1),o),i=f(e,n,{path:"/",domain:".bbc.co.uk",expires:t});document.cookie=i;var c=f(e,n,{path:"/",domain:".bbc.com",expires:t});document.cookie=c}function E(){var e=n(l);return null!==e&&e===d}function B(e,n){e.style.display="none",n.style.display="none"}function h(e,n){E()||(e.style.display="none",n.style.display="block",window.bbcpage.trackRegion(n,{linkLocation:"nav-banner-cookies"}),window.bbcuser.logEvent("view","nav-banner-cookies"))}function _(){var 
e,n,o=document.getElementById("bbcprivacy"),t=document.getElementById("bbccookies");document.getElementById("bbcprivacy-continue-button").addEventListener("click",function(){B(o,t),h(o,t),window.bbcuser.logEvent("click-cta","nav-banner-privacy")}),document.getElementById("bbccookies-continue-button").addEventListener("click",function(){B(o,t),g(l,d),g(c,p),window.bbcuser.logEvent("click-continue","nav-banner-cookies")}),document.getElementById("bbccookies-settings").getElementsByTagName("a")[0].addEventListener("click",function(){g(l,d)}),v(i)?h(o,t):(n=t,(e=o).style.display="block",n.style.display="none",window.bbcpage.trackRegion(e,{linkLocation:"nav-banner-privacy"}),window.bbcuser.logEvent("view","nav-banner-privacy"),g(i,"1"))}function C(e,o){var n,t,i=(t="en","string"==typeof(n=e)&&2<=n.length&&(t=n.trim().substring(0,2).toLowerCase()),-1!==["en","cy","gd","ga"].indexOf(t)?t:"en"),c=a+i+".js";window.bbcpage.loadCSS(s).then(function(){r.jsonp(c,function(e){var n=e.replace(/<<bbcUrlPrefix>>/g,function(){if(window.orb.bbcUrlPrefix){var e=window.orb.bbcUrlPrefix+"bbc.com",n=window.location.host.match(/(bbc(?:\\.co\\.uk))(:\\d+)?(\\/.*)?$/i);return n&&"bbc.co.uk"===n[1]&&(e=e.replace("bbc.com","bbc.co.uk")),e}return""}());document.getElementById("cookiePrompt").innerHTML=n,o()},{callbackName:"cookiePrompt"})})}!function(n,o){var t=!1,e=!0,i=n.document,c=i.documentElement,r=i.addEventListener,a=r?"addEventListener":"attachEvent",s=r?"removeEventListener":"detachEvent",l=r?"":"on",d=function(e){"readystatechange"===e.type&&"complete"!==i.readyState||(("load"===e.type?n:i)[s](l+e.type,d,!1),!t&&(t=!0)&&o.call(n,e.type||e))},u=function(){try{c.doScroll("left")}catch(e){return void 
setTimeout(u,50)}d("poll")};if("complete"===i.readyState)o.call(n,"lazy");else{if(!r&&c.doScroll){try{e=!n.frameElement}catch(e){}e&&u()}i[a](l+"DOMContentLoaded",d,!1),i[a](l+"readystatechange",d,!1),n[a](l+"load",d,!1)}}(window,e),window.addEventListener?window.addEventListener("beforeunload",e,!1):window.attachEvent?window.attachEvent("onbeforeunload",e):window.onbeforeunload=e,Promise.all([window.bbcuser.isUKCombined(),window.bbcuser.isEU(),window.bbcpage.getLanguage()]).then(function(e){var n=e[0];if(e[1]||n){v(c)||(g(l,"0"),g(c,p)),v(c)||g(c,p);var o=v(i);if(!E()||!o)C(e[2],_)}}).catch(function(){C("en",_)})});'
*/
string: 'require(["istats-1","orb/cookies"],function(t,e){function o(){return"true"===s&&a}function i(){!c&&o()?setTimeout(function(){t.invoke()},"500"):t.invoke()}var n=navigator.userAgent.toLowerCase(),a=!(n.indexOf("msie")>-1)||parseInt(n.split("msie")[1],10)>10,s="true",c=!1,r=window.orb.fig;if(e.isAllowed("s1")){var u=function(e){o()&&e.geo.isUK()&&require(["megavolt-client"],function(e){e&&"function"==typeof e.getMVTIStatsLabels&&e.getMVTIStatsLabels(function(e){t.addLabels(e),c=!0})})};"function"==typeof r.load?r.load(u):u(r);try{if(!require.s.contexts._.config.paths.idcta)return void i();require(["idcta/idcta-1"],function(e){e&&"function"==typeof e.getIStatsLabels&&t.addLabels(e.getIStatsLabels()),i()},function(t){throw t})}catch(t){console&&"function"==typeof console.log&&console.log("an exception occurred while adding idcta labels to istats, invoking istats without them",t),i()}}});'

如何从url中删除所有代码?

编辑I:

如果我用lxml.html做一个解析器:

代码语言:python
复制
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
html = page.read()
import lxml.html
document = lxml.html.document_fromstring(html)
print(document.text_content())

它不起作用,我得到的输出如下:

代码语言:javascript
复制
        BBC - Homepage            window.bbcredirection={geo:true}  






/*
* 提示:该行代码过长,系统自动注释不进行高亮。一键复制会移除系统注释 
* bbcRequireMap = {"jquery-1":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.7.2", "jquery-1.4":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.4", "jquery-1.9":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.9.1", "jquery-1.12":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-1.12.0.min", "jquery-2.2":"http://static.bbci.co.uk/frameworks/jquery/0.4.1/sharedmodules/jquery-2.2.0.min", "istats-1":"//nav.files.bbci.co.uk/nav-analytics/0.1.0-43/js/istats-1", "swfobject-2":"http://static.bbci.co.uk/frameworks/swfobject/0.1.10/sharedmodules/swfobject-2", "demi-1":"http://static.bbci.co.uk/frameworks/demi/0.10.1/sharedmodules/demi-1", "gelui-1":"http://static.bbci.co.uk/frameworks/gelui/0.9.13/sharedmodules/gelui-1", "cssp!gelui-1/overlay":"http://static.bbci.co.uk/frameworks/gelui/0.9.13/sharedmodules/gelui-1/overlay.css", "relay-1":"http://static.bbci.co.uk/frameworks/relay/0.2.6/sharedmodules/relay-1", "clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/clock-1", "canvas-clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/canvas-clock-1", "cssp!clock-1":"http://static.bbci.co.uk/frameworks/clock/0.1.9/sharedmodules/clock-1.css", "jssignals-1":"http://static.bbci.co.uk/frameworks/jssignals/0.3.6/modules/jssignals-1", "jcarousel-1":"http://static.bbci.co.uk/frameworks/jcarousel/0.1.10/modules/jcarousel-1", "bump-3":"//emp.bbci.co.uk/emp/bump-3/bump-3", "ads":"http://static.bbci.co.uk/wwhp/1.126.0/modules/ads", "app":"http://static.bbci.co.uk/wwhp/1.126.0/modules/app", "compiled":"http://static.bbci.co.uk/wwhp/1.126.0/modules/compiled", "definejs":"http://static.bbci.co.uk/wwhp/1.126.0/modules/definejs", "homepage":"http://static.bbci.co.uk/wwhp/1.126.0/modules/homepage", "lib/core":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/core", "lib/module/base":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/module/base", 
"lib/module/manager":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/module/manager", "lib/timeInterval":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/timeInterval", "lib/util":"http://static.bbci.co.uk/wwhp/1.126.0/modules/lib/util", "modules/header":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/header", "modules/images":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/images", "modules/media":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/media", "modules/video":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video", "modules/video/dataProvider":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/dataProvider", "modules/video/player":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/player", "modules/video/playlist":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/playlist", "modules/video/playlistBuilder":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/video/playlistBuilder", "modules/weather":"http://static.bbci.co.uk/wwhp/1.126.0/modules/modules/weather", "mvt_tasks":"http://static.bbci.co.uk/wwhp/1.126.0/modules/mvt_tasks", "vendor/bower/cookie-monster/cookie-monster":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/cookie-monster/cookie-monster", "vendor/bower/fastclick/fastclick":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/fastclick/fastclick", "vendor/bower/happens/index":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/happens/index", "vendor/bower/html5shiv/html5shiv":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/html5shiv/html5shiv", "vendor/bower/imager.js/Imager":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/imager.js/Imager", "vendor/bower/jquery/jquery":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/jquery/jquery", "vendor/bower/js-breakpoints/breakpoints":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/js-breakpoints/breakpoints", 
"vendor/bower/modernizr/modernizr":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/modernizr/modernizr", "vendor/bower/moment/moment":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/moment/moment", "vendor/bower/promise-polyfill/Promise":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/promise-polyfill/Promise", "vendor/bower/slick.js/slick":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/slick.js/slick", "vendor/bower/slick.js/slick.min":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/slick.js/slick.min", "vendor/bower/squire/Squire":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/squire/Squire", "vendor/bower/underscore/underscore":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/bower/underscore/underscore", "vendor/pre-built/bbc-video-experience/continuousPlay/module":"http://static.bbci.co.uk/wwhp/1.126.0/modules/vendor/pre-built/bbc-video-experience/continuousPlay/module"}; require({ baseUrl: 'http://static.bbci.co.uk/', paths: bbcRequireMap, waitSeconds: 30 });    /*<![CDATA[*/ if (typeof bbccookies_flag === 'undefined') { bbccookies_flag = 'ON'; } showCTA_flag = true; cta_enabled = (showCTA_flag && (bbccookies_flag === 'ON')); (function(){var m="ckns_policy",q="Thu, 01 Jan 1970 00:00:00 GMT",i={ads:true,personalisation:true,performance:true,necessary:true};function c(u){if(c.cache[u]){return c.cache[u]}var t=u.split("/"),v=[""];do{v.unshift((t.join("/")||"/"));t.pop()}while(v[0]!=="/");c.cache[u]=v;return v}c.cache={};function a(u){if(a.cache[u]){return a.cache[u]}var v=u.split("."),t=[];while(v.length&&"|co.uk|com|".indexOf("|"+v.join(".")+"|")===-1){if(v.length){t.push(v.join("."))}v.shift()}c.cache[u]=t;return t}a.cache={};function s(t,y,u){var E=[""].concat(a(window.location.hostname)),B=c(window.location.pathname),D="",w,C;for(var x=0,A=E.length;x<A;x++){w=E[x];for(var 
v=0,z=B.length;v<z;v++){C=B[v];D=t+"="+y+";"+(w?"domain="+w+";":"")+(C?"path="+C+";":"")+(u?"expires="+u+";":"");bbccookies.set(D,true)}}}window.bbccookies={POLICY_REFRESH_DATE_MILLIS:new Date(2015,4,21,0,0,0,0).getTime(),POLICY_EXPIRY_COOKIENAME:"ckns_policy_exp",_setEverywhere:s,cookiesEnabled:function(){var t="ckns_testcookie"+Math.floor(Math.random()*100000);this.set(t+"=1");if(this.get().indexOf(t)>-1){e(t);return true}return false},get:function(){return document.cookie},getCrumb:function(t){if(!t){return null}return decodeURIComponent(document.cookie.replace(new RegExp("(?:(?:^|.*;)\\s*"+encodeURIComponent(t).replace(/[\-\.\+\*]/g,"\\$&")+"\\s*\\=\\s*([^;]*).*$)|^.*$"),"$1"))||null},policyRequiresRefresh:function(){var u=new Date();u.setHours(0);u.setMinutes(0);u.setSeconds(0);u.setMilliseconds(0);if(bbccookies.POLICY_REFRESH_DATE_MILLIS<=u.getTime()){var t=bbccookies.getCrumb(bbccookies.POLICY_EXPIRY_COOKIENAME);if(t){t=new Date(parseInt(t));t.setYear(t.getFullYear()-1);return bbccookies.POLICY_REFRESH_DATE_MILLIS>=t.getTime()}else{return true}}else{return false}},_setPolicy:function(t){return f.apply(this,arguments)},readPolicy:function(){return l.apply(this,arguments)},_deletePolicy:function(){s(m,"",q)},_isConfirmed:function(){return n()!==null},_acceptsAll:function(){var t=l();return t&&!(j(t).indexOf("0")>-1)},_getCookieName:function(){return b.apply(this,arguments)},_showPrompt:function(){var t=((!this._isConfirmed()||this.policyRequiresRefresh())&&window.cta_enabled&&this.cookiesEnabled()&&!window.bbccookies_disable);return(window.orb&&window.orb.fig)?t&&(window.orb.fig("no")||window.orb.fig("ck")):t},setDefaultCookiesSingleDomain:function(){f.apply(this,[])},_getPolicy:this.readPolicy};function b(u){var t=(""+u).match(/^([^=]+)(?==)/);return(t&&t.length?t[0]:"")}function j(t){return""+(t.ads?1:0)+(t.personalisation?1:0)+(t.performance?1:0)}function f(x){if(typeof x==="undefined"){x=i}if(typeof arguments[0]==="string"){var 
u=arguments[0],w=arguments[1];if(u==="necessary"){w=true}x=l();x[u]=w}else{if(typeof arguments[0]==="object"){x.necessary=true}}var v=new Date();v.setYear(v.getFullYear()+1);bbccookies.set(m+"="+j(x)+";domain=bbc.co.uk;path=/;expires="+v.toUTCString()+";");bbccookies.set(m+"="+j(x)+";domain=bbc.com;path=/;expires="+v.toUTCString()+";");bbccookies.set(m+"="+j(x)+";domain=bbci.co.uk;path=/;expires="+v.toUTCString()+";");var t=new Date(v.getTime());t.setMonth(t.getMonth()+1);bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbc.co.uk;path=/;expires="+t.toUTCString()+";");bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbc.com;path=/;expires="+t.toUTCString()+";");bbccookies.set(bbccookies.POLICY_EXPIRY_COOKIENAME+"="+v.getTime()+";domain=bbci.co.uk;path=/;expires="+t.toUTCString()+";");return x}function o(t){if(t===null){return null}var u=t.split("");return{ads:!!+u[0],personalisation:!!+u[1],performance:!!+u[2],necessary:true}}function n(){var t=new RegExp("(?:^|; ?)"+m+"=(\\d\\d\\d)($|;)"),u=document.cookie.match(t);if(!u){return null}return u[1]}function l(t){var u=o(n());if(!u){u=i}if(t){return u[t]}else{return u}}function e(t){return document.cookie=t+"=;expires="+q+";"}var g=!(window.bbccookies_flag==="ON"&&!bbccookies._acceptsAll()&&!window.bbccookies_disable);var 
k={},d={"personalisation":"ckps_.+|X-AB-iplayer-.+|ACTVTYMKR|BBC_EXAMPLE_COOKIE|BBCIplayer|BBCiPlayerM|BBCIplayerSession|BBCMediaselector|BBCPostcoder|bbctravel|CGISESSID|ed|food-view|forceDesktop|h4|IMRID|locserv|MyLang|myloc|NTABS|ttduserPrefs|V5|WEATHER|BBCScienceDiscoveryPlaylist_.+|bitratePref|correctAnswerCount|genreCookie|highestQuestionScore|incorrectAnswerCount|longestStreak|MSCSProfile|programmes-oap-expanded|quickestAnswer|score|servicePanel|slowestAnswer|totalTimeForAllFormatted|v|BBCwords|score|correctAnswerCount|highestQuestionScore|hploc|BGUID|BBCWEACITY|mstouch|myway|BBCNewsCustomisation|cbbc_anim|cbeebies_snd|bbcsr_usersx|cbeebies_rd|BBC-Latest_Blogs|zh-enc|pref_loc|m|bbcEmp.+|recs-.+|_lvd2|_lvs2|tick|_fcap_CAM1|_rcc2","performance":"ckpf_.+|optimizely.*|BBCLiveStatsClick|id|_em_.+|cookies_enabled|mbox|mbox-admin|mc_.+|omniture_unique|s_.+|sc_.+|adpolicyAdDisplayFrequency|s1|ns_session|ns_cookietest|ns_ux|NO-SA|tr_pr1|gvsurvey|bbcsurvey|si_v|sa_labels|obuid|mm_.+|mmid|mmcore.+|mmpa.+","ads":"ckad_.+|rsi_segs|c","necessary":"ckns_.+|BBC-UID|blq\\.dPref|SSO2-UID|BBC-H2-User|rmRpDetectReal|bbcComSurvey|IDENTITY_ENV|IDENTITY|IDENTITY-HTTPS|IDENTITY_SESSION|BBCCOMMENTSMODULESESSID|bbcBump.+|IVOTE_VOTE_HISTORY|pulse|BBCPG|BBCPGstat|ecos\\.dt"};function r(){var x=document.cookie.replace(/; +/g,";").split(";"),u,v=[];for(var w=0,t=x.length;w<t;w++){u=x[w];v.push(bbccookies._getCookieName(u))}return v}function h(w){var v=JSON.stringify(w);if(typeof(k[v])!=="undefined"){return k[v]}var u="";for(var t in w){if(w.hasOwnProperty(t)&&d[t]){if(w[t]===true){u+=(u?"|":"")+d[t]}}}k[v]=new RegExp("^("+(u?u:".*")+")$","i");return k[v]}bbccookies.getPolicyExpiryDateTime=function(){return bbccookies.POLICY_EXPIRY_COOKIENAME};bbccookies.purge=function(){var u=bbccookies.readPolicy(),w=r(),x;for(var v=0,t=w.length;v<t;v++){if(!bbccookies.isAllowed(w[v],u)){x=new Date();x.setTime(0);x=x.toUTCString();s(w[v],"deleted",x)}}};function 
p(){if(g){return}bbccookies.purge();contentLoaded(window,bbccookies.purge);if(window.addEventListener){window.addEventListener("beforeunload",bbccookies.purge,false)}else{if(window.attachEvent){window.attachEvent("onbeforeunload",bbccookies.purge)}else{window.onbeforeunload=bbccookies.purge}}}bbccookies.set=function(u,t){if(g){return document.cookie=u}var v=bbccookies._getCookieName(u);if(t||bbccookies.isAllowed(v)){return document.cookie=u}return null};bbccookies.isAllowed=function(v){var u=bbccookies.readPolicy();var t=h(u);return t.test(v)};p()})();
*/
/*!
 * contentloaded.js
 *
 * Author: Diego Perini (diego.perini at gmail.com)
EN

回答 2

Stack Overflow用户

发布于 2018-07-05 08:58:45

试试lxml.html

代码语言:python
复制
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
html = page.read()
import lxml.html
document = lxml.html.document_fromstring(html)
print(document.text_content())
票数 0
EN

Stack Overflow用户

发布于 2018-07-05 19:35:13

如果您的目标是删除<script>标签(或任何其他特定类型的标签),您可以这样做:

代码语言:python
复制
req = urllib.request.Request(url)
page = urllib.request.urlopen(req)
html = page.read()
soup = BeautifulSoup(html, "html.parser")

while soup.script:
    soup.script.replaceWith(' ')

上面的代码会把所有<script>标签替换为单个空格。这样,您就可以先删除所有脚本标签,然后像您之前所做的那样从剩余的标签中提取文本。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/51182153

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档