# 1. Get the image links of an article (获取文章的图片链接)
def getHtmlPics(strHtml):
    """Return the ``src`` value of every ``<img>`` tag found in *strHtml*.

    Args:
        strHtml: HTML markup to scan. An empty string or ``None`` yields an
            empty list.

    Returns:
        A list of ``src`` attribute values, in the order the tags appear.
        ``<img>`` tags without a ``src`` attribute, or with a ``src`` that
        has no value, contribute nothing.
    """

    class _ImgSrcCollector(HTMLParser.HTMLParser):
        """Parser that records the ``src`` of each ``<img>`` start tag."""

        def __init__(self):
            # Explicit base-class call (rather than super()) kept from the
            # original so the code does not depend on the base being a
            # new-style class.
            HTMLParser.HTMLParser.__init__(self)
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag != 'img':
                return
            # The parser reports a valueless attribute (``<img src>``) with
            # value None; skip it so callers only ever receive strings.
            self.links.extend(
                value for name, value in attrs
                if name == 'src' and value is not None
            )

    if not strHtml:
        return []

    collector = _ImgSrcCollector()
    collector.feed(strHtml)
    return collector.links
# 2. Remove specific tags from HTML (移除html中特定的tag)
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3: same class lives in html.parser
def removeHtmlTag(htmlstr, allowTags):
    """Strip every tag from *htmlstr* except those named in *allowTags*.

    Text content is always kept, including the text inside tags that are
    removed. Entity and character references (``&amp;``, ``&#62;``) are
    copied through unchanged rather than being dropped or decoded.

    Args:
        htmlstr: HTML markup to filter. An empty string or ``None`` yields
            an empty string.
        allowTags: Container of tag names to keep. Names are compared with
            the tag name reported by the parser, which HTMLParser documents
            as lower-cased, so entries should be lower case.

    Returns:
        The filtered markup as a single string.

    NOTE(review): an allowed start tag is emitted exactly as written in the
    input, attributes included. If the input is untrusted, attributes such
    as event handlers survive on allowed tags; sanitize separately.
    """

    class _TagFilter(HTMLParser.HTMLParser):
        """Parser that re-emits text plus only the allowed tags."""

        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            # Python 3's parser decodes character references into
            # handle_data by default, which would turn "&lt;" into a
            # literal "<" in the output. Switch that off so references
            # reach the handlers below. On a parser without this option
            # the attribute is simply unused.
            self.convert_charrefs = False
            # Collect fragments and join once instead of repeated "+=".
            self.pieces = []

        def handle_starttag(self, tag, attrs):
            if tag in allowTags:
                self.pieces.append(self.get_starttag_text())

        def handle_startendtag(self, tag, attrs):
            # The default implementation calls handle_starttag and then
            # handle_endtag, which would emit "<br/></br>" for "<br/>".
            if tag in allowTags:
                self.pieces.append(self.get_starttag_text())

        def handle_endtag(self, tag):
            if tag in allowTags:
                self.pieces.append("</" + tag + ">")

        def handle_data(self, data):
            self.pieces.append(data)

        def handle_entityref(self, name):
            # Named reference such as "&amp;": copy through unchanged.
            self.pieces.append("&" + name + ";")

        def handle_charref(self, name):
            # Numeric reference such as "&#62;" or "&#x3e;".
            self.pieces.append("&#" + name + ";")

    if not htmlstr:
        return ""

    tag_filter = _TagFilter()
    tag_filter.feed(htmlstr)
    # Flush anything the parser buffered while waiting for more input.
    tag_filter.close()
    return "".join(tag_filter.pieces)