<html>
<head>
<style type="text/css">
/* Each heading level below demonstrates one text-decoration value. */
h1 {text-decoration: overline}
h2 {text-decoration: line-through}
h3 {text-decoration: underline}
/* NOTE(review): "blink" is deprecated and most current browsers ignore it - confirm before relying on it. */
h4 {text-decoration:blink}
/* Links are rendered without any text decoration. */
a {text-decoration: none}
</style>
</head>
<body>
<!-- One heading per rule above, followed by a link styled by the "a" rule. -->
<h1>这是标题 1</h1>
<h2>这是标题 2</h2>
<h3>这是标题 3</h3>
<h4>这是标题 4</h4>
<p><a href="http://www.w3school.com.cn/index.html">这是一个链接</a></p>
</body>
</html>
.important
选择所有 class 属性中包含 important 的元素
<html>
<head>
<style type="text/css">
/* Class selectors scoped to <p>: each class assigns its own font-family fallback list. */
p.serif{font-family:"Times New Roman",Georgia,Serif}
p.sansserif{font-family:Arial,Verdana,Sans-serif}
</style>
</head>
<body>
<h1>CSS font-family</h1>
<!-- Each paragraph opts into one of the rules above through its class attribute. -->
<p class="serif">This is a paragraph, shown in the Times New Roman font.</p>
<p class="sansserif">This is a paragraph, shown in the Arial font.</p>
</body>
</html>
#intro
用于选择 id="intro" 的元素
p#intro
用于选择 id="intro" 的 p 元素
<html>
<head>
<style type="text/css">
/* ID selectors: each rule targets the single element carrying that id. */
#serif{font-family:"Times New Roman",Georgia,Serif;color:red}
#sansserif{font-family:Arial,Verdana,Sans-serif}
</style>
</head>
<body>
<h1>CSS font-family</h1>
<!-- Each paragraph is matched by the rule whose selector equals its id. -->
<p id="serif">This is a paragraph, shown in the Times New Roman font.</p>
<p id="sansserif">This is a paragraph, shown in the Arial font.</p>
</body>
</html>
*[title]
选择所有包含title属性的元素
a[href]
选择所有带有href属性的锚元素
a[href][title]
选择同时带有href和title属性的锚元素,注意这里是要同时满足。
a[href="www.so.com"]
选择href属性值恰好等于"www.so.com"的锚元素
<html>
<head>
<style type="text/css">
/* Base rule: every <p class="serif"> gets the serif font stack in red. */
p.serif{font-family:"Times New Roman",Georgia,Serif;color:red}
/* Attribute selector: paragraphs that also carry title="ttt" get blue text instead. */
p.serif[title="ttt"]{color:blue}
</style>
</head>
<body>
<h1>CSS font-family</h1>
<!-- No title attribute: only the base rule applies. -->
<p class="serif">testa.</p>
<!-- title="ttt": matched by both rules above. -->
<p class="serif" title="ttt">testb.</p>
</body>
</html>
h1 em
<html>
<head>
<style type="text/css">
/* Descendant selector: matches every <em> nested at any depth inside a <div>. */
div em{color:red}
</style>
</head>
<body>
<h1>CSS font-family</h1>
<div>
<!-- Direct child of the div: matched. -->
<em>test</em>
<span>
<!-- Nested inside a span: still a descendant of the div, so also matched. -->
<em>test2</em>
</span>
</div>
</body>
</html>
h1 > strong
<html>
<head>
<style type="text/css">
/* Child selector: matches only <em> elements that are direct children of a <div>. */
div>em{color:red}
</style>
</head>
<body>
<h1>CSS font-family</h1>
<div>
<!-- Direct child of the div: matched. -->
<em>test</em>
<span>
<!-- Child of the span, not of the div: not matched by "div>em". -->
<em>test2</em>
</span>
</div>
</body>
</html>
nodename
选取此节点的所有子节点
/
从根节点选取
//
从匹配的当前节点选择文档中的节点,而不考虑它们的位置。
.
选取当前节点
..
选取当前节点的父节点
@
选取属性
bookstore
选取bookstore元素的所有子节点
/bookstore
选取根元素bookstore
bookstore/book
选取属于bookstore的子元素的所有book元素
//book
选取所有book元素,而不管它们在文档中的位置。
bookstore//book
选择属于bookstore元素的后代的所有book元素
//@lang
选取名为lang的所有属性
[]
谓语写在方括号 [] 中,用来查找某个特定节点或包含某个指定值的节点。
/bookstore/book[1]
第一个book元素
/bookstore/book[last()]
最后一个book元素
/bookstore/book[position()<3]
选择前2个book元素
//title[@lang]
选取所有拥有名为lang的属性的title元素
/bookstore/book[price>35.00]
选取bookstore下price元素的值大于35.00的所有book元素
import json
# Sample mapping with non-ASCII values to round-trip through JSON.
obj = dict(one='一', two='二')

# Serialize the dict to a JSON string, then show its type and content.
encoded = json.dumps(obj)
print(type(encoded), encoded, sep='\n')

# Deserialize the JSON string back into a dict, then show its type and content.
decoded = json.loads(encoded)
print(type(decoded), decoded, sep='\n')
book.xml
<?xml version="1.0" encoding="ISO-8859-1"?>
<bookstore>
<book>
<title lang="eng">Harry Potter</title>
<price>29.99</price>
</book>
<book>
<title lang="eng">Learning XML</title>
<price>39.95</price>
</book>
</bookstore>
from xml.dom import minidom
# Parse the XML file into a DOM document.
doc = minidom.parse('book.xml')
# Locate the document's root element.
root = doc.documentElement
# Tip: print(dir(root)) lists every attribute and method the node offers.
# Print the root element's tag name.
print(root.nodeName)
# Collect every <book> element below the root.
books = root.getElementsByTagName('book')
print(type(books))
# Walk each book and print the text of its first <title> and first <price>.
for book in books:
    titles = book.getElementsByTagName('title')
    prices = book.getElementsByTagName('price')
    # childNodes[0] is the text node holding the element's content.
    print(titles[0].childNodes[0].nodeValue)
    print(prices[0].childNodes[0].nodeValue)
import string
from xml.parsers.expat import ParserCreate
class DefaultSaxHandler(object):
    """Set of expat callbacks that print every parsing event.

    ``element`` always holds the name of the innermost element that is
    currently open (``None`` outside any element), so character data is
    attributed to the element that actually encloses it.
    """

    def __init__(self):
        # Names of the elements that are open right now, outermost first.
        self._open_elements = []
        self.element = None

    def start_element(self, name, attrs):
        """Record the newly opened element and print its name and attributes."""
        self._open_elements.append(name)
        self.element = name
        print('element: %s, attrs: %s' % (name, str(attrs)))

    def end_element(self, name):
        """Print the closing event and fall back to the parent element."""
        print('end element: %s' % name)
        # Guard against an unmatched end event so the handler never raises.
        if self._open_elements:
            self._open_elements.pop()
        self.element = self._open_elements[-1] if self._open_elements else None

    def char_data(self, text):
        """Print non-blank text together with the element that encloses it."""
        # Whitespace-only chunks (indentation, newlines between tags) are skipped.
        if text.strip():
            print("%s's text is %s" % (self.element, text))
# Wire one handler instance to a fresh expat parser.
handler = DefaultSaxHandler()
parser = ParserCreate()
parser.StartElementHandler = handler.start_element
parser.EndElementHandler = handler.end_element
parser.CharacterDataHandler = handler.char_data
# Feed the file as bytes so expat applies the encoding named in the XML
# declaration; ParseFile reads until EOF and finalizes the parse, so a
# truncated document is reported instead of being silently accepted.
with open('book.xml', 'rb') as f:
    parser.ParseFile(f)
[0-9]
任意一个数字,等价\d
[a-z]
任意一个小写字母
[A-Z]
任意一个大写字母
[^0-9]
匹配非数字,等价\D
\w
等价[A-Za-z0-9_](ASCII 范围内),字母数字下划线
\W
等价对\w取非
.
任意字符(默认不匹配换行符)
[]
匹配内部任意字符或子表达式
[^]
对字符集合取非
*
匹配前面的字符或者子表达式0次或多次
+
匹配前一个字符至少1次
?
匹配前一个字符0次或1次
^
匹配字符串开头
$
匹配字符串结束
import re
# --- Basic match: three digits, a hyphen, then three to eight digits ---
m = re.match(r'\d{3}\-\d{3,8}', '010-12345')
# Tip: print(dir(m)) lists everything a match object offers.
print(m.string)          # the string the pattern was applied to
print(m.pos, m.endpos)   # start and end index of the searched region

# --- Capture groups ---
m = re.match(r'^(\d{3})-(\d{3,8})$', '010-12345')
# All captured parts as a tuple: ('010', '12345')
print(m.groups())
# group(0) is the whole match (010-12345); group(1) and group(2) are the
# first (010) and second (12345) captured parts.
for index in range(3):
    print(m.group(index))

# --- Splitting: every run of digits acts as a separator ---
p = re.compile(r'\d+')
print(type(p))
print(p.split('one1two32three3four4'))

# --- Time of day: capture hours, minutes and seconds around the colons ---
t = '20:15:45'
m = re.match(
    r'^(0[0-9]|1[0-9]|2[0-3]|[0-9])'
    r'\:(0[0-9]|1[0-9]|2[0-9]|3[0-9]|4[0-9]|5[0-9]|[0-9])'
    r'\:(0[0-9]|1[0-9]|2[0-9]|3[0-9]|4[0-9]|5[0-9]|[0-9])$',
    t,
)
print(m.groups())
将 chromedriver.exe 的路径设置为环境变量。请确保安装的是最新的驱动,不然可能会报错。
find_element(s)_by_tag_name
find_element(s)_by_css_selector
from selenium import webdriver
import time
# Start a Chrome session through its WebDriver.
# NOTE(review): the find_element(s)_by_* helpers used below were removed in
# Selenium 4.3; on newer versions use find_element(By.CSS_SELECTOR, ...) -
# confirm against the installed Selenium version.
browser = webdriver.Chrome()
try:
    # Abort any page load that takes longer than 30 seconds.
    browser.set_page_load_timeout(30)
    # Open the search results page.
    browser.get('http://www.17huo.com/search.html?sq=2&keyword=羊毛')
    # Read the pager element to find out how many result pages exist.
    page_info = browser.find_element_by_css_selector('body > div.wrap > div.pagem.product_list_pager > div')
    # The pager text is expected to look like "共 100 页,每页 24 条"
    # ("100 pages in total, 24 items per page"); the page count is the
    # second space-separated token.
    pages = int((page_info.text.split(',')[0]).split(' ')[1])
    # Only the first three pages are crawled.
    for page in range(min(pages, 3)):
        url = 'http://www.17huo.com/?mod=search&sq=2&keyword=%E7%BE%8A%E6%AF%9B&page=' + str(page + 1)
        browser.get(url)
        # The list is loaded dynamically: scroll to the bottom to trigger it.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give the page time to finish loading the items
        goods = browser.find_element_by_css_selector('body > div.wrap > div:nth-child(2) > div.p_main > ul').find_elements_by_tag_name('li')
        print('%d页有%d件商品' % ((page + 1), len(goods)))
        for good in goods:
            try:
                title = good.find_element_by_css_selector('a:nth-child(1) > p:nth-child(2)').text
                price = good.find_element_by_css_selector('div > a > span').text
                print(title, price)
            except Exception:
                # Best effort: an item with unexpected markup is reported and
                # skipped, while KeyboardInterrupt/SystemExit still propagate.
                print("Exception...........")
finally:
    # Always shut the browser down, even when a lookup or page load fails.
    browser.quit()