re.compile() 将正则表达式预编译成内部格式(Pattern 对象),在重复使用同一表达式时可以提高执行效率。
# Compile the pattern once, then search the text with letter case ignored.
strr = "PYTHON666Java"
pat = re.compile(r"Python", flags=re.IGNORECASE)  # flag: case-insensitive matching
found = pat.search(strr)
print(found)
```python
import re
# match() vs. search()
#   match()  -- only matches at the beginning of the string
#   search() -- matches at any position in the string
# Both stop after the first hit; neither keeps scanning for further matches.
strr = "javapythonjavahtmlpythonjs"
pat = re.compile(r"python")
first_hit = pat.search(strr)
print(first_hit.group())
import re
# findall()  -- collects every non-overlapping match into a list
# finditer() -- yields a match object for every non-overlapping match
strr="hello--------hello-----------\
---------hello-----------------\
---------hello--hello----------------\
----------hello---------hello----hello----------"
pat = re.compile(r"hello")
# pat.findall(strr) would return the same matched strings directly as a list.
data = pat.finditer(strr)
# Each item produced by finditer() is a match object; group() extracts its text.
list1 = [match.group() for match in data]
print(list1)
import re
# split() -- cut the string wherever the pattern matches and return the pieces as a list
# sub()   -- replace every match of the pattern with the given replacement text
strr1 = "张三,,,李四,,,,,,,,,王五,,,,,,,,赵六"
strr2 = "hello 123,hello 456!"

pat1 = re.compile(r",+")   # a run of one or more commas acts as a single separator
pat2 = re.compile(r"\d+")  # a run of one or more digits

result1 = re.split(pat1, strr1)
result2 = re.sub(pat2, "666", strr2)
print(result2)
import re
import requests
# Download a web page and pair up the first two <td> cells of every highlighted
# table row (presumably a name and its phone number, judging by the URL -- verify).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Ap\
pleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Sa\
fari/537.36"
}
# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(
    "http://changyongdianhuahaoma.51240.com/", headers=headers, timeout=10
).text
# Both patterns anchor on the same <tr bgcolor="#EFF7F0"> row: pat1 captures the
# text of the first <td> cell, pat2 captures the text of the second <td> cell.
pat1=r'<tr bgcolor="#EFF7F0">[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>[\s\S]*?</td>[\s\S]*?</tr>'
pat2=r'<tr bgcolor="#EFF7F0">[\s\S]*?<td>[\s\S]*?</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?</tr>'
pattern1 = re.compile(pat1)
pattern2 = re.compile(pat2)
data1 = pattern1.findall(response)
data2 = pattern2.findall(response)
# zip() pairs the cells positionally and stops at the shorter list, so a count
# mismatch between the two patterns cannot raise IndexError.
resultlist = [first_cell + second_cell for first_cell, second_cell in zip(data1, data2)]
print(resultlist)
import urllib.request
import re
# Globally disable HTTPS certificate verification.
# NOTE(review): this patches a private hook of the ssl module and turns off
# certificate checks for every HTTPS request made through urllib in this process,
# which allows man-in-the-middle attacks. Kept because it was chosen explicitly;
# remove it if the local certificate store works.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/\
537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
page_num = int(input("请问要爬取第几页呢:"))
# The request asks for 20 items per page (limit=20), so page N starts at item
# offset (N - 1) * 20.
offset = (page_num - 1) * 20
page = str(offset)
url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=" + page + "&limit=20"
req = urllib.request.Request(url, headers=headers)
# The with-block closes the HTTP response deterministically; the timeout bounds
# the wait if the server never answers.
with urllib.request.urlopen(req, timeout=10) as resp:
    data = resp.read().decode()
# pat1 captures the first element of each "rating" array, pat2 captures each "title".
pat1 = r'"rating":\["(.*?)","\d+"\]'
pat2 = r'"title":"(.*?)"'
pattern1 = re.compile(pat1, re.I)
pattern2 = re.compile(pat2, re.I)
data1 = pattern1.findall(data)
data2 = pattern2.findall(data)
# zip() pairs titles with ratings and stops at the shorter list, so a count mismatch
# cannot raise IndexError. Numbering starts at offset + 1 so that the printed rank
# continues across pages instead of restarting at 1 on every page.
for rank, (title, rating) in enumerate(zip(data2, data1), start=offset + 1):
    print("排名:", rank, "电影名:", title, "豆瓣评分:", rating)