urllib_站长素材

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

import urllib.request
from lxml import etree
def creat_request(page):
    """Build a request object for *page* with a browser-like User-Agent.

    Args:
        page: Absolute URL of the page to fetch.

    Returns:
        A ``urllib.request.Request`` for ``page`` carrying the custom
        ``User-Agent`` header.
    """
    # NOTE(review): presumably the site rejects urllib's default User-Agent,
    # hence the desktop Chrome string -- confirm against the target site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    return urllib.request.Request(url=page, headers=header)
def get_content(req):
    """Open *req* and return the response body decoded as UTF-8 text.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read/decode raises.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
def download_imag(content):
    """Download every lazy-loaded image found in an HTML page into ``./img/``.

    Each ``<img class="lazy">`` element contributes one download: the file is
    named after the element's ``alt`` text and fetched from a URL derived from
    its ``data-original`` attribute. A failed download is reported and
    skipped so the remaining images are still attempted.

    Args:
        content: HTML source of a listing page, as text.

    Returns:
        None. Files are written to ``./img/<alt>.jpg``.
    """
    import os  # local import: the file's import header does not provide os

    tree = etree.HTML(content)

    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before the first download.
    os.makedirs("./img/", exist_ok=True)

    # Read both attributes from the same element so a name can never be
    # paired with another image's URL when one attribute is missing.
    for img in tree.xpath('//img[@class="lazy"]'):
        name = img.get('alt')
        # Image-heavy sites usually lazy-load, so the address is taken from
        # data-original, i.e. the value before it is swapped into src.
        src = img.get('data-original')
        if name is None or src is None:
            continue

        # Drop the last six characters and re-append ".jpg".
        # NOTE(review): presumably this strips a thumbnail suffix such as
        # "_s.jpg" to reach the full-size image -- confirm against the site.
        url = 'https:' + src[:-6] + ".jpg"
        print(url)
        try:
            urllib.request.urlretrieve(url=url, filename="./img/" + name + ".jpg")
        except Exception as e:
            # Deliberately broad: the crawl is best-effort and one bad image
            # must not abort the page, but the failure is no longer hidden.
            print("下载失败: {} ({})".format(url, e))
            continue
        print("下载完成")
def main(start_num=1, end_num=10):
    """Crawl listing pages ``start_num``..``end_num`` and download their images.

    Args:
        start_num: First listing page to fetch (inclusive). Defaults to 1.
        end_num: Last listing page to fetch (inclusive). Defaults to 10.

    Returns:
        None.
    """
    base = 'https://sc.chinaz.com/tupian/rentiyishu'
    for i in range(start_num, end_num + 1):
        # The first listing page has no numeric suffix; later pages use "_<n>".
        if i == 1:
            page = base + '.html'
        else:
            page = base + '_' + str(i) + '.html'
        req = creat_request(page)
        content = get_content(req)
        download_imag(content)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


urllib_站长素材
https://ianwusb.blog/2024/07/26/urllib_站长素材/
作者
Ianwusb
发布于
2024年7月26日
许可协议