Preparation
Use a Python 3 environment and install the required modules with pip3.
# Install pip3
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python3 get-pip.py
pip3 -V
# Install the required modules
pip3 install bs4 xlwt
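A quick way to confirm the modules are importable before moving on (just an import check, nothing scraper-specific):
python3 -c "import bs4, xlwt; print('bs4 and xlwt OK')"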
Scraping approach
- Open the site you want to scrape. Image sites generally have categories, albums (photo sets), and page numbers; locate these elements in the browser.
- Step 1: collect the information for every album
  - album name, album ID, album URL, album cover
- Step 2: store the album information in a database or an Excel file
- Step 3: read each album URL from the database, download the images in that album, and mark the album as crawled in the database once the download succeeds
Implementation
- Import the modules
import os
import sqlite3
import sys
import urllib.request
import re
import xlwt
from bs4 import BeautifulSoup
- Request header setup
# Browser request headers
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    # "Referer": "https://www.xtongs.com/"
}
# Request headers for image downloads
img_header = [
    ('User-Agent',
     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'),
    ('Referer', 'https://xtongs.com/')
]
# Fetch a page and return the parsed soup
def get_page_content(url):
    print('Fetching page: %s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")
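get_page_content swallows errors and returns an empty soup on failure. If a flaky connection keeps interrupting the crawl, a minimal retry wrapper could look like the sketch below; the helper name, retry count and delay are my own choices, not part of the original script.
import time

# Hypothetical helper: retry get_page_content a few times before giving up.
def get_page_content_with_retry(url, retries=3, delay=2):
    for attempt in range(1, retries + 1):
        soup = get_page_content(url)
        # An empty soup means the request failed or returned nothing useful.
        if soup.find() is not None:
            return soup
        print('Attempt %d/%d failed for %s, retrying...' % (attempt, retries, url))
        time.sleep(delay)
    return soup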
- Get the total number of album pages on the site; you can simply check this in the browser, so it is no trouble (a sketch of fetching it in code follows after album_to_local below)
- Save the album information locally
# Crawl album data and save it locally
def album_to_local():
    print('Starting')
    # Base URL to crawl
    baseurl = 'https://xtongs.com/'
    save_path = '专辑.xls'
    print('Crawling: %s' % baseurl)
    # Total number of pages
    # total_page = get_album_total_page(baseurl)
    # Just check this on the site yourself
    total_page = 58
    print('Total album pages: %d' % total_page)
    print('Which page do you want to start from? Enter a number:')
    try:
        page = int(input())
    except Exception as e:
        print(e)
        print('The page number must be an integer greater than 0, no larger than the last page')
        sys.exit()
    if page < 1:
        print('Page numbers start at 1')
        page = 1
    # Crawl the album information
    save_album_list(baseurl, total_page, page)
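album_to_local hardcodes total_page = 58 and leaves get_album_total_page commented out. If you would rather read the page count from the site, a minimal sketch is below; it assumes the pager is the same #dm-fy list used later in download_images, and the index into the pager entries may need adjusting to the actual markup.
# Hypothetical helper: read the total page count from the site's pager.
def get_album_total_page(baseurl):
    soup = get_page_content(baseurl)
    # Assumes the pager is a <ul id="dm-fy"> whose entries are page links.
    pager = soup.select('#dm-fy > li > a')
    # Assumes the last real page number sits second from the end; adjust as needed.
    return int(pager[-2].text)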
# Crawl and save album information
def save_album_list(baseurl, total_page, page):
    print('Fetching album information')
    for i in range(page - 1, int(total_page)):
        data_list = []
        print(i + 1)
        # Adjust the album page URL pattern for your site
        url = baseurl + 'page_' + str(i + 1) + '.html'
        soup = get_page_content(url)
        # Adjust the album-info selector for your site
        for item in soup.find(class_="main").find_all(class_='thumbmm'):
            item = str(item)
            print(item)
            # <a href="https://xtongs.com/web/1001.html" target="_blank" title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>
            data = []
            album_id = re.findall(re_album_id, item)[0]
            album_name = re.findall(re_album_name, item)[0]
            album_link = re.findall(re_album_link, item)[0]
            cover_link = re.findall(re_cover_link, item)[0]
            data.append(album_id)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data.append('0')
            data_list.append(data)
        # Save each page of albums to the database
        save_to_db(data_list)
        # Or save everything to Excel instead
        # save_to_excel(data_list, 'test.xlsx')
    return
# Save to Excel (for testing)
def save_to_excel(data_list, save_path):
    print('Saving to Excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('专辑', cell_overwrite_ok=True)
    col = ('专辑id', '专辑名称', '专辑地址', '专辑封面')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("Row %d" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)
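As a quick standalone test of save_to_excel, something like the following works; the row values are made-up samples (only the cover URL and ID echo the example tag above), not real scraped data.
# Hypothetical sample row: [album_id, album_name, album_link, cover_link, status]
sample = [
    ['1001', 'Sample album', 'https://xtongs.com/web/1001.html', 'https://xtongs.com/cover.jpg', '0'],
]
save_to_excel(sample, 'test.xls')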
- Regular expressions for album info and images
# <a href="https://xtongs.com/web/1001.html" target="_blank" title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>
# Album ID
re_album_id = re.compile(r'.*\/(.*?)\.html')
# Album title
re_album_name = re.compile(r'title="(.*)"><img')
# Album URL
re_album_link = re.compile(r'href="(.*)" target="_blank" title')
# Album cover image
re_cover_link = re.compile(r'src="(.*?)"')
# ##########################################
# Image URL
re_image_src = re.compile(r'src="(.*?)"')
# Image ID
re_image_id = re.compile(r'.*\/(.*?)\.jpg')
# Database file path
db_path = 'spider2022.db'
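A quick way to sanity-check these patterns against the sample tag shown above before running the full crawl (assuming re and the compiled patterns are already in scope):
sample = '<a href="https://xtongs.com/web/1001.html" target="_blank" title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>'
print(re.findall(re_album_id, sample))    # ['1001']
print(re.findall(re_album_name, sample))  # the album title
print(re.findall(re_album_link, sample))  # ['https://xtongs.com/web/1001.html']
print(re.findall(re_cover_link, sample))  # the cover image URL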
- Create the SQLite database spider2022.db
-- auto-generated definition
create table album
(
id integer not null
constraint album_pk
primary key autoincrement,
album_id integer,
album_name text,
album_link text,
cover_link text,
status integer
);
create unique index album_id_index
on album (id);
-- auto-generated definition
create table failed
(
url text not null
);
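The script assumes spider2022.db already exists with these tables. One way to create it is to run the schema above through sqlite3 from Python; a minimal sketch (connect() creates the file if it is missing, and `if not exists` is added here so the snippet is safe to re-run):
import sqlite3

schema = '''
create table if not exists album (
    id integer not null constraint album_pk primary key autoincrement,
    album_id integer,
    album_name text,
    album_link text,
    cover_link text,
    status integer
);
create unique index if not exists album_id_index on album (id);
create table if not exists failed (
    url text not null
);
'''
conn = sqlite3.connect('spider2022.db')
conn.executescript(schema)
conn.close()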
- Save albums to the database
# Save album information to the database
def save_to_db(data_list):
    print('Saving to the database')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    for data in data_list:
        data[1] = data[1].replace("'", '"')
        data[1] = "'" + data[1] + "'"
        data[2] = "'" + data[2] + "'"
        data[3] = "'" + data[3] + "'"
        sql = '''
            insert into album (
            album_id, album_name, album_link, cover_link, status
            ) values (%s)
        ''' % ",".join(data)
        print(sql)
        cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
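Building the INSERT by string concatenation is what forces the quote juggling above, and it breaks on names with unusual characters. A minimal alternative sketch using sqlite3 parameter placeholders; this is not the original author's code, just a drop-in for the same table.
# Hypothetical variant of save_to_db using parameterized queries.
def save_to_db_params(data_list):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        insert into album (album_id, album_name, album_link, cover_link, status)
        values (?, ?, ?, ?, ?)
    '''
    # Each row is [album_id, album_name, album_link, cover_link, status].
    cur.executemany(sql, data_list)
    conn.commit()
    cur.close()
    conn.close()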
- Download the images
# Get the next album to crawl from the queue
def get_album():
    print('Fetching the crawl queue')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from album where status = 0 limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res
# Get the number of pages within an album
def get_total_page(url):
    print('Fetching page count')
    soup = get_page_content(url)
    # Adjust the selector for your site
    # res = soup.find_all('a', class_="page-numbers")
    # Note: download_images below uses soup.select('#dm-fy > li > a') instead
    res = soup.find_all('ul', id="dm-fy")
    print(res)
    return res[-2].text
# Mark an album in the crawl queue as crawled
def update_album(id):
    print('Updating the crawl queue: %d' % id)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        update album set status = 1 where id = %d
    ''' % id
    cur.execute(sql)
    conn.commit()
    conn.close()
    print('Updated!')
    return
# Store a failed URL in the database
def failed_to_db(img_page_url):
    print('Saving failed URL to the database')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        insert into failed (
        url
        ) values ('%s')
    ''' % img_page_url
    print(sql)
    cur.execute(sql)
    conn.commit()
    conn.close()
# Get the queue of failed downloads
def get_failed():
    print('Fetching the failed queue')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from failed limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res
# Remove a URL once it has been downloaded successfully
def failed_remove(url):
    print('Deleting the successfully downloaded URL')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        delete from failed where url = '%s'
    ''' % url
    print(sql)
    cur.execute(sql)
    conn.commit()
    conn.close()
The download method could be split into a separate download helper and an image-URL extraction helper, but I'm too lazy to do that here; staying up late is bad, so I'm off to bed. (A rough sketch of that split is shown after the full method below.)
# Download images
def download_images():
    print('Downloading images')
    while 1:
        album = get_album()
        if not album:
            print('All albums done!')
            break
        album_id = album[0][1]
        album_name = album[0][2]
        url = album[0][3]
        try:
            soup = get_page_content(url)
            # Get the number of pages in the album
            # res_page = soup.select('.pagenavi > a > span')
            res_page = soup.select('#dm-fy > li > a')
            # album_page = res_page[-2].text
            # On this site the last pager entry is an ad, so the real last page is third from the end
            album_page = res_page[-3].text
            # Folder to save this album into
            folder_path = './pic/%d_%s/' % (album_id, album_name)
        except Exception as e:
            print(e)
            print('Something went wrong! Moving on to the next album!')
            print('=' * 30)
            continue
        for i in range(0, int(album_page)):
            # Adjust the per-page URL pattern for your site
            img_page_url = url + '?page=' + str(i + 1)
            soup = get_page_content(img_page_url)
            # Adjust the element that contains the image
            res = str(soup.find(class_="entry"))
            try:
                image_src = str(re.findall(re_image_src, res)[0])
                # print(image_src)
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
                opener = urllib.request.build_opener()
                opener.addheaders = img_header
                urllib.request.install_opener(opener)
                urllib.request.urlretrieve(image_src, filepath)
            except Exception as e:
                print(e)
                print('Error on this page; skipping it for now')
                print(album_id, album_name)
                print('-' * 30)
                # Store the failing img_page_url in the database and retry it later
                failed_to_db(img_page_url)
                continue
        update_album(album[0][0])
    while 1:
        failed = get_failed()
        if not failed:
            print('Failed queue done!')
            break
        print(failed)
        # Retry a previously failed page URL (note: album_id, album_name and
        # folder_path still hold the values from the last album processed above)
        url = failed[0][0]
        soup = get_page_content(url)
        # Adjust the element that contains the image
        res = str(soup.find(class_="entry"))
        try:
            image_src = str(re.findall(re_image_src, res)[0])
            print(image_src)
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
            opener = urllib.request.build_opener()
            opener.addheaders = img_header
            urllib.request.install_opener(opener)
            urllib.request.urlretrieve(image_src, filepath)
        except Exception as e:
            print(e)
            print('Error again; skipping this URL for now')
            print(album_id, album_name)
            print('-' * 30)
            # Keep the failing URL recorded so it can be retried later
            failed_to_db(url)
            continue
        # Remove the URL from the failed queue once it succeeds
        failed_remove(url)
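As mentioned above, download_images repeats the same "find the image src, build the path, fetch it" steps twice. A minimal sketch of pulling that into one helper, assuming the regexes, img_header and get_page_content defined earlier are in scope; the helper name and its return-value convention are my own.
# Hypothetical helper: download the single image found on one album page.
# Returns True on success so the caller can decide whether to record a failure.
def download_page_image(img_page_url, folder_path):
    soup = get_page_content(img_page_url)
    res = str(soup.find(class_="entry"))
    try:
        image_src = str(re.findall(re_image_src, res)[0])
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
        opener = urllib.request.build_opener()
        opener.addheaders = img_header
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(image_src, filepath)
        return True
    except Exception as e:
        print(e)
        return False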
- Main program
def main():
    # Crawl album data and save it locally
    album_to_local()
    # Download the images
    download_images()
if __name__ == '__main__':
    main()
Scraping Douban Movie Top 250
# Author:Xtongs
# -*- coding: utf-8 -*-
# @Time: 2023/3/2 7:14 PM
# @Author: xtong
# @File: main.py
# @Software: VScode
# Target URL: https://movie.douban.com/top250
import urllib.request
import re
import xlwt
from bs4 import BeautifulSoup
# Browser request headers
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    # "Referer": "https://movie.douban.com/"
}
# These are the parts to adapt: the regexes must match the site's markup
# Movie rank
re_movie_rank = re.compile(r'<em class="">(.*?)</em>')
# Movie title
# re_album_name = re.compile(r'.*>(.*)</a></span>')
re_album_name = re.compile(r'<span class="title">(.*?)</span>')
# Movie detail page URL
# re_album_link = re.compile(r'href="(.*)" target="_blank"><img')
re_album_link = re.compile(r'<a href="(.*)">')
# Movie poster image
# re_cover_link = re.compile(r'data-original="(.*?)"')
re_cover_link = re.compile(r'src="(.*?)"')
def main():
    # Crawl the movie data and save it locally
    album_to_local()
# Crawl the movie data and save it locally
def album_to_local():
    print('Starting')
    # Base URL to crawl
    baseurl = 'https://movie.douban.com/top250'
    save_path = '豆瓣250.xls'
    print('Crawling: %s' % baseurl)
    # Crawl the movie information
    save_movie_list(baseurl, 10)
# Fetch a page and return the parsed soup
def get_page_content(url):
    print('Fetching page: %s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")
# Crawl and save movie information
def save_movie_list(baseurl, total_page):
    print('Fetching movie information')
    all_data = []
    for i in range(0, int(total_page)):
        data_list = []
        # Adjust the page URL pattern: each page lists 25 movies
        url = baseurl + '?start=' + str(i * 25)
        soup = get_page_content(url)
        # Adjust the movie-info selector for the site
        for item in soup.find(id="content").find_all(class_='item'):
            item = str(item)
            # print(item)
            data = []
            movie_rank = re.findall(re_movie_rank, item)[0]
            # print(re.findall(re_album_name, item))
            album_name = re.findall(re_album_name, item)[0]
            # print(album_name)
            album_link = re.findall(re_album_link, item)[0]
            # print(album_link)
            cover_link = re.findall(re_cover_link, item)[0]
            # print(cover_link)
            data.append(movie_rank)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data_list.append(data)
            # print(data)
        # print(data_list)
        # save_to_db(data_list)
        all_data.extend(data_list)
    print(all_data)
    save_to_excel(all_data, 'movie.xls')
    return data_list
# Save to Excel (for testing)
def save_to_excel(data_list, save_path):
    print('Saving to Excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('电影', cell_overwrite_ok=True)
    col = ('电影排名', '电影名称', '介绍地址', '电影海报')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("Row %d" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)
if __name__ == '__main__':
    main()
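To spot-check the output, the sheet can be read back with xlrd (an extra dependency, installed with `pip3 install xlrd`; this snippet is just a convenience, not part of the scraper):
import xlrd

book = xlrd.open_workbook('movie.xls')
sheet = book.sheet_by_index(0)
print('rows:', sheet.nrows)
# Print the header row and the first data row
print(sheet.row_values(0))
if sheet.nrows > 1:
    print(sheet.row_values(1))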