A Simple Python Crawler: Scraping an Image Site with urllib and bs4

Preparation

Use a Python 3 environment and install the required modules with pip3:

# Install pip3
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python3 get-pip.py
pip3 -V

# Install the modules we need
pip3 install bs4 xlwt

Crawling Approach

  • Open the site you want to crawl in a browser. Image sites usually organise content into categories, albums (galleries) and page numbers; locate these elements in the page.
  • Step 1: collect the information for every album
    • album name, album ID, album URL, album cover
  • Step 2: store the album information in a database or in an Excel file (see the example record after this list)
  • Step 3: read the URLs back from the database, download the images in each album, and mark the album as crawled in the database once it has been downloaded successfully
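
Concretely, each album collected in step 1 becomes one record that step 2 stores. A minimal sketch of that record shape, using the sample album that appears later in this post (the field names mirror the album table defined below):

# One album record as built in step 1 and stored in step 2.
album_record = [
    '1001',                                # album_id
    '自然风景之美丽夕阳',                    # album_name
    'https://xtongs.com/web/1001.html',    # album_link
    'https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg',  # cover_link
    '0',                                   # status: '0' = not crawled yet, '1' = done
]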

Implementation

  • Import modules
import os
import sqlite3
import sys
import urllib.request
import re

import xlwt
from bs4 import BeautifulSoup

  • Request headers
# Browser request headers
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    # "Referer": "https://www.xtongs.com/"
}
# Request headers used when downloading images
img_header = [
    ('User-Agent',
     'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'),
    ('Referer', 'https://xtongs.com/')
]


# Fetch a page and return it as a parsed BeautifulSoup object
def get_page_content(url):
    print('Fetching page: %s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")
  • Get the total number of album list pages on the site. The easy way is simply to glance at the page count in the browser (a sketch of fetching it in code follows the next code block).

  • Save the album information locally

# Scrape the album metadata and store it locally
def album_to_local():
    print('Starting')
    # Base URL to crawl
    baseurl = 'https://xtongs.com/'
    save_path = '专辑.xls'
    print('Crawl URL: %s' % baseurl)
    # Total number of list pages
    # total_page = get_album_total_page(baseurl)
    # Just check this on the site yourself
    total_page = 58
    print('Total album pages: %d' % total_page)
    print('Which page should we start from? Enter a number:')
    try:
        page = int(input())
    except Exception as e:
        print(e)
        print('The page number must be an integer greater than 0 and no larger than the last page')
        sys.exit()
    if page < 1:
        print('Page numbers start at 1')
        page = 1
    # Scrape the album information
    save_album_list(baseurl, total_page, page)
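
The get_album_total_page() call above is commented out because it is easier to read the page count off the site directly. If you do want to fetch it in code, a minimal sketch could look like the following; it assumes the list pages use the same '#dm-fy' pagination block as the album pages later in this post, so verify the selector against the real markup:

# Assumed helper: fetch the number of list pages from the site's pagination links.
def get_album_total_page(baseurl):
    soup = get_page_content(baseurl)
    links = soup.select('#dm-fy > li > a')   # pagination links (assumed selector)
    # Ads or "next" arrows may appear among the links, so keep only numeric text
    # and take the largest number as the total page count.
    pages = [int(a.text) for a in links if a.text.strip().isdigit()]
    return max(pages) if pages else 1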

# Scrape and save the album information
def save_album_list(baseurl, total_page, page):
    print('Fetching album information')
    for i in range(page - 1, int(total_page)):
        data_list = []
        print(i + 1)
        # Adjust the list-page URL pattern for the target site
        url = baseurl + 'page_' + str(i + 1) + '.html'
        soup = get_page_content(url)
        # Adjust how the album entries are located for your site (find_all / find_all_next)
        for item in soup.find(class_="main").find_all(class_='thumbmm'):
            item = str(item)
            print(item)
            # <a href="https://xtongs.com/web/1001.html" target="_blank" title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>
            data = []
            album_id = re.findall(re_album_id, item)[0]
            album_name = re.findall(re_album_name, item)[0]
            album_link = re.findall(re_album_link, item)[0]
            cover_link = re.findall(re_cover_link, item)[0]
            data.append(album_id)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data.append('0')
            data_list.append(data)
        # Save each page's albums to the database
        save_to_db(data_list)
        # Or save them all to Excel instead
        # save_to_excel(data_list, 'test.xlsx')
    return


# Save to Excel (test helper)
def save_to_excel(data_list, save_path):
    print('Saving to Excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('专辑', cell_overwrite_ok=True)
    col = ('专辑id', '专辑名称', '专辑地址', '专辑封面')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("Row %d" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)
  • Regular expressions for album info and images
# Sample album entry:
# <a href="https://xtongs.com/web/1001.html" target="_blank" title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>

# Album ID
re_album_id = re.compile(r'.*\/(.*?)\.html')

# Album title
re_album_name = re.compile(r'title="(.*)"><img')

# Album link
re_album_link = re.compile(r'href="(.*)" target="_blank" title')

# Album cover image
re_cover_link = re.compile(r'src="(.*?)"')

# ##########################################

# Image URL
re_image_src = re.compile(r'src="(.*?)"')

# Image ID
re_image_id = re.compile(r'.*\/(.*?)\.jpg')


# Database file path
db_path = 'spider2022.db'
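
As a quick sanity check, you can run the album regexes against the sample anchor tag shown above; a minimal sketch (it relies on the regex definitions just defined, and the expected values in the comments come from that sample):

# Verify the album regexes against the sample <a> tag.
sample = ('<a href="https://xtongs.com/web/1001.html" target="_blank" '
          'title="自然风景之美丽夕阳"><img src="https://xtongs.com/static/24/03/db/'
          'aHR0cHM6Ly9udXlvdTguY2MvemJfdXNlcnMvdXBsb2FkLzIwMjIvMDQvMjAyMjA0MjQxNjUwNzY2MzgzMzcwNDgxLmpwZw-190-285-index-a.jpg"/></a>')

print(re.findall(re_album_id, sample)[0])    # 1001
print(re.findall(re_album_name, sample)[0])  # 自然风景之美丽夕阳
print(re.findall(re_album_link, sample)[0])  # https://xtongs.com/web/1001.html
print(re.findall(re_cover_link, sample)[0])  # the cover image URL from the img src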

  • Create the sqlite database spider2022.db
-- auto-generated definition
create table album
(
    id         integer not null
        constraint album_pk
            primary key autoincrement,
    album_id   integer,
    album_name text,
    album_link text,
    cover_link text,
    status     integer
);

create unique index album_id_index
    on album (id);

-- auto-generated definition
create table failed
(
    url string not null
);
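
The post does not show how the database file itself is created. A minimal sketch that runs the schema above from Python (the init_db helper is my own addition, not part of the original code):

import sqlite3

# One-off initialisation: create spider2022.db with the schema shown above.
def init_db(db_path='spider2022.db'):
    conn = sqlite3.connect(db_path)
    conn.executescript('''
        create table if not exists album (
            id integer not null constraint album_pk primary key autoincrement,
            album_id integer,
            album_name text,
            album_link text,
            cover_link text,
            status integer
        );
        create unique index if not exists album_id_index on album (id);
        create table if not exists failed (
            url string not null
        );
    ''')
    conn.commit()
    conn.close()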

  • Save the albums to the database
# Save album information to the database
def save_to_db(data_list):
    print('Saving to the database')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()

    for data in data_list:
        data[1] = data[1].replace("'", '"')
        data[1] = "'" + data[1] + "'"
        data[2] = "'" + data[2] + "'"
        data[3] = "'" + data[3] + "'"
        sql = '''
            insert into album (
                album_id, album_name, album_link, cover_link, status
            ) values (%s)
        ''' % ",".join(data)
        print(sql)
        cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
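
Building the SQL by string concatenation is what forces the quote-juggling above, and it breaks on unexpected characters. A sketch of the same insert using sqlite3 parameter placeholders instead (my own variation, not from the original post):

# Same insert, but with '?' placeholders so sqlite3 handles the quoting itself.
def save_to_db_params(data_list, db_path='spider2022.db'):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.executemany(
        'insert into album (album_id, album_name, album_link, cover_link, status) '
        'values (?, ?, ?, ?, ?)',
        data_list,  # each entry: [album_id, album_name, album_link, cover_link, '0']
    )
    conn.commit()
    conn.close()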
  • Download the images
# Get the next pending album from the crawl queue
def get_album():
    print('Fetching the crawl queue')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from album where status = 0 limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res


# Get the number of pages inside an album
def get_total_page(url):
    print('Fetching page count')
    soup = get_page_content(url)
    # Adjust the selector for the target site
    # res = soup.find_all('a', class_="page-numbers")
    res = soup.find_all('ul', id="dm-fy")
    print(res)
    breakpoint()  # debugging pause while adjusting the selector
    return res[-2].text


# Mark an album in the crawl queue as done
def update_album(id):
    print('Updating crawl queue: %d' % id)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        update album set status = 1 where id = %d
    ''' % id
    cur.execute(sql)
    conn.commit()
    conn.close()
    print('Updated!')
    return


# Save a failed download URL to the database
def failed_to_db(img_page_url):
    print('Saving failed url to the database')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        insert into failed (
            url
        ) values ('%s')
    ''' % img_page_url
    print(sql)
    cur.execute(sql)
    conn.commit()


# Get the queue of failed downloads
def get_failed():
    print('Fetching the failed queue')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from failed limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res


# Remove a URL once it has been downloaded successfully
def failed_remove(url):
    print('Removing successfully downloaded url')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        delete from failed where url = '%s'
    ''' % url
    print(sql)
    cur.execute(sql)
    conn.commit()
The download routine could be split into a separate download helper and an image-URL extraction helper, but I was too lazy to do that here; staying up late is bad for you, so off to bed. A rough sketch of that split is shown below, before the full function.
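
A minimal sketch of the split, reusing get_page_content, the regexes, and img_header defined above (the helper names get_image_src and download_one_image are mine, not from the original code):

# Extract the image URL from an image page (assumed helper).
def get_image_src(img_page_url):
    soup = get_page_content(img_page_url)
    res = str(soup.find(class_="entry"))   # adjust the container class for the site
    return str(re.findall(re_image_src, res)[0])


# Download a single image into folder_path (assumed helper).
def download_one_image(image_src, folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
    opener = urllib.request.build_opener()
    opener.addheaders = img_header
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(image_src, filepath)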

# Download the images
def download_images():
    print('Downloading images')
    while 1:
        album = get_album()
        if not album:
            print('All done!')
            break
        album_id = album[0][1]
        album_name = album[0][2]
        url = album[0][3]

        try:
            soup = get_page_content(url)

            # Get the page count
            # res_page = soup.select('.pagenavi > a > span')
            res_page = soup.select('#dm-fy > li > a')
            # album_page = res_page[-2].text
            # On this site the last entries are ads, so the page count is taken a bit further from the end
            album_page = res_page[-3].text
            # Folder name for this album
            folder_path = './pic/%d_%s/' % (album_id, album_name)
        except Exception as e:
            print('Something went wrong! Moving on to the next loop!')
            print('=' * 30)
            continue

        for i in range(0, int(album_page)):
            # Adjust the image-page URL pattern for the target site
            img_page_url = url + '?page=' + str(i + 1)

            soup = get_page_content(img_page_url)
            # Adjust the element that contains the image
            res = str(soup.find(class_="entry"))

            try:
                image_src = str(re.findall(re_image_src, res)[0])
                # print(image_src)
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]

                opener = urllib.request.build_opener()
                opener.addheaders = img_header
                urllib.request.install_opener(opener)

                urllib.request.urlretrieve(image_src, filepath)
            except Exception as e:
                print(e)
                print('An error occurred, skipping this one; the album can be re-crawled later')
                print(album_id, album_name)
                print('-' * 30)
                # Store the failed img_page_url in the database and retry it later
                failed_to_db(img_page_url)
                continue

        update_album(album[0][0])

    while 1:
        failed = get_failed()
        if not failed:
            print('All done!')
            break

        print(failed)
        # Retry the failed image page (fixed: take the URL from the failed row)
        url = failed[0][0]

        soup = get_page_content(url)
        # Adjust the element that contains the image
        res = str(soup.find(class_="entry"))

        try:
            image_src = str(re.findall(re_image_src, res)[0])
            print(image_src)
            # Note: folder_path still refers to the last album processed above
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]

            opener = urllib.request.build_opener()
            opener.addheaders = img_header
            urllib.request.install_opener(opener)

            urllib.request.urlretrieve(image_src, filepath)
        except Exception as e:
            print(e)
            # The URL stays in the failed table and is retried (a permanently
            # broken URL will keep looping here)
            print('-' * 30)
            continue

        # Remove the URL from the failed queue
        failed_remove(url)

  • Main program

def main():
    # Scrape the album metadata to the local database
    album_to_local()
    # Download the images
    download_images()


if __name__ == '__main__':
    main()

Scraping Douban Movie Top 250

# Author:Xtongs
# -*- coding = utf-8 -*-
# @Time: 2023/3/2 7:14 PM
# @Author: xtong
# @File: main.py
# @Software: VScode

# Target URL: https://movie.douban.com/top250

import urllib.request
import re

import xlwt
from bs4 import BeautifulSoup

# Browser request headers
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
    # "Referer": "https://movie.douban.com/"
}

# The parts that need adjusting: make sure the regexes match the site's markup

# Movie rank
re_movie_rank = re.compile(r'<em class="">(.*?)</em>')

# Movie title
# re_album_name = re.compile(r'.*>(.*)</a></span>')
re_album_name = re.compile(r'<span class="title">(.*?)</span>')

# Movie link
# re_album_link = re.compile(r'href="(.*)" target="_blank"><img')
re_album_link = re.compile(r'<a href="(.*)">')

# Movie poster image
# re_cover_link = re.compile(r'data-original="(.*?)"')
re_cover_link = re.compile(r'src="(.*?)"')


def main():
    # Scrape the movie data locally
    album_to_local()


# Scrape the movie data locally
def album_to_local():
    print('Starting')
    # Base URL to crawl
    baseurl = 'https://movie.douban.com/top250'
    save_path = '豆瓣250.xls'
    print('Crawl URL: %s' % baseurl)
    # Scrape the movie information
    save_movie_list(baseurl, 10)


# Fetch a page and return it as a parsed BeautifulSoup object
def get_page_content(url):
    print('Fetching page: %s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")


# Scrape and save the movie information
def save_movie_list(baseurl, total_page):
    print('Fetching movie information')
    all_data = []
    for i in range(0, int(total_page)):
        data_list = []
        # Each list page shows 25 movies
        url = baseurl + '?start=' + str(i * 25)
        soup = get_page_content(url)
        # Adjust the selectors for the site's markup (find_all / find_all_next)
        for item in soup.find(id="content").find_all(class_='item'):
            item = str(item)
            # print(item)
            data = []
            movie_rank = re.findall(re_movie_rank, item)[0]
            album_name = re.findall(re_album_name, item)[0]
            album_link = re.findall(re_album_link, item)[0]
            cover_link = re.findall(re_cover_link, item)[0]
            data.append(movie_rank)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data_list.append(data)
        all_data.extend(data_list)
    print(all_data)
    save_to_excel(all_data, 'movie.xls')
    return data_list


# Save to Excel (test helper)
def save_to_excel(data_list, save_path):
    print('Saving to Excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('电影', cell_overwrite_ok=True)
    col = ('电影排名', '电影名称', '介绍地址', '电影海报')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("Row %d" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)


if __name__ == '__main__':
    main()

References