A Simple Python Crawler: Scraping an Image Site with urllib and bs4

Xtong · 2022-05-12 · 2025-01-05

Preparation

Use a Python 3 environment and install the required modules with pip3:
```bash
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python3 get-pip.py
pip3 -V
pip3 install bs4 xlwt
```
Crawling approach
Open the target site in a browser. Image sites generally have three things: categories, albums, and page numbers. Locate these elements in the browser first.
Step 1: collect the information for every album.
Step 2: store the album information in a database or in an Excel file.
Step 3: read the album URLs from the database, download the images in each album, and mark the album as crawled in the database once its download succeeds.
Implementation
```python
import os
import re
import sqlite3
import sys
import urllib.request

import xlwt
from bs4 import BeautifulSoup
```
```python
head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
}

img_header = [
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36'),
    ('Referer', 'https://xtongs.com/'),
]


def get_page_content(url):
    print('获取页面内容:%s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")
```
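As a quick sanity check, get_page_content can be called on a single listing page before wiring up the rest. The URL below is only an example that follows the page_N.html pattern used later:

```python
# Quick check of get_page_content (page_1.html follows the listing-page
# pattern used in save_album_list below).
soup = get_page_content('https://xtongs.com/page_1.html')
print(soup.title)                # page <title>, if the request succeeded
print(len(soup.find_all('a')))   # rough count of links on the page
```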
```python
def album_to_local():
    print('开始')
    baseurl = 'https://xtongs.com/'
    save_path = '专辑.xls'
    print('爬取地址:%s' % baseurl)
    total_page = 58
    print('专辑总页数:%d' % total_page)
    print('要从第几页开始?请输入数字:')
    try:
        page = int(input())
    except Exception as e:
        print(e)
        print('页码必须是大于0的整数,不要超过最大页码')
        sys.exit()
    if page < 1:
        print('页码从1开始')
        page = 1
    save_album_list(baseurl, total_page, page)
```
```python
def save_album_list(baseurl, total_page, page):
    print('获取图集信息')
    for i in range(page - 1, int(total_page)):
        data_list = []
        print(i + 1)
        url = baseurl + 'page_' + str(i + 1) + '.html'
        soup = get_page_content(url)
        for item in soup.find(class_="main").find_all(class_='thumbmm'):
            item = str(item)
            print(item)
            data = []
            album_id = re.findall(re_album_id, item)[0]
            album_name = re.findall(re_album_name, item)[0]
            album_link = re.findall(re_album_link, item)[0]
            cover_link = re.findall(re_cover_link, item)[0]
            data.append(album_id)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data.append('0')
            data_list.append(data)
        save_to_db(data_list)
    return


def save_to_excel(data_list, save_path):
    print('保存excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('专辑', cell_overwrite_ok=True)
    col = ('专辑id', '专辑名称', '专辑地址', '专辑封面')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("第%d条" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)
```
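As written, the album script never calls save_to_excel; it goes straight to the database. If you prefer the Excel route from step 2, a variant of save_album_list could collect every row and hand them to save_to_excel at the end. A sketch of that (the name save_album_list_to_excel is made up for illustration):

```python
def save_album_list_to_excel(baseurl, total_page, page, save_path):
    # Hypothetical variant of save_album_list: collect all rows first,
    # then write a single Excel file instead of inserting into sqlite.
    all_rows = []
    for i in range(page - 1, int(total_page)):
        url = baseurl + 'page_' + str(i + 1) + '.html'
        soup = get_page_content(url)
        for item in soup.find(class_="main").find_all(class_='thumbmm'):
            item = str(item)
            all_rows.append([
                re.findall(re_album_id, item)[0],
                re.findall(re_album_name, item)[0],
                re.findall(re_album_link, item)[0],
                re.findall(re_cover_link, item)[0],
            ])
    save_to_excel(all_rows, save_path)
```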
```python
# Regular expressions for pulling fields out of the listing and album markup.
re_album_id = re.compile(r'.*\/(.*?)\.html')                       # album id from its page url
re_album_name = re.compile(r'title="(.*)"><img')                   # album title attribute
re_album_link = re.compile(r'href="(.*)" target="_blank" title')   # album page url
re_cover_link = re.compile(r'src="(.*?)"')                         # cover image url
re_image_src = re.compile(r'src="(.*?)"')                          # image url inside an album page
re_image_id = re.compile(r'.*\/(.*?)\.jpg')                        # image id from its .jpg url

db_path = 'spider2022.db'
```
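To see what those patterns extract, here is a made-up thumbnail snippet in the shape the regexes expect; the real markup on the site may differ:

```python
# Made-up thumbnail markup, only to illustrate the regexes above.
sample = ('<a href="https://xtongs.com/12345.html" target="_blank" title="示例图集">'
          '<img src="https://xtongs.com/cover/12345.jpg"/></a>')
print(re.findall(re_album_id, sample))    # ['12345']
print(re.findall(re_album_name, sample))  # ['示例图集']
print(re.findall(re_album_link, sample))  # ['https://xtongs.com/12345.html']
print(re.findall(re_cover_link, sample))  # ['https://xtongs.com/cover/12345.jpg']
```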
Create the sqlite database spider2022.db:
```sql
create table album(
    id integer not null
        constraint album_pk
            primary key autoincrement,
    album_id integer,
    album_name text,
    album_link text,
    cover_link text,
    status integer
);

create unique index album_id_index on album (id);

create table failed(
    url string not null
);
```
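If you'd rather not open a sqlite3 shell, the same schema can be created from Python; a small sketch reusing the db file name from above, with executescript running all statements in one call:

```python
import sqlite3

# Create the album/failed tables from Python instead of the sqlite3 shell.
schema = '''
create table if not exists album(
    id integer not null constraint album_pk primary key autoincrement,
    album_id integer,
    album_name text,
    album_link text,
    cover_link text,
    status integer
);
create unique index if not exists album_id_index on album (id);
create table if not exists failed(
    url string not null
);
'''

conn = sqlite3.connect('spider2022.db')
conn.executescript(schema)
conn.close()
```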
```python
def save_to_db(data_list):
    print('保存到数据库')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    for data in data_list:
        data[1] = data[1].replace("'", '"')
        data[1] = "'" + data[1] + "'"
        data[2] = "'" + data[2] + "'"
        data[3] = "'" + data[3] + "'"
        sql = '''
            insert into album (
                album_id, album_name, album_link, cover_link, status
            ) values (%s)
        ''' % ",".join(data)
        print(sql)
        cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
```
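The manual quote handling above is fragile: an album name containing both kinds of quote would still break the SQL string. sqlite3 supports ? placeholders that do the quoting for you; a sketch of an equivalent helper (the name save_to_db_param is made up, not part of the original script):

```python
def save_to_db_param(data_list):
    # Equivalent insert using ? placeholders, so sqlite3 handles quoting
    # and the replace()/manual escaping above is unnecessary.
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.executemany(
        'insert into album (album_id, album_name, album_link, cover_link, status)'
        ' values (?, ?, ?, ?, ?)',
        [tuple(row) for row in data_list],
    )
    conn.commit()
    cur.close()
    conn.close()
```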
```python
def get_album():
    print('获取待爬取队列')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from album where status = 0 limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res


def get_total_page(url):
    print('获取页数')
    soup = get_page_content(url)
    res = soup.find_all('ul', id="dm-fy")
    print(res)
    # breakpoint()  # debug leftover
    return res[-2].text


def update_album(id):
    print('更新待爬取队列:%d' % id)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        update album set status = 1 where id = %d
    ''' % id
    cur.execute(sql)
    conn.commit()
    conn.close()
    print('更新成功!')
    return


def failed_to_db(img_page_url):
    print('失败 url 保存到数据库')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        insert into failed (url) values ('%s')
    ''' % img_page_url
    print(sql)
    cur.execute(sql)
    conn.commit()


def get_failed():
    print('获取失败队列')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        select * from failed limit 1
    '''
    cur.execute(sql)
    res = cur.fetchall()
    cur.close()
    conn.close()
    return res


def failed_remove(url):
    print('删除下载成功的 url')
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    sql = '''
        delete from failed where url = '%s'
    ''' % url
    print(sql)
    cur.execute(sql)
    conn.commit()
```
The image-download routine could be split into a separate download helper and an image-URL extraction helper, but I'm feeling lazy here and won't bother; staying up late is bad, so off to bed early. (A rough sketch of that split is included after the code below.)
```python
def download_images():
    print('下载图片')
    # Work through the album queue: status = 0 means "not crawled yet".
    while 1:
        album = get_album()
        if not album:
            print('全部完成!')
            break
        album_id = album[0][1]
        album_name = album[0][2]
        url = album[0][3]
        try:
            soup = get_page_content(url)
            res_page = soup.select('#dm-fy > li > a')
            album_page = res_page[-3].text
            folder_path = './pic/%d_%s/' % (album_id, album_name)
        except Exception as e:
            print('出错了!继续下次循环!')
            print('=' * 30)
            continue
        for i in range(0, int(album_page)):
            img_page_url = url + '?page=' + str(i + 1)
            soup = get_page_content(img_page_url)
            res = str(soup.find(class_="entry"))
            try:
                image_src = str(re.findall(re_image_src, res)[0])
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
                opener = urllib.request.build_opener()
                opener.addheaders = img_header
                urllib.request.install_opener(opener)
                urllib.request.urlretrieve(image_src, filepath)
            except Exception as e:
                print(e)
                print('出现错误,跳过本次,重新爬取整个专辑')
                print(album_id, album_name)
                print('-' * 30)
                failed_to_db(img_page_url)
                continue
        update_album(album[0][0])

    # Retry everything that ended up in the failed table.
    # Note: retried images are saved into the last folder_path used above.
    while 1:
        failed = get_failed()
        if not failed:
            print('全部完成!')
            break
        print(failed)
        url = failed[0][0]
        soup = get_page_content(url)
        res = str(soup.find(class_="entry"))
        try:
            image_src = str(re.findall(re_image_src, res)[0])
            print(image_src)
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
            opener = urllib.request.build_opener()
            opener.addheaders = img_header
            urllib.request.install_opener(opener)
            urllib.request.urlretrieve(image_src, filepath)
        except Exception as e:
            print(e)
            print('出现错误,跳过本次')
            print('-' * 30)
            # the url is already in the failed table, so don't re-insert it
            continue
        failed_remove(url)
```
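As mentioned above, the two pieces download_images repeats could be pulled out into their own helpers. Roughly something like this; a sketch only, the helper names are made up:

```python
def extract_image_src(img_page_url):
    # Hypothetical helper: pull the first image URL out of one album page.
    soup = get_page_content(img_page_url)
    res = str(soup.find(class_="entry"))
    return str(re.findall(re_image_src, res)[0])


def download_image(image_src, folder_path):
    # Hypothetical helper: save a single image into folder_path,
    # sending the Referer header via the installed opener.
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    filepath = folder_path + '%s.jpg' % re.findall(re_image_id, image_src)[0]
    opener = urllib.request.build_opener()
    opener.addheaders = img_header
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(image_src, filepath)
```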
```python
def main():
    album_to_local()
    download_images()


if __name__ == '__main__':
    main()
```
Scraping the Douban Movie Top 250

```python
import re
import urllib.request

import xlwt
from bs4 import BeautifulSoup

head = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36",
}

re_movie_rank = re.compile(r'<em class="">(.*?)</em>')
re_album_name = re.compile(r'<span class="title">(.*?)</span>')
re_album_link = re.compile(r'<a href="(.*)">')
re_cover_link = re.compile(r'src="(.*?)"')


def main():
    album_to_local()


def album_to_local():
    print('开始')
    baseurl = 'https://movie.douban.com/top250'
    save_path = '豆瓣250.xls'
    print('爬取地址:%s' % baseurl)
    save_movie_list(baseurl, 10)


def get_page_content(url):
    print('获取页面内容:%s' % url)
    request = urllib.request.Request(url, headers=head)
    html = ''
    try:
        response = urllib.request.urlopen(request, timeout=5)
        html = response.read().decode()
    except Exception as e:
        print(e)
    return BeautifulSoup(html, "html.parser")


def save_movie_list(baseurl, total_page):
    print('获取电影信息')
    all_data = []
    for i in range(0, int(total_page)):
        data_list = []
        url = baseurl + '?start=' + str(i * 25)
        soup = get_page_content(url)
        for item in soup.find(id="content").find_all(class_='item'):
            item = str(item)
            data = []
            movie_rank = re.findall(re_movie_rank, item)[0]
            album_name = re.findall(re_album_name, item)[0]
            album_link = re.findall(re_album_link, item)[0]
            cover_link = re.findall(re_cover_link, item)[0]
            data.append(movie_rank)
            data.append(album_name)
            data.append(album_link)
            data.append(cover_link)
            data_list.append(data)
        all_data.extend(data_list)
    print(all_data)
    save_to_excel(all_data, 'movie.xls')
    return all_data


def save_to_excel(data_list, save_path):
    print('保存excel')
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('电影', cell_overwrite_ok=True)
    col = ('电影排名', '电影名称', '介绍地址', '电影海报')
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, len(data_list)):
        print("第%d条" % i)
        data = data_list[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(save_path)


if __name__ == '__main__':
    main()
```