欢迎访问我的网站与公众号!点击与扫码即可进入,谢谢关注!

线程池基本使用

原则:线程池适合处理阻塞且耗时的操作(例如网络请求、文件读写)。

1
2
3
4
5
6
7
8
# Import the thread-backed Pool class; multiprocessing.dummy exposes the
# multiprocessing Pool API on top of threads.
from multiprocessing.dummy import Pool

# Create a pool with 4 worker threads.
pool = Pool(4)
try:
    # map(func, iterable) calls get_content once per element of urls and
    # blocks until every call has finished. It returns the individual return
    # values as a list, in the same order as urls.
    page_contets = pool.map(get_content, urls)
finally:
    # Stop accepting new work and wait for the worker threads to exit, even
    # if one of the calls raised.
    pool.close()
    pool.join()

实例1:梨视频爬取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# ... import the required packages here (presumably requests and lxml's etree)
# ... request the list page and keep its HTML in page_text
import re
from urllib.parse import urljoin

tree = etree.HTML(page_text)
# Select the <li> children of the <ul> whose id is "listvideoListUl".
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls = []
for li in li_list:
    # Build the detail-page link and the output file name for each video.
    # urljoin yields a valid URL whether or not the href starts with '/'.
    detail_url = urljoin('https://www.pearvideo.com/', li.xpath('./div/a/@href')[0])
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # Request the detail page.
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # Extract the video address from the detail page with a regular
    # expression (inspect the page's XHR traffic and response to locate it).
    # NOTE(review): confirm this pattern against the current page source; an
    # IndexError here means the pattern did not match.
    video_url = re.findall('src="(.*?)",vdoUrl', detail_page_text)[0]
    # Remember the file name together with the address to download.
    urls.append({'name': name, 'url': video_url})

# Request one video as binary data and store it on disk.
def get_video_content(dic):
    """Download the video at dic['url'] and write it to the file dic['name'].

    Uses the module-level ``requests`` import and ``headers`` mapping.

    Args:
        dic: Mapping with a 'url' key (address to request) and a 'name' key
            (path of the file to write).
    """
    url = dic['url']
    # .content is the response body as raw bytes, hence the binary file mode.
    data = requests.get(url=url, headers=headers).content
    with open(dic['name'], 'wb') as f:
        f.write(data)

# Download the videos with 4 worker threads.
pool = Pool(4)
try:
    # Calls get_video_content once per entry of urls; blocks until all the
    # calls have finished.
    pool.map(get_video_content, urls)
finally:
    # Always shut the pool down, even when a download raised.
    pool.close()
    # The main thread waits for the worker threads to finish before moving on.
    pool.join()