使用生产者与消费者模式爬取腾讯招聘网的招聘信息
腾讯招聘:https://careers.tencent.com/。本文内容:1. 找接口;2. 生产者与消费者模式分析;3. 生产者;4. 消费者;5. 主函数。
·
腾讯招聘:https://careers.tencent.com/
目录
1.找接口
我们去腾讯招聘网站去找有关python的招聘信息,在搜索框输入python,接口变成:
https://careers.tencent.com/search.html?keyword=python
我们用这个接口直接去请求网页资源的话,会发现没有数据,只抓到了网页的框架
代码如下:
import requests
from lxml import etree

search_url = 'https://careers.tencent.com/search.html?keyword=python'
request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}

# Fetch the search page. Only the HTML skeleton comes back, because the
# actual job listings are loaded afterwards by an ajax request.
page_html = requests.get(url=search_url, headers=request_headers).content.decode('utf-8')

# Dump the response to disk so it can be opened in a browser for inspection.
with open('job.html', 'w', encoding='utf-8') as fp:
    fp.write(page_html)
我们上述代码中把请求到网页的内容保存到了job.html
程序运行完之后,点开job.html 在浏览器中打开:效果如下
这种情况很有可能是ajax请求,我们需要重新去找接口
打开F12 network-->XHR 找到如下:
找一下请求头的链接:
这个接口里面有很多参数,我们可以删去没有用的,pageIndex 是用来传页码的,我们可以直接传页码
https://careers.tencent.com/tencentcareer/api/post/Query?keyword=python&pageIndex={}&pageSize=10
重新开始请求,ajax请求响应回来的是json数据格式
2.生产者与消费者模式分析
我们在整个过程需要请求接口,然后再解析数据
生产者用来请求接口,消费者用来解析数据
3.生产者
从page_queue里面取出来page,拼接好url
# 1.写子类 需要继承父类Thread类 复写run()方法
# 1. Producer: subclass Thread and override run()
class Thread_producer(threading.Thread):
    """Producer thread.

    Pulls page numbers from ``page_queue``, requests the corresponding
    Tencent-careers JSON endpoint, and puts the raw response text onto
    the global ``content_queue`` for the consumer threads to parse.
    """

    def __init__(self, name, page_queue):
        threading.Thread.__init__(self)
        # Shared task queue holding the page numbers still to be fetched.
        self.page_queue = page_queue
        self.name = name

    def run(self):
        from queue import Empty  # stdlib; local import keeps the file header untouched
        time_start = time.time()
        print(self.name, "开始时间:", time_start)
        while True:
            # BUG FIX: the original checked empty() and then called get().
            # With several producers another thread can drain the queue
            # between those two calls, so get() would block forever.
            # get(block=False) + Empty is race-free.
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                break
            print('当前的线程:', self.name, "将要从队列中取任务")
            print(self.name, "将要从队列中取任务是:", page)
            url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
                   '?keyword=python&pageIndex={}&pageSize=10').format(page)
            self.get_content(url=url)
            print(self.name, '完成的任务是', page)
        time_end = time.time()
        print(self.name, '完成时间是:', time_end)
        print(self.name, '耗时是:', time_end - time_start)

    def get_content(self, url):
        """Fetch *url* and push the decoded response body onto content_queue."""
        # BUG FIX: the original key was 'user - agent' (with spaces), which is
        # not a valid header name, and the UA string itself was mangled with
        # extra spaces — the custom header was effectively never applied.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
        }
        response = requests.get(url=url, headers=headers).content.decode('utf-8')
        content_queue.put(response)
4.消费者
class Thread_customer(threading.Thread):
    """Consumer thread.

    Pops raw JSON responses off the global ``content_queue``, parses the
    job postings and appends them to ``tecent_job.txt``.  Exits once the
    queue is empty AND the global ``flag`` signals that every producer
    has finished.
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        from queue import Empty  # stdlib; local import keeps the file header untouched
        time_start = time.time()
        print(self.name, "开始时间:", time_start)
        while True:
            # Stop only when nothing is queued AND the producers are done;
            # an empty queue alone may just mean producers are still fetching.
            if content_queue.empty() and flag:
                break
            try:
                response = content_queue.get(block=False)
            except Empty:
                # Momentarily empty while producers still run — retry.
                continue
            # BUG FIX: the original printed an undefined `page` variable here
            # (it accidentally resolved to the leftover module-level loop
            # variable, always showing 60) and a bare `except: pass` hid the
            # resulting confusion along with any real parse errors.
            print('当前的线程:', self.name, "取到一个响应,开始解析")
            try:
                self.get_data(response)
            except Exception as e:
                # One malformed response must not kill the thread, but we
                # report it instead of silently discarding it.
                print(self.name, '解析失败:', e)
        time_end = time.time()
        print(self.name, '完成时间是:', time_end)
        print(self.name, '耗时是:', time_end - time_start)

    def get_data(self, response):
        """Parse one JSON response and append each posting to tecent_job.txt."""
        # Convert the JSON string into native Python structures.
        content_list = json.loads(response)['Data']['Posts']
        for content in content_list:
            info = {
                '岗位名称': content['RecruitPostName'],
                '工作国家': content['CountryName'],
                '工作地区': content['LocationName'],
                '职责': content['Responsibility'],
                '详情链接': content['PostURL'],
            }
            # Append each record; 'a' mode so concurrent consumers add on.
            with open('tecent_job.txt', 'a', encoding='utf-8') as fp:
                fp.write(str(info))
5.主函数
# Shared state read by the consumer threads (must stay module-level globals).
flag = False          # becomes True once all producers have finished
content_queue = Queue()  # raw JSON responses handed from producers to consumers

if __name__ == '__main__':
    t_start = time.time()
    print("主线程开始时间:", t_start)

    # 1. Build the task queue: one entry per result page (pages 1..60).
    page_queue = Queue()
    for page in range(1, 61):
        page_queue.put(page)

    # 2. Start the producer (fetcher) threads.
    #    join() must NOT be called inside this loop — that would serialize
    #    the threads and degrade the program to single-threaded behavior.
    producer_thread = []
    for tag in ['p1', 'p2', 'p3']:
        worker = Thread_producer(tag, page_queue)
        worker.start()
        producer_thread.append(worker)

    # 3. Start the consumer (parser) threads.
    customer_thread = []
    for tag in ['c1', 'c2', 'c3']:
        worker = Thread_customer(tag)
        worker.start()
        customer_thread.append(worker)

    # Wait for every producer, then raise `flag` so consumers know no more
    # work will arrive, then wait for the consumers to drain the queue.
    for worker in producer_thread:
        worker.join()
    flag = True
    for worker in customer_thread:
        worker.join()

    # Report total elapsed time for the whole crawl.
    t_end = time.time()
    print(t_end)
    print('完成时间是:', t_end - t_start)
    # Three producer threads crawl all 60 pages between them.
更多推荐
已为社区贡献3条内容
所有评论(0)