Python 多线程爬取网页名称并写入 Excel
摘要：使用 pandas 读取 Excel 中的网址列表，再用 threading 开启多个线程，配合 requests 和 BeautifulSoup 抓取每个网页的标题。完整代码见下文。
·
参考文档
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import threading
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
# In[2]:
# Load the spreadsheet that lists the sites to look up.
df = pd.read_excel("网站对应名字.xlsx")

# The URLs come from the sheet's "URL" column.
sites = df["URL"]
data_count = len(sites)

# Worker configuration: one Thread object per entry of n_loops is kept
# in `threads`.
thread_count = 16
threads = []
n_loops = range(thread_count)

# One result slot per URL; workers fill these in by index.
names = [None for _ in range(data_count)]
# In[18]:
def get_url_title(site, timeout=10):
    """Return the text of the <title> element of the page at ``site``.

    Args:
        site: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Without a timeout a single unresponsive host would block the
            calling worker thread indefinitely.

    Returns:
        The title text, or the placeholder "网址有误" when the page cannot
        be downloaded or parsed, or has no <title> element.
    """
    try:
        response = requests.get(site, timeout=timeout)
        # Name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, "html.parser")
        # find() returns None when there is no <title>; the resulting
        # AttributeError is handled below like any other failure.
        return soup.find("title").text
    except Exception:
        # Deliberate best-effort: any failure maps to the placeholder.
        # Exception (not BaseException) lets KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed.
        return "网址有误"
# In[19]:
# Each worker starts from the given index.
def write_title(start):
    """Fetch titles for every ``thread_count``-th site, beginning at ``start``.

    Reads the module-level ``sites``, ``data_count`` and ``thread_count``,
    stores each title in the matching slot of the module-level ``names``
    list, and prints the index together with the stored title.
    """
    # Only item assignment is performed on `names`, so no `global`
    # declaration is needed to reach the module-level objects.
    for index in range(start, data_count, thread_count):
        title = get_url_title(sites[index])
        names[index] = title
        print(index, title)
# In[20]:
def main():
    """Run ``write_title`` once per entry of ``n_loops``, each in its own thread.

    Creates one ``threading.Thread`` per start index, starts them all,
    and blocks until every one has finished. The module-level ``threads``
    list is updated to hold the threads of the most recent run.
    """
    workers = [
        threading.Thread(target=write_title, args=(start,))
        for start in n_loops
    ]
    # Replace the list contents instead of appending: a Thread can only
    # be started once, so leftovers from an earlier call must not be
    # picked up and started again when main() is run a second time.
    threads[:] = workers

    # Start every worker before joining any, so they run concurrently.
    for worker in workers:
        worker.start()

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()
# In[21]:
# Start the crawl only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
# In[22]:
# Bare expression with no effect in a plain script; presumably a leftover
# notebook cell that displayed the collected titles.
names
# In[10]:
names
# In[11]:
# Number of result slots (bare expression, value is discarded).
len(names)
# In[12]:
# NOTE(review): this references the method without calling it;
# `df.info()` was presumably intended — confirm.
df.info
# In[23]:
# Print the CPU count reported by multiprocessing.
import multiprocessing
print(multiprocessing.cpu_count())
# In[ ]:
更多推荐
已为社区贡献4条内容
所有评论(0)