Python 多线程爬取网页标题并写入 Excel

#!/usr/bin/env python# coding: utf-8# In[1]:import pandas as pdimport threadingimport requestsfrom bs4 import BeautifulSoupfrom time import sleepfrom datetime import datetime# In[2]:df = pd.read_excel

玲珑·

794人浏览 · 2021-08-30 22:20:35

玲珑· · 2021-08-30 22:20:35 发布

参考文档

BS4 中文文档

#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import threading 
import requests
from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime


# In[2]:


# Load the spreadsheet that lists the sites to crawl.
df = pd.read_excel("网站对应名字.xlsx")


# In[16]:


# Crawl configuration shared by the worker threads.
thread_count = 16
sites = df.URL
data_count = len(sites)
n_loops = range(thread_count)
threads = []


# In[17]:


# One result slot per site, filled in by index as titles are fetched.
names = [None for _ in range(data_count)]


# In[18]:


def get_url_title(site, timeout=10):
    """Fetch a page and return the text of its ``<title>`` element.

    Args:
        site: URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so that
            one unresponsive host cannot stall a worker thread forever.

    Returns:
        The title text, or the placeholder "网址有误" when the request
        fails or the page has no ``<title>`` element.
    """
    try:
        response = requests.get(site, timeout=timeout)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, "html.parser")
        title_tag = soup.find("title")
        if title_tag is None:
            return "网址有误"
        return title_tag.text
    except Exception:
        # Best-effort crawl: any ordinary failure yields the placeholder,
        # while KeyboardInterrupt/SystemExit are allowed to propagate.
        return "网址有误"


# In[19]:


def write_title(start):
    """Fetch titles for one worker's share of the sites.

    Beginning at index ``start``, visits every ``thread_count``-th entry of
    ``sites``, stores the fetched title in the matching slot of ``names``,
    and prints the index together with the title.
    """
    # The module-level names are only read or mutated in place, never
    # rebound, so no ``global`` declaration is required here.
    for idx in range(start, data_count, thread_count):
        title = get_url_title(sites[idx])
        names[idx] = title
        print(idx, title)


# In[20]:


def main():
    """Run ``write_title`` in one thread per entry of ``n_loops`` and wait.

    Each thread receives its position in ``n_loops`` as the ``start``
    argument of ``write_title``. The function returns once every thread has
    finished.
    """
    workers = [
        threading.Thread(target=write_title, args=(offset,))
        for offset in n_loops
    ]
    # Replace the contents of the module-level list instead of appending:
    # appending made a second call (e.g. re-running the notebook cell) try
    # to restart the finished threads from the previous run, which raises
    # RuntimeError because a thread can only be started once.
    threads[:] = workers
    # Start all the threads.
    for worker in workers:
        worker.start()
    # Wait for all threads to finish.
    for worker in workers:
        worker.join()


# In[21]:


# Run the crawl only when this file is executed directly.
if __name__ == '__main__':
    main()


# In[22]:


# Notebook inspection cell: shows the collected titles. When run as a plain
# script this bare expression has no visible effect.
names


# In[10]:


names


# In[11]:


# Number of result slots (one per entry in `sites`).
len(names)


# In[12]:


# NOTE(review): this references the bound method without calling it;
# `df.info()` was presumably intended — confirm.
df.info


# In[23]:


# Print the number of CPUs reported for this machine.
import multiprocessing
print(multiprocessing.cpu_count())


# In[ ]: