#!/usr/bin/python
#encoding:utf-8
import requests
from bs4 import BeautifulSoup
import codecs
import xlwt
from xlutils.copy import copy
from xlrd import open_workbook
import os


class Spider():
    """Scrape one Zhaopin job-search result page into an .xls workbook.

    Creating an instance writes a fresh workbook (header row only) to
    ``self.filename``; ``Zlzp`` then fills it with one row per job found on
    the search page described by ``self.url`` and ``self.data``.

    The code is written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self):
        """Configure the search request and create the output workbook."""
        self.url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
        # Browser-like headers, sent with both the search request and the
        # per-job detail requests.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }

        # Query-string parameters of the search request.
        # NOTE(review): 'ji' looks like the city filter, 'kw' the keyword and
        # 'p' the page number -- confirm against the site.
        self.data = {
            'ji': '上海',
            'kw': '大数据',
            'p': 1,
            'isadv': 0
        }

        # (Re)create the workbook so that it holds only the header row.
        self.filename = './zlzp.xls'
        f = xlwt.Workbook(encoding='utf-8')
        sheet1 = f.add_sheet('sheet1')
        # The last column is titled 'data'; it receives the value that
        # get_info() reads from the third list item (the posting date).
        row = ['position', 'company', 'salary', 'address', 'data']
        for i, title in enumerate(row):
            sheet1.write(0, i, title)
        f.save(self.filename)

    def Zlzp(self):
        """Fetch the search page and write one workbook row per job.

        Every result table except the first is followed to its detail page
        via ``get_info``. Jobs whose table has no link, or whose detail page
        cannot be parsed, are skipped. Rows are written starting at row 1
        (directly below the header) and the workbook is saved once at the
        end -- also when an exception interrupts the loop, so rows collected
        up to that point are kept.
        """
        html = requests.get(self.url, headers=self.headers, params=self.data)
        soup = BeautifulSoup(html.text, 'html.parser')
        # Debugging hint: dump html.text to a file to inspect the markup when
        # the selectors below stop matching.

        newlist = soup.find('div', {'class': 'newlist_list_content'})
        if newlist is None:
            # No result container on the page: nothing to write.
            print('no job list found in the search result page')
            return
        tables = newlist.findAll('table', {'class': 'newlist'})

        # Open the workbook once instead of re-reading and re-writing the
        # whole file for every single row.
        rb = open_workbook(self.filename)
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        line = 1

        try:
            # The first table is skipped -- presumably it is the column
            # header of the result list; verify against the page markup.
            for table in tables[1:]:
                anchor = table.find('a')
                if anchor is None or not anchor.get('href'):
                    continue
                link = str(anchor['href'])

                info = self.get_info(link)
                if info is None:
                    # Detail page could not be parsed; get_info already
                    # reported the reason.
                    continue

                # get_info returns UTF-8 byte strings. On Python 2 they are
                # printed as-is; on Python 3 they are decoded first so the
                # text is shown rather than a bytes repr.
                summary = b' '.join(info)
                if not isinstance(summary, str):
                    summary = summary.decode('utf-8')
                print(summary)

                jobData = [field.decode('utf-8') for field in info]
                for j, value in enumerate(jobData):
                    # One field per cell (not the whole list).
                    sheet.write(line, j, value)
                line += 1
        finally:
            # save() overwrites the existing file, so no prior os.remove()
            # is needed -- and none is done, to avoid losing the file should
            # the save fail.
            wb.save(self.filename)

    def get_info(self, link):
        """Fetch a job-detail page and extract its fields.

        Args:
            link: URL of the job-detail page.

        Returns:
            A tuple ``(position, company, salary, address, data)`` of UTF-8
            encoded byte strings, or ``None`` (after printing the error) when
            an expected element is missing from the page.
        """
        html = requests.get(link, headers=self.headers)
        soup = BeautifulSoup(html.text, 'html.parser')
        try:
            tfb = soup.find('div', {'class': 'top-fixed-box'})
            position = tfb.find('h1').text.encode('utf-8')  # job title
            company = tfb.find('h2').text.encode('utf-8')   # company

            tpl = soup.find('div', {'class': 'terminalpage-left'})
            tuc = tpl.find('ul', {'class': 'terminal-ul clearfix'})
            lis = tuc.findAll('li')

            salary = lis[0].find('strong').text.encode('utf-8')   # salary
            address = lis[1].find('strong').text.encode('utf-8')  # location
            data = lis[2].find('strong').text.encode('utf-8')     # date
        except (AttributeError, IndexError) as e:
            # A lookup returned None or the list was shorter than expected:
            # the page does not have the layout this parser relies on.
            print(e)
            return None
        return position, company, salary, address, data

if __name__ == '__main__':
    # Script entry point: build the spider (which creates the workbook) and
    # run the scrape once.
    Spider().Zlzp()
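# NOTE: the lines below are web-page residue from the blog post this script
# was copied from; they are not part of the program.
# Logo
#
# CSDN and Geek Time jointly build a quality learning community for
# developers, helping them grow!
#
# More recommendations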