博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Boss直聘招聘信息爬取
阅读量:2048 次
发布时间:2019-04-28

本文共 9375 字,大约阅读时间需要 31 分钟。

  1. 利用 Selenium 进行爬取,爬取结果按搜索关键词分别保存为 CSV 文件
  2. 编写时间:2020年03月16日(若爬取失败,应该是网站更新造成的。)
import csv
import os
import random
import re
import sys
import time

# Selenium is imported defensively so that the CSV writer and the pure
# text-parsing helpers remain importable without a browser stack installed.
# Boss() itself raises ImportError when Selenium is missing.
try:
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
    from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
except ImportError:
    webdriver = Options = By = EC = WebDriverWait = None
    WebDriverException = Exception

# Keywords that get_url() searches for, one CSV file per keyword.
_KEYWORDS = ["Java", "测试", ".net", "安卓", "UI", "前端"]
# Education keywords, checked in this order, used to split "experience + education" text.
_EDUCATION_LEVELS = ['硕士', '大专', '本科', '博士', '专科']
# Company size such as "100-499人"; the fallback covers open-ended sizes such as "10000人以上".
_SIZE_RANGE_RE = re.compile(r'\d+-\d+人')
_SIZE_FALLBACK_RE = re.compile(r'\d+人(?:以上)?|\d+')

_ITEM_XPATH = '//*[@id="main"]/div/div[2]/ul/li[{}]'
_DETAIL_DESCRIPTION_XPATH = '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div'
_NO_RESULT_XPATH = "//*[text()='没有找到相关职位,修改筛选条件试一下']"
_CAPTCHA_HINT = "当前IP地址可能存在异常访问行为,完成验证后即可正常使用"

_PAGES_PER_KEYWORD = 10
_ITEMS_PER_PAGE = 30  # each result page lists 30 jobs


class Boss:
    """Scrape job listings from zhipin.com with Selenium and store them as CSV rows."""

    def __init__(self, writer=None):
        """Start a Chrome session.

        :param writer: object providing ``csv_init(name)`` and ``write(row)``;
            defaults to a new :class:`WriteDataToCSV`.
        :raises ImportError: if Selenium is not installed.
        """
        if webdriver is None:
            raise ImportError("selenium is required to run the Boss scraper")
        self.chrome_options = Options()
        # To run without a visible browser window, add the '--headless' and
        # '--disable-gpu' arguments to self.chrome_options here.
        # `options=` replaces the `chrome_options=` keyword removed in Selenium 4.
        self.driver = webdriver.Chrome(options=self.chrome_options)
        self.writer = writer if writer is not None else WriteDataToCSV()

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def get_url(self, search='python'):
        """Scrape up to 10 result pages for every keyword in the built-in list.

        One CSV file is initialised per keyword and one row is written per job.

        :param search: kept for backward compatibility; it is ignored, the
            built-in keyword list is always used.
        :return: None
        """
        for keyword in _KEYWORDS:
            self.writer.csv_init(keyword)
            serial = 1
            for page in range(1, _PAGES_PER_KEYWORD + 1):
                serial = self._scrape_page(keyword, page, serial)

    def _scrape_page(self, keyword, page, serial):
        """Scrape one result page; return the serial number for the next row."""
        url = ('https://www.zhipin.com/c101090100/?query=' + str(keyword)
               + '&page=' + str(page) + '&ka=page-' + str(page))
        self.driver.get(url)
        search_window = self.driver.current_window_handle
        for index in range(1, _ITEMS_PER_PAGE + 1):
            if self._no_results_shown():
                # NOTE(review): kept from the original - the page is refreshed and
                # this row index is skipped whenever the "no results" banner shows.
                self.driver.refresh()
                continue
            self._wait_for_captcha()
            row = self._scrape_row(index, search_window)
            if row is None:
                # The row does not exist, so this page has no more listings.
                print('爬取完成!')
                break
            self.writer.write([serial] + row)
            print("已完成" + str(serial) + "条")
            time.sleep(random.randint(1, 5))  # random pause between rows
            serial += 1
        return serial

    def _no_results_shown(self):
        """Return True when the "no matching jobs" banner is present on the page."""
        try:
            self.driver.find_element(By.XPATH, _NO_RESULT_XPATH)
        except WebDriverException:
            return False
        return True

    def _wait_for_captcha(self):
        """Block until the user confirms the captcha, if the site is showing one."""
        try:
            message = self.driver.find_element(By.ID, "verifyMessage").text
        except WebDriverException:
            return
        print(message)
        if _CAPTCHA_HINT in message:
            print('输入验证码验证')
            try:
                input()  # portable replacement for the Windows-only `pause` command
            except EOFError:
                pass  # no interactive stdin: carry on without waiting

    def _wait_text(self, xpath, timeout):
        """Wait up to ``timeout`` seconds for ``xpath`` and return the element text."""
        locator = (By.XPATH, xpath)
        element = WebDriverWait(self.driver, timeout, 0.5).until(
            EC.presence_of_element_located(locator))
        return element.text

    def _text_or_blank(self, xpath, timeout):
        """Like :meth:`_wait_text`, but return "" when the element cannot be read."""
        try:
            return self._wait_text(xpath, timeout)
        except WebDriverException:
            return ""

    @staticmethod
    def _split_company_info(text):
        """Split e.g. "互联网100-499人" into ``(company_type, company_size)``.

        A size range is preferred; otherwise the first number (with an optional
        "人"/"人以上" suffix) is used. Without any number the whole text is
        returned as the type and the size is "".
        """
        match = _SIZE_RANGE_RE.search(text) or _SIZE_FALLBACK_RE.search(text)
        if match is None:
            return text, ""
        return text[:match.start()], match.group()

    @staticmethod
    def _split_experience_education(text):
        """Split e.g. "1-3年本科" into ``(experience, education)``.

        The text is cut at the first education keyword (in list order) that it
        contains. Without a keyword the whole text is returned as the
        experience and the education is "".
        """
        for level in _EDUCATION_LEVELS:
            position = text.find(level)
            if position != -1:
                return text[:position], text[position:]
        return text, ""

    def _scrape_detail(self, item_xpath, search_window):
        """Open the job's detail tab and return ``(description, address)``.

        Both values are "" when the detail tab cannot be opened or read. Focus
        is always returned to ``search_window`` afterwards.
        """
        description = address = ""
        try:
            time.sleep(random.randint(1, 4))  # random pause before opening the detail page
            card = WebDriverWait(self.driver, 1, 0.5).until(
                EC.presence_of_element_located((By.XPATH, item_xpath + '/div/div[1]')))
            card.click()
            for handle in self.driver.window_handles:
                if handle == search_window:
                    continue
                self.driver.switch_to.window(handle)
                # Job description
                description = self._text_or_blank(_DETAIL_DESCRIPTION_XPATH, 5)
                # Work address
                try:
                    address = self.driver.find_element(By.CLASS_NAME, "location-address").text
                except WebDriverException:
                    address = ""
                self.driver.close()  # close the detail tab that currently has focus
        except WebDriverException:
            description = address = ""
        finally:
            # The closed tab still has focus; go back to the result list explicitly.
            try:
                self.driver.switch_to.window(search_window)
            except WebDriverException:
                pass
        return description, address

    def _scrape_row(self, index, search_window):
        """Collect the fields of listing ``index`` (1-based) on the current page.

        :return: the CSV fields after the serial number, or None when the
            listing does not exist (company name not found within 60 seconds).
        """
        item = _ITEM_XPATH.format(index)
        # Company name - its absence means the row itself is missing.
        try:
            company = self._wait_text(item + '/div/div[1]/div[2]/div/h3/a', 60)
        except WebDriverException:
            return None
        # Salary
        salary = self._text_or_blank(item + '/div/div[1]/div[1]/a/div[2]/span', 3)
        # Job title
        title = self._text_or_blank(item + '/div/div[1]/div[1]/a/div[1]/span[1]', 3)
        # Company type and size share one element
        company_type, company_size = self._split_company_info(
            self._text_or_blank(item + '/div/div[1]/div[2]/div/p', 3))
        # Benefits
        benefits = self._text_or_blank(item + '/div/div[2]/div[2]', 1)
        # Experience and education share one element
        experience, education = self._split_experience_education(
            self._text_or_blank(item + '/div/div[1]/div[1]/a/div[2]/p', 1))
        # Detail page
        description, address = self._scrape_detail(item, search_window)
        return [
            company,
            company_size,
            company_type,
            title,
            education,
            experience,
            description,
            "",  # age requirement: not collected
            "",  # working hours: not collected
            benefits,
            salary,
            address,
            "",  # remarks
        ]


class WriteDataToCSV:
    """Append scraped rows to ``./result/<name>.csv``."""

    HEADER = ["序号", "企业名称", "企业规模", "性质/行业", "岗位名称", "学历要求",
              "工作经验", "专业要求", "年龄要求", "工作时间", "社保福利", "薪酬范围",
              "工作地点", "备注"]

    def csv_init(self, path):
        """Select ``./result/<path>.csv`` as the output file and ensure it has a header.

        The directory is created when missing. The header row is written only
        when the file is new or empty, so repeated runs do not duplicate it.

        :param path: file name without directory or extension.
        """
        self.path = "./result/" + str(path) + ".csv"
        os.makedirs(os.path.dirname(self.path), exist_ok=True)
        if os.path.exists(self.path) and os.path.getsize(self.path) > 0:
            return
        with open(self.path, 'a+', encoding='utf-8', newline="") as f:
            csv.writer(f).writerow(self.HEADER)

    def write(self, data):
        """Append one row to the file selected by :meth:`csv_init`.

        :param data: sequence of cell values for the row.
        """
        with open(self.path, 'a+', encoding='utf-8', newline="") as f:
            csv.writer(f).writerow(data)


if __name__ == '__main__':
    wr = WriteDataToCSV()
    boss = Boss(writer=wr)
    try:
        boss.get_url()
    finally:
        boss.close()

实际效果:

在这里插入图片描述

转载地址:http://nfhof.baihongyu.com/

你可能感兴趣的文章
caffe:用自己的图像数据训练模型
查看>>
ubuntu下clion中配置opencv的CMakeLists.txt
查看>>
什么是卷积 卷积有什么用
查看>>
有趣的机器学习概念纵览:从多元拟合,神经网络到深度学习,给每个感兴趣的人
查看>>
K-近邻算法:KNN
查看>>
solver及其配置
查看>>
图说C++对象模型:对象内存布局详解
查看>>
【Java基础】Java类的加载和对象创建流程的详细分析
查看>>
JAVA多线程之volatile 与 synchronized 的比较
查看>>
Java多线程知识点总结
查看>>
Java集合框架知识梳理
查看>>
java中IO流知识梳理
查看>>
word2010如何保持在公式后面键入空格后或添加文字不变小?
查看>>
笔试题(一)—— java基础
查看>>
笔试题(二)—— sql语句
查看>>
Redis学习笔记(二)— 在linux下搭建redis服务器
查看>>
Redis学习笔记(三)—— 使用redis客户端连接windows和linux下的redis并解决无法连接redis的问题
查看>>
Eclipse配置错误——An internal error occurred during: "Building workspace".GC overhead limit exceeded
查看>>
Intellij IDEA使用(一)—— 安装Intellij IDEA(ideaIU-2017.2.3)并完成Intellij IDEA的简单配置
查看>>
Intellij IDEA使用(二)—— 在Intellij IDEA中配置JDK(SDK)
查看>>