Scraping a website's job listings with a Scrapy spider
1、Learn how to paginate
Approach:
a、Fetch the data on the first page
b、Find the URL of the next page, follow it, and fetch that page's data (a minimal skeleton of this pattern is sketched after this list)
Steps:
a、Modify start_urls
b、Check allowed_domains
c、Parse the data in the parse method
2、Finding a keyword on a page: Ctrl+F to open the search box, Ctrl+V to paste the keyword
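A minimal sketch of the pagination pattern described above. The 'javascript:;' stop value is taken from the full spider further down; the spider name and the next-page XPath here are placeholders for illustration only.

import scrapy

class DemoSpider(scrapy.Spider):
    # placeholder names; the real spider is shown further down
    name = 'demo'
    allowed_domains = ['xx.com']
    start_urls = ['https://hr.xx.com/position/list.do']

    def parse(self, response):
        # ... extract and yield items from the current page here ...

        # locate the "next page" link (a relative URL); placeholder XPath
        part_url = response.xpath('//div[@class="pagination"]/a[last()]/@href').extract_first()
        # on the last page the link is 'javascript:;' instead of a real URL
        if part_url and part_url != 'javascript:;':
            yield scrapy.Request(
                url=response.urljoin(part_url),  # make the URL absolute
                callback=self.parse,             # parse the next page the same way
            )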
""" 1、items中确定要爬取的字段,规范爬取的数据,防止遗漏 2、创建爬虫,域名xx.com 3、写url,parse解析数据 对数据进行xpath分析,确定数据 """ import scrapy class xxItem(scrapy.Item): # define the fields for your item here like: name = scrapy.Field() link = scrapy.Field() depart = scrapy.Field() category = scrapy.Field() type = scrapy.Field() address = scrapy.Field() num = scrapy.Field() date = scrapy.Field()
# job.py (the spider file that was generated for this project)
import scrapy

from wangyi.items import xxItem


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['xx.com']
    # 1. change the initial URL
    start_urls = ['https://hr.xx.com/position/list.do']

    def parse(self, response):
        # 1. parse the response and get the list of row nodes
        node_list = response.xpath('//*[@class="position-tb"]/tbody/tr')
        # print(len(node_list)) prints 40, but there are only 20 jobs,
        # so the rows need to be filtered

        # 2. extract the data: iterate over the rows and skip the filler rows
        for num, node in enumerate(node_list):
            # only every second <tr> holds an actual job posting
            if num % 2 == 0:
                item = xxItem()
                item['name'] = node.xpath('./td[1]/a/text()').extract_first()
                # response.urljoin() turns the relative href into an absolute URL
                item['link'] = response.urljoin(node.xpath('./td[1]/a/@href').extract_first())
                item['depart'] = node.xpath('./td[2]/text()').extract_first()
                item['category'] = node.xpath('./td[3]/text()').extract_first()
                item['type'] = node.xpath('./td[4]/text()').extract_first()
                item['address'] = node.xpath('./td[5]/text()').extract_first()
                item['num'] = node.xpath('./td[6]/text()').extract_first().strip()
                item['date'] = node.xpath('./td[7]/text()').extract_first()
                yield item

        # pagination: grab the href of the last pager link ("next page")
        part_url = response.xpath('/html/body/div[2]/div[2]/div[2]/div/a[last()]/@href').extract_first()
        # stop condition: on the last page the link is 'javascript:;'
        if part_url != 'javascript:;':
            next_url = response.urljoin(part_url)
            # build a new request and return it to the engine;
            # yielded items go on to the pipelines to be saved
            yield scrapy.Request(
                url=next_url,
                callback=self.parse
            )
# pipelines.py
import json


class xxPipeline:
    def __init__(self):
        # open the output file once when the pipeline is created
        self.file = open('xx.json', 'w')

    def process_item(self, item, spider):
        # convert the Item to a dict so it can be serialized
        item = dict(item)
        str_data = json.dumps(item, ensure_ascii=False) + ',\n'
        # write the data
        self.file.write(str_data)
        return item

    def __del__(self):
        self.file.close()
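Closing the file in __del__ works, but it depends on when the interpreter garbage-collects the pipeline object. Scrapy pipelines also support explicit open_spider and close_spider hooks; a sketch of the same pipeline rewritten with them (same output file and format as above):

import json


class xxPipeline:
    def open_spider(self, spider):
        # called once when the spider is opened
        self.file = open('xx.json', 'w')

    def process_item(self, item, spider):
        # serialize each item as one JSON object per line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + ',\n')
        return item

    def close_spider(self, spider):
        # called once when the spider closes
        self.file.close()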
# settings.py: enable the pipeline (the number controls order; lower runs first)
ITEM_PIPELINES = {
    'xx.pipelines.xxPipeline': 300,
}
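With the pipeline enabled, run the spider from the project root; 'job' is the spider name defined in job.py, and the results end up in the xx.json file written by xxPipeline:

scrapy crawl job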