Handling redirecting <301> from Indeed with Scrapy - python

I'm building a scraper for Indeed, primarily to practice on - I've set it up so that I extract details for 100 results per page. Using the search query, I loop a seed list of cities and job types into an f-string of the Indeed URL. I store these results as a dictionary so that I can get the degree types as a column when the results are read into pandas.
My issue is that I keep getting Redirecting (301); I suppose that's because not all the links fulfil the salary requirement. Alternatively, I have included meta={'handle_httpstatus_list': [301]}, but then I get no results at all.
Here's my scraper:
class IndeedItem(scrapy.Item):
    job_title = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    company = Field(output_processor=TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    max_results_per_city = 1000
    #names = pd.read_csv("indeed_names.csv")
    #degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree+Finance+£25', 'degree+Engineering+£25'], 'degree_type': ['Finance', 'Engineering']})

    start_urls = defaultdict(list)
    for city in names.names:
        for qualification, name in zip(degree.degrees, degree.degree_type):
            start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}%2C000&l={city}&fromage=7&filter=0&limit=100')

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }

    def start_requests(self):
        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@id="mosaic-zone-jobcards"]//div')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/h2[@class="jobTitle jobTitle-color-purple jobTitle-newJob"]/span//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/a/div[@class="slider_container"]/div[@class="slider_list"]/div[@class="slider_item"]/div[@class="job_seen_beacon"]/table[@class="jobCard_mainContent"]/tbody/tr/td[@class="resultContent"]/div[@class="heading6 company_location tapItem-gutter"]/pre/span[@class="companyName"]//text()')
            yield loader.load_item()

        next_page = response.xpath('//ul[@class="pagination-list"]/li[5]/a//@href').get()
        page_count += 1
        if next_page is not None:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    'page_count': page_count,
                    'category': category
                }
            )

I didn't get any 301 status, but the start_urls gave me problems and your xpath was off.
This fixes the xpath:
import scrapy
from collections import defaultdict
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
import pandas as pd
class IndeedItem(scrapy.Item):
    job_title = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    company = Field(output_processor=TakeFirst())

class IndeedSpider(scrapy.Spider):
    name = 'indeed'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }
    max_results_per_city = 1000
    # names = pd.read_csv("indeed_names.csv")
    # degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree+Finance+£25,000', 'degree+Engineering+£25,000'], 'degree_type': ['Finance', 'Engineering']})
    start_urls = defaultdict(list)

    def start_requests(self):
        for city in self.names.names:
            for qualification, name in zip(self.degree.degrees, self.degree.degree_type):
                self.start_urls[name].append(f'https://uk.indeed.com/jobs?q={qualification}&l={city}&fromage=7&filter=0&limit=100')

        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        if page_count > 30:
            return
        indeed = response.xpath('//div[@class="slider_container"]')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/span[@title]//text()')
            loader.add_xpath('salary', './/div[@class="salary-snippet"]/span//text()')
            loader.add_xpath('company', './/span[@class="companyName"]//text()')
            yield loader.load_item()

        next_page = response.xpath('//ul[@class="pagination-list"]//li[last()]/a/@href').get()
        page_count += 1
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    'page_count': page_count,
                    'category': category
                }
            )
If you can give an example of a URL that redirects, I can try to help you.
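If you do want to see where the 301s point before they get followed, one option is to let the status through and log the Location header yourself. This is only a minimal sketch of that idea (the start URL is a placeholder, not one of your generated Indeed URLs):

import scrapy

class RedirectDebugSpider(scrapy.Spider):
    # minimal sketch: handle 301/302 ourselves instead of letting
    # RedirectMiddleware follow them, so the redirect target can be logged
    name = 'redirect_debug'
    handle_httpstatus_list = [301, 302]  # spider-level alternative to the meta key
    start_urls = ['https://uk.indeed.com/jobs?q=degree&l=London']  # placeholder URL

    def parse(self, response):
        if response.status in (301, 302):
            location = response.headers.get('Location', b'').decode()
            self.logger.info('Redirected: %s -> %s', response.url, location)
            if location:
                # follow the redirect target manually once it has been logged
                yield response.follow(location, callback=self.parse)
            return
        # normal item extraction would go here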

Related

Getting Variant product details by sending POST requests in scrapy

I am trying to scrape a website for product details, and some of the products have variants. The product data changes when you click on a quantity, which sends a POST request to fetch the data. I am trying to make those requests in my spider, but it doesn't return any data. I am also trying to write the variants to separate rows in a CSV file. None of the answers I have found online help me, and I don't know what else to do. How do I make the POST request work, and how do I add the variant data to the output?
Here's the code:
import scrapy
import os
import json
from slugify import slugify

class GpSpider(scrapy.Spider):
    name = 'gp'
    start_urls = ['https://goldpet.pt/3-cao']
    # urls = ['https://goldpet.pt/3-cao','https://goldpet.pt/4-gato','https://goldpet.pt/7-roedor','https://goldpet.pt/6-ave','https://goldpet.pt/5-peixe','https://goldpet.pt/281-reptil']
    # for url in urls:
    #     start_urls.append(url)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'

    def parse(self, response):
        products = response.css('h2.h3.product-title>a::attr(href)').extract()
        for product in products:
            product_link = product
            yield scrapy.Request(product_link, callback=self.parse_products)
        # next_page = response.css('a.next.js-search-link::attr(href)').get()
        # if next_page:
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_products(self, response):
        image_links = []
        for img in response.css('img.js-qv-product-cover::attr(src)').getall():
            image_links.append(img)
        item = {}
        item['Title'] = response.css('h1.h1.product-title::text').get()
        item['Descrição'] = response.css('div.product-description>p::text').extract()
        item['Marca'] = response.css('img.img.img-thumbnail.manufacturer-logo::attr(alt)').get()
        if item['Marca'] is None:
            item['Marca'] = 'No brand'
        item['Quantidade'] = response.css('span.radio-label::text').extract()
        item['Idade'] = response.xpath('//dt[text()="Idade"]/following-sibling::dd/text()').get()
        item['Porte'] = response.xpath('//dt[text()="Porte"]/following-sibling::dd/text()').get()
        item['Características'] = response.xpath('//dt[text()="Características"]/following-sibling::dd/text()').get()
        item['Gama'] = response.xpath('//dt[text()="Gama"]/following-sibling::dd/text()').get()
        item['Alimento'] = response.xpath('//dt[text()="Alimento"]/following-sibling::dd/text()').get()
        item['ean13'] = response.xpath('//dt[text()="ean13"]/following-sibling::dd/text()').get()
        item['Price'] = response.css('div.current-price>span::text').get().replace('\xa0€','').strip()
        item['product_url'] = response.url
        item['image_urls'] = image_links
        breadcrumbs = list(filter(None, map(str.strip, response.css('li[itemprop=itemListElement]>a>span::text').extract())))
        try:
            item['category'] = breadcrumbs[0]
        except:
            item['category'] = ''
        try:
            item['sub_category1'] = breadcrumbs[1]
        except:
            item['sub_category1'] = ''
        try:
            item['sub_category2'] = breadcrumbs[2]
        except:
            item['sub_category2'] = ''
        product_img = response.css('img.js-qv-product-cover::attr(src)').getall()
        item['img_urls'] = product_img[0]
        ext = item['img_urls'].split('?')[0].rsplit('.', 1)[-1]
        filename = slugify(item['Title']) + '_1.' + ext
        item['Photo_0'] = filename
        item['Photo_Path0'] = os.path.join('product images', 'images', item['Marca'], filename)
        for i in range(10):
            item[f'Photo_{i + 1}'] = ''
            item[f'Photo_Path_{i + 1}'] = ''
        for i, image in enumerate(product_img[1:]):
            ext = image.split('?')[0].rsplit('.', 1)[-1]
            filename = slugify(item['Title']) + f'_{i + 1}.{ext}'
            item[f'Photo_{i + 1}'] = filename
            item[f'Photo_Path_{i + 1}'] = os.path.join('product images', 'images', item['Marca'], filename)
        variants = response.css('div.products-variants')
        if variants:
            for variant in variants:
                var_item = item.copy()
                group = response.css('li.input-container.float-xs-left label input.input-radio::attr(value)').get()
                token = response.css('div.product-actions form#add-to-cart-or-refresh input::attr(value)').get()
                product_id = response.css('input#product_page_product_id::attr(value)').get()
                customization_id = response.css('input#product_customization_id::attr(value)').get()
                ajax_url = f'https://goldpet.pt/index.php?controller=product&token=d41d8cd98f00b204e9800998ecf8427e&id_product=19107&id_customization=0&group%5B8%5D={group}&qty=1'
                payload = {"controller": item['title'],
                           "token": token,
                           "id_product": product_id,
                           "id_customization": customization_id,
                           "group%5B8%5D": group,
                           "qty": '1'
                           }
                yield scrapy.Request(ajax_url, callback=self.parse_variants, method="POST", body=json.dumps(payload), headers={'Content-Type': 'application/x-www-form-urlencoded'})

    def parse_variants(self, response):
        yield json.loads(response.text)
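One mismatch worth noting in the code above: the request declares an application/x-www-form-urlencoded Content-Type but sends a JSON body. If the endpoint actually expects ordinary form fields (an assumption, not something the site confirms here), a sketch using scrapy.FormRequest would look roughly like this; the field names mirror the payload dict above:

import scrapy

def build_variant_request(ajax_url, token, product_id, customization_id, group, callback):
    # sketch only: FormRequest url-encodes formdata, sets the Content-Type
    # header itself, and defaults to POST when formdata is given
    formdata = {
        'controller': 'product',  # the ajax URL above uses controller=product
        'token': token or '',
        'id_product': product_id or '',
        'id_customization': customization_id or '',
        'group[8]': group or '',  # 'group%5B8%5D' is just the URL-encoded form of this key
        'qty': '1',
    }
    return scrapy.FormRequest(ajax_url, formdata=formdata, callback=callback)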

Create Xpath using scrapy

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I want to get the status value, but it gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text and get the text from its following sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
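Note that putting concat() after a path step like this needs XPath 2.0, which the xpather example relies on; Scrapy's selectors are XPath 1.0, so there you would combine the two attributes in Python instead. A small sketch of that, inside parse_book, assuming data-ea and data-eb hold the two halves of the address:

row = response.xpath("//span[contains(text(),'Email')]/following-sibling::div")
# join the two halves of the obfuscated address in Python rather than in XPath
email = '{}@{}'.format(row.xpath('./@data-ea').get(default=''),
                       row.xpath('./@data-eb').get(default=''))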
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in child.xpath(".//span/text()").get():
            d2 = child.xpath(".//div/text()").get()
            print(d2)

Scraping an HTML table returns some empty results

import scrapy
from scrapy.http import Request

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']

    def parse(self, response):
        for link in response.xpath("//div[@class='exbox-name']/a/@href"):
            yield response.follow(link.get(), callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//table[@class="expo-table general-color"]//tr')
        table = {}
        for row in rows:
            key = row.xpath('.//td[1]//text()').get(default='').strip()
            value = row.xpath('.//td[2]/text()').getall()
            value = ''.join(value).strip()
            table.update({key: value})
        yield table
I am trying to scrape the table, but it doesn't give me the information for Telefono, Fax, E-mail, Membro di and Social. Check this result:
{'Indirizzo': 'Dr.-Auner-Str. 21a', 'Città': 'Raaba / Graz', 'Nazionalità': 'Austria', 'Sito web': '', 'Stand': 'Pad. 5 B22 C27', 'Telefono': '', 'Fax': '', 'E-mail': '', 'Social': ''}
The link of the page is http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh
The values for telephone, fax etc. are inside an a tag, so you need to adjust your xpath selectors to account for those cases.
See below sample
import scrapy

class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['http://smartcatalog.emo-milano.com/it/catalogo/elenco-alfabetico/400/A']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }

    def parse(self, response):
        for link in response.xpath("//div[@class='exbox-name']/a/@href"):
            yield response.follow(link.get(), callback=self.parse_book)

    def parse_book(self, response):
        rows = response.xpath('//table[@class="expo-table general-color"]/tr')
        table = {}
        for row in rows:
            key = row.xpath('./td[1]//text()').get(default='').strip()
            value = row.xpath('./td[2]/text()').getall()
            value = ''.join(value).strip()
            if not value:
                value = row.xpath('./td[2]/a/text()').getall()
                value = ''.join(value).strip()
            table.update({key: value})
        yield table
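A slightly more compact alternative (just a sketch, not what was posted above) is to take all descendant text of the second cell, which covers plain text and text wrapped in an a tag in one go, so the fallback branch is no longer needed:

for row in rows:
    key = row.xpath('./td[1]//text()').get(default='').strip()
    # descendant text() also picks up values nested inside an a tag
    value = ''.join(row.xpath('./td[2]//text()').getall()).strip()
    table.update({key: value})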

CrawlSpider with Splash, only first link is crawled & processed

I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging

class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
        headers = {
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
            ['Referer'] = 'https://www.google.com'
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"), callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login, endpoint='execute', args={
            'lua_source': self.script
        })

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)
            f.close()

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
            f.close()
I came across this and I've tried to implement things the same way, but still, parse_item is never run. In the logs, I never get hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here
I ditched the CrawlSpider and converted it to a regular spider, and things are working fine now.
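For anyone curious what that conversion can look like, here is only a rough sketch under the same assumptions as the spider above (the login Lua script, the sidebar xpath from the rule, and the placeholder site/credentials), following the sidebar link manually instead of relying on CrawlSpider rules:

import scrapy
from scrapy_splash import SplashRequest

class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']
    script = '...'  # the same Lua login script as above

    def start_requests(self):
        # log in through Splash first
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def post_login(self, response):
        # same xpath the LinkExtractor rule was restricted to
        link = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        if link:
            yield SplashRequest(response.urljoin(link), callback=self.parse_item,
                                endpoint='render.html', args={'wait': 1})

    def parse_item(self, response):
        self.logger.info('hey from parse_item!')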

Can't fetch all the titles from a webpage

I'm trying to recursively parse all the categories and their nested categories from this webpage, which ultimately leads to such a page and finally to this innermost page, from where I would like to fetch all the product titles.
The script can follow the above steps. However, when it comes to fetching all the titles from the result pages while traversing all the next pages, the script gets less content than is actually there.
This is what I've written:
class mySpider(scrapy.Spider):
    name = "myspider"
    start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}

    def parse(self, response):
        cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
        for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/@href").getall():
            item_link = response.urljoin(item.strip())
            if "/products/list_pages/" in item_link:
                yield scrapy.Request(item_link, headers=self.headers, meta={'cookiejar': cookie}, callback=self.parse_all_links)
            else:
                yield scrapy.Request(item_link, headers=self.headers, meta={'cookiejar': cookie}, callback=self.parse)

    def parse_all_links(self, response):
        for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
            target_link = response.urljoin(item.strip())
            yield scrapy.Request(target_link, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse_main_content)
        next_page = response.css("a.pxc-pager-next::attr(href)").get()
        if next_page:
            base_url = response.css("base::attr(href)").get()
            next_page_link = urljoin(base_url, next_page)
            yield scrapy.Request(next_page_link, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse_all_links)

    def parse_main_content(self, response):
        item = response.css("h1::text").get()
        print(item)
How can I get all the titles available in that category?
The script gets a different number of results every time I run it.
Your main issue is that you need to use a separate cookiejar for each "/products/list_pages/" request in order to paginate correctly. I used a class variable cookie for this (see my code) and got the same result (4293 items) several times.
Here is my code (I don't download the product page, I just read the product title from the list of products):
class mySpider(scrapy.Spider):
    name = "phoenixcontact"
    start_urls = ['https://www.phoenixcontact.com/online/portal/gb?1dmy&urile=wcm%3apath%3a/gben/web/main/products/subcategory_pages/Cables_P-10/e3a9792d-bafa-4e89-8e3f-8b1a45bd2682']
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    cookie = 1

    def parse(self, response):
        # cookie = response.headers.getlist('Set-Cookie')[1].decode().split(";")[0]
        for item in response.xpath("//div[./h3[contains(.,'Category')]]/ul/li/a/@href").getall():
            item_link = response.urljoin(item.strip())
            if "/products/list_pages/" in item_link:
                cookie = self.cookie
                self.cookie += 1
                yield scrapy.Request(item_link, headers=self.headers, meta={'cookiejar': cookie}, callback=self.parse_all_links, cb_kwargs={'page_number': 1})
            else:
                yield scrapy.Request(item_link, headers=self.headers, callback=self.parse)

    def parse_all_links(self, response, page_number):
        # if page_number > 1:
        #     with open("Samples/Page.htm", "wb") as f:
        #         f.write(response.body)
        # for item in response.css("[class='pxc-sales-data-wrp'][data-product-key] h3 > a[href][onclick]::attr(href)").getall():
        for item in response.xpath('//div[@data-product-key]//h3//a'):
            target_link = response.urljoin(item.xpath('./@href').get())
            item_title = item.xpath('./text()').get()
            yield {'title': item_title}
            # yield scrapy.Request(target_link, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse_main_content)
        next_page = response.css("a.pxc-pager-next::attr(href)").get()
        if next_page:
            base_url = response.css("base::attr(href)").get()
            next_page_link = response.urljoin(next_page)
            yield scrapy.Request(next_page_link, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse_all_links, cb_kwargs={'page_number': page_number + 1})
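As a side note on the fix: the 'cookiejar' meta key is just a label, and requests that share the same value share a cookie session. A minimal sketch of the idea, independent of this site (the URLs and selectors below are made up for illustration):

import scrapy

class CookiejarDemoSpider(scrapy.Spider):
    # sketch: each listing gets its own cookie session, so pagination cookies
    # from one listing cannot interfere with another
    name = 'cookiejar_demo'
    start_urls = ['https://example.com/listings']  # hypothetical URL

    def parse(self, response):
        for i, href in enumerate(response.xpath('//a[@class="listing"]/@href').getall()):
            # any hashable value works as the jar id; the loop index is enough
            yield response.follow(href, callback=self.parse_listing, meta={'cookiejar': i})

    def parse_listing(self, response):
        next_page = response.xpath('//a[@rel="next"]/@href').get()
        if next_page:
            # reuse the same jar id so the next page keeps this listing's cookies
            yield response.follow(next_page, callback=self.parse_listing,
                                  meta={'cookiejar': response.meta['cookiejar']})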
