Python web crawling using Scrapy unable to fill form - python

I am trying to crawl this site which requires me to fill the form with the postal code to reach stores info.
the website: https://www.aldi-sued.de/de/filialen.html
I have written following code but don't know whats wrong. Please help:
from __future__ import unicode_literals
import logging
import scrapy
from scrapy.loader import ItemLoader
from ..items import StoreItem
logger = logging.getLogger(__name__)
class StoreSpider(scrapy.Spider):
name = "aldib"
start_urls = ["https://www.aldi-sued.de/de/filialen.html"]
def parse(self, response):
yield scrapy.FormRequest(url="https://www.aldi-sued.de/de/filialen.html",
formdata={"search": "38644"},
callback=self.parse_stores)
def parse_stores(self, response):
for store in response.css('div.dealer-list > div.dealer-item-content'):
name = store.xpath("span.dealer-name > strong::text").extract()
sl = ItemLoader(item=StoreItem(), selector=store, response=response)
sl.add_value("Name", name)
yield sl.load_item()
I suspect about there are 2 forms in website that one for site search and one for store search and I unable to choose the which one on the first code. So I changed the request part with
def parse(self, response):
yield scrapy.FormRequest.from_response(response,
formid="storeSearchForm",
formdata={"search": "38644"},
callback=self.parse_stores)
At the end I still cannot to reach the name of the stores in that postal code.

Related

Why do my two scrapy program parts work separately, but not when i merge them?

My problem is that my two scrapy program parts do what they are supposed to do independently, but when I merge them I get no output.
The program is supposed to follow a large number of links at different depth levels. On the last and deepest level the program should render the html page with the python library scrapy_requests and extract the desired information with the help of the itemloader and save it in a csv.
spider.py looks like this:
import scrapy
from scrapy_requests import HtmlRequest
from SR.items import SrItem
from scrapy.loader import ItemLoader
class SrspiderSpider(scrapy.Spider):
name = 'SRspider'
start_urls =['https://www.microbiologyresearch.org/content/journal/ijsem/browse']
def parse(self, response):
# get all volume links and follow them
for volume in response.css('h3.h5 a::attr(href)'):
yield response.follow(volume.get(), callback=self.parse_issues)
def parse_issues(self, response):
# get all issue links within a volume and follow them
for issue in response.css('li.issue a::attr(href)'):
yield response.follow(issue.get(), callback=self.parse_articles)
def parse_articles(self, response):
# get all article links within a issue and follow them
for article in response.css('span.articleTitle.js-articleTitle.title a::attr(href)'):
yield response.follow(article.get(), callback=self.parse_html)
def parse_html(self, response):
url = response.request.url
yield HtmlRequest(url=url, callback=self.parse_content, render=True, options={'sleep': 1})
def parse_content(self,response):
loader = ItemLoader(item = SrItem(), selector=response)
loader.add_xpath('title', "//h1[#class='item-meta-data__item-title']")
loader.add_xpath('abstract', "//div[#class='articleSection article-abstract']/p")
loader.add_xpath('paragraph', "//div[#class='articleSection']/p")
yield loader.load_item()
items.py look like this:
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags, replace_escape_chars
class SrItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field(input_processor = MapCompose(remove_tags, replace_escape_chars), output_processor = TakeFirst())
abstract = scrapy.Field(input_processor = MapCompose(remove_tags, replace_escape_chars), output_processor = TakeFirst())
paragraph = scrapy.Field(input_processor = MapCompose(remove_tags, replace_escape_chars))
if i want to follow all links it is possible and the program part works. if i render a page and extract the needed data it works too. but if i put the two parts together and try to get everything in one step it doesn't work. what could be the reason? the page is javascript and unfortunately has to be rendered to get the information.
My output command in the shell is: scrapy crawl SRspider -o test.csv

Parse out unneeded data using xpath

I am using Scrapy to scrape a website. After I access that website I need to get the id value of each category and use that value to redirect to the JSON web page where the data I need to scrape is. In the image below is a partial snapshot of the HTML code showing the categories along with their value id number that I need. I need that id value so I can plug it into the end of this url and redirect to that url. "http://www.starcitygames.com/buylist/search?search-type=category&id=" And I need to do that for all the categories. I have the code I got so far below but right now using xpath to get those ids but it is returning the entire list of Id's at once instead of one at a time. Pus it gives me other data I do not need.
HTML Code
Currently what I am receiving for category_id
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
class LoginSpider(scrapy.Spider):
name = "LoginSpider"
start_urls = ["http://www.starcitygames.com/buylist/"]
def parse(self, response):
return scrapy.FormRequest.from_response(
response,
formcss='#existing_users form',
formdata={'ex_usr_email': 'email#example.com', 'ex_usr_pass': 'passowrd'},
callback=self.after_login
)
def after_login(self, response):
item = NameItem()
category_id = response.xpath('//*[#id="bl-category-options"]/option/#value')
Pretty simple:
for catetegory_id in response.xpath('//select[#id="bl-category-options"]/option/#value').getall():
yield scrapy.Request(
url="http://www.starcitygames.com/buylist/search?search-type=category&id={category_id}".format(category_id=category_id),
callback=self.parse_json_response,
)

Scrapy spider outputs empy csv file

This is my first question here and I'm learning how to code by myself so please bear with me.
I'm working on a final CS50 project which I'm trying to built a website that aggregates online Spanish course from edx.org and other open online couses websites maybe. I'm using scrapy framework to scrap the filter results of Spanish courses on edx.org... Here is my first scrapy spider which I'm trying to get in each courses link to then get it's name (after I get the code right, also get the description, course url and more stuff).
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractor import LinkExtractor
from scrapy.loader import ItemLoader
class Course_item(Item):
name = Field()
#description = Field()
#img_url = Field()
class Course_spider(CrawlSpider):
name = 'CourseSpider'
allowed_domains = ['https://www.edx.org/']
start_urls = ['https://www.edx.org/course/?language=Spanish']
rules = (Rule(LinkExtractor(allow=r'/course'), callback='parse_item', follow='True'),)
def parse_item(self, response):
item = ItemLoader(Course_item, response)
item.add_xpath('name', '//*[#id="course-intro-heading"]/text()')
yield item.load_item()
When I run the spider with "scrapy runspider edxSpider.py -o edx.csv -t csv" I get an empty csv file and I also think is not getting into the right spanish courses results.
Basically I want to get in each courses of this link edx Spanish courses and get the name, description, provider, page url and img url.
Any ideas for why might be the problem?
You can't get edx content with a simple request, it uses javascript rendering for getting the course element dynamically, so CrawlSpider won't work on this case, because you need to find specific elements inside the response body to generate a new Request that will get what you need.
The real request (to get the urls of the courses) is this one, but you need to generate it from the previous response body (although you could just visit it an also get the correct data).
So, to generate the real request, you need data that is inside a script tag:
from scrapy import Spider
import re
import json
class Course_spider(Spider):
name = 'CourseSpider'
allowed_domains = ['edx.org']
start_urls = ['https://www.edx.org/course/?language=Spanish']
def parse(self, response):
script_text = response.xpath('//script[contains(text(), "Drupal.settings")]').extract_first()
parseable_json_data = re.search(r'Drupal.settings, ({.+})', script_text).group(1)
json_data = json.loads(parseable_json_data)
...
Now you have what you need on json_data and only need to create the string URL.
This page use JavaScript to get data from server and add to page.
It uses urls like
https://www.edx.org/api/catalog/v2/courses/course-v1:IDBx+IDB33x+3T2017
Last part is course's number which you can find in HTML
<main id="course-info-page" data-course-id="course-v1:IDBx+IDB33x+3T2017">
Code
from scrapy.http import Request
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractor import LinkExtractor
from scrapy.loader import ItemLoader
import json
class Course_spider(CrawlSpider):
name = 'CourseSpider'
allowed_domains = ['www.edx.org']
start_urls = ['https://www.edx.org/course/?language=Spanish']
rules = (Rule(LinkExtractor(allow=r'/course'), callback='parse_item', follow='True'),)
def parse_item(self, response):
print('parse_item url:', response.url)
course_id = response.xpath('//*[#id="course-info-page"]/#data-course-id').extract_first()
if course_id:
url = 'https://www.edx.org/api/catalog/v2/courses/' + course_id
yield Request(url, callback=self.parse_json)
def parse_json(self, response):
print('parse_json url:', response.url)
item = json.loads(response.body)
return item
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', #
})
c.crawl(Course_spider)
c.start()
from scrapy.http import Request
from scrapy import Spider
import json
class edx_scraper(Spider):
name = "edxScraper"
start_urls = [
'https://www.edx.org/api/v1/catalog/search?selected_facets[]=content_type_exact%3Acourserun&selected_facets[]=language_exact%3ASpanish&page=1&page_size=9&partner=edx&hidden=0&content_type[]=courserun&content_type[]=program&featured_course_ids=course-v1%3AHarvardX+CS50B+Business%2Ccourse-v1%3AMicrosoft+DAT206x+1T2018%2Ccourse-v1%3ALinuxFoundationX+LFS171x+3T2017%2Ccourse-v1%3AHarvardX+HDS2825x+1T2018%2Ccourse-v1%3AMITx+6.00.1x+2T2017_2%2Ccourse-v1%3AWageningenX+NUTR101x+1T2018&featured_programs_uuids=452d5bbb-00a4-4cc9-99d7-d7dd43c2bece%2Cbef7201a-6f97-40ad-ad17-d5ea8be1eec8%2C9b729425-b524-4344-baaa-107abdee62c6%2Cfb8c5b14-f8d2-4ae1-a3ec-c7d4d6363e26%2Ca9cbdeb6-5fc0-44ef-97f7-9ed605a149db%2Cf977e7e8-6376-400f-aec6-84dcdb7e9c73'
]
def parse(self, response):
data = json.loads(response.text)
for course in data['objects']['results']:
url = 'https://www.edx.org/api/catalog/v2/courses/' + course['key']
yield response.follow(url, self.course_parse)
if 'next' in data['objects'] is not None:
yield response.follow(data['objects']['next'], self.parse)
def course_parse(self, response):
course = json.loads(response.text)
yield{
'name': course['title'],
'effort': course['effort'],
}

How to make Scrapy crawl only 1 page (make it non recursive)?

I'm using the latest version of scrapy (http://doc.scrapy.org/en/latest/index.html) and am trying to figure out how to make scrapy crawl only the URL(s) fed to it as part of start_url list. In most cases I want to crawl only 1 page, but in some cases there may be multiple pages that I will specify. I don't want it to crawl to other pages.
I've tried setting the depth level=1 but I'm not sure that in testing it accomplished what I was hoping to achieve.
Any help will be greatly appreciated!
Thank you!
2015-12-22 - Code update:
# -*- coding: utf-8 -*-
import scrapy
from generic.items import GenericItem
class GenericspiderSpider(scrapy.Spider):
name = "genericspider"
def __init__(self, domain, start_url, entity_id):
self.allowed_domains = [domain]
self.start_urls = [start_url]
self.entity_id = entity_id
def parse(self, response):
for href in response.css("a::attr('href')"):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath("//body//a"):
item = GenericItem()
item['entity_id'] = self.entity_id
# gets the actual email address
item['emails'] = response.xpath("//a[starts-with(#href, 'mailto')]").re(r'mailto:\s*(.*?)"')
yield item
Below, in the first response, you mention using a generic spider --- isn't that what I'm doing in the code? Also are you suggesting I remove the
callback=self.parse_dir_contents
from the parse function?
Thank you.
looks like you are using CrawlSpider which is a special kind of Spider to crawl multiple categories inside pages.
For only crawling the urls specified inside start_urls just override the parse method, as that is the default callback of the start requests.
Below is a code for the spider that will scrape the title from a blog (Note: the xpath might not be the same for every blog)
Filename: /spiders/my_spider.py
class MySpider(scrapy.Spider):
name = "craig"
allowed_domains = ["www.blogtrepreneur.com"]
start_urls = ["http://www.blogtrepreneur.com/the-best-juice-cleanse-for-weight-loss/"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
dive = response.xpath('//div[#id="tve_editor"]')
items = []
item = DmozItem()
item["title"] = response.xpath('//h1/text()').extract()
item["article"] = response.xpath('//div[#id="tve_editor"]//p//text()').extract()
items.append(item)
return items
The above code will only fetch the title and the article body of the given article.
I got the same problem, because I was using
import scrapy from scrapy.spiders import CrawlSpider
Then I changed to
import scrapy from scrapy.spiders import Spider
And change the class to
class mySpider(Spider):

scrapy crawl multiple pages, extracting data and saving into mysql

Hi can someone help me out I seem to be stuck, I am learning how to crawl and save into mysql us scrapy. I am trying to get scrapy to crawl all of the website pages. Starting with "start_urls", but it does not seem to automatically crawl all of the pages only the one, it does save into mysql with pipelines.py. It does also crawl all pages when provided with urls in a f = open("urls.txt") as well as saves data using pipelines.py.
here is my code
test.py
import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from gotp.items import GotPItem
from scrapy.log import *
from gotp.settings import *
from gotp.items import *
class GotP(CrawlSpider):
name = "gotp"
allowed_domains = ["www.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/sss"]
rules = [
Rule(SgmlLinkExtractor(
allow=('')),
callback ="parse",
follow=True
)
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
prices = hxs.select("//div[#class="sliderforward arrow"]")
for price in prices:
item = GotPItem()
item ["price"] = price.select("text()").extract()
yield item
If I understand correctly, you are trying to follow the pagination and extract the results.
In this case, you can avoid using CrawlSpider and use regular Spider class.
The idea would be to parse the first page, extract total results count, calculate how much pages to go and yield scrapy.Request instances to the same URL providing s GET parameter value.
Implementation example:
import scrapy
class GotP(scrapy.Spider):
name = "gotp"
allowed_domains = ["www.sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/sss"]
results_per_page = 100
def parse(self, response):
total_count = int(response.xpath('//span[#class="totalcount"]/text()').extract()[0])
for page in xrange(0, total_count, self.results_per_page):
yield scrapy.Request("http://sfbay.craigslist.org/search/sss?s=%s&" % page, callback=self.parse_result, dont_filter=True)
def parse_result(self, response):
results = response.xpath("//p[#data-pid]")
for result in results:
try:
print result.xpath(".//span[#class='price']/text()").extract()[0]
except IndexError:
print "Unknown price"
This would follow the pagination and print prices on the console. Hope this is a good starting point.

Categories