Extract data from ajax - python

I'm trying to extract data (title, price and description) from the AJAX endpoint, but it doesn't work even after changing the user agent.
Link: https://scrapingclub.com/exercise/detail_header/
AJAX (the data I want to extract): https://scrapingclub.com/exercise/ajaxdetail_header/
import scrapy

class UseragentSpider(scrapy.Spider):
    name = 'useragent'
    allowed_domains = ['scrapingclub.com/exercise/ajaxdetail_header/']
    start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

    def parse(self, response):
        cardb = response.xpath("//div[@class='card-body']")
        for thing in cardb:
            title = thing.xpath(".//h3")
            yield {'title': title}
Error log :
2020-09-07 20:34:39 [scrapy.core.engine] INFO: Spider opened
2020-09-07 20:34:39 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-09-07 20:34:39 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://scrapingclub.com/robots.txt> (referer: None)
2020-09-07 20:34:40 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://scrapingclub.com/exercise/ajaxdetail_header/> (referer: None)
2020-09-07 20:34:40 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://scrapingclub.com/exercise/ajaxdetail_header/>: HTTP status code is not handled or not allowed

AJAX requests normally send the header
'X-Requested-With': 'XMLHttpRequest'
Not all servers check for it, but this server does. It does not, however, check the User-Agent.
The server sends the data as JSON, so XPath will be useless.
I tested it with requests instead of Scrapy because it was simpler for me.
import requests

headers = {
    #'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
}

url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'
response = requests.get(url, headers=headers)

data = response.json()

print(data)
print('type:', type(data))
print('keys:', data.keys())

print('--- manually ---')
print('price:', data['price'])
print('title:', data['title'])

print('--- for-loop ---')
for key, value in data.items():
    print('{}: {}'.format(key, value))
Result:
{'img_path': '/static/img/00959-A.jpg', 'price': '$24.99', 'description': 'Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.', 'title': 'Crinkled Flounced Blouse'}
type: <class 'dict'>
keys: dict_keys(['img_path', 'price', 'description', 'title'])
--- manually ---
price: $24.99
title: Crinkled Flounced Blouse
--- for-loop ---
img_path: /static/img/00959-A.jpg
price: $24.99
description: Blouse in airy, crinkled fabric with a printed pattern. Small stand-up collar, concealed buttons at front, and flounces at front. Long sleeves with buttons at cuffs. Rounded hem. 100% polyester. Machine wash cold.
title: Crinkled Flounced Blouse
EDIT:
The same with Scrapy. I use start_requests() to create a Request() with the 'X-Requested-With' header.
You can put all the code in one file and run python script.py without creating a project.
import scrapy
import json

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        url = 'https://scrapingclub.com/exercise/ajaxdetail_header/'
        headers = {
            #'User-Agent': 'Mozilla/5.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
        yield scrapy.http.Request(url, headers=headers)

    def parse(self, response):
        print('url:', response.url)

        data = response.json()

        print(data)
        print('type:', type(data))
        print('keys:', data.keys())

        print('--- manually ---')
        print('price:', data['price'])
        print('title:', data['title'])

        print('--- for-loop ---')
        for key, value in data.items():
            print('{}: {}'.format(key, value))

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',

    # save in file CSV, JSON or XML
    #'FEED_FORMAT': 'csv',     # csv, json, xml
    #'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()
EDIT:
The same, using the setting DEFAULT_REQUEST_HEADERS.
import scrapy
import json

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['https://scrapingclub.com/exercise/ajaxdetail_header/']

    def parse(self, response):
        print('url:', response.url)
        #print('headers:', response.request.headers)

        data = response.json()

        print(data)
        print('type:', type(data))
        print('keys:', data.keys())

        print('--- manually ---')
        print('price:', data['price'])
        print('title:', data['title'])

        print('--- for-loop ---')
        for key, value in data.items():
            print('{}: {}'.format(key, value))

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',

    'DEFAULT_REQUEST_HEADERS': {
        #'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
    },

    # save in file CSV, JSON or XML
    #'FEED_FORMAT': 'csv',     # csv, json, xml
    #'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()

Related

Webscraping with Selenium, problems with scraping child pages

I'm trying to scrape this website with Selenium:
https://startupbase.com.br/home/startups?q=&states=all&cities=all&segments=Constru%C3%A7%C3%A3o%20Civil~Imobili%C3%A1rio&targets=all&phases=all&models=all&badges=all
What I need: to enter every child page, extract a lot of information, and do this for every company that is shown.
The code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

options = Options()
options.add_argument("window-size=1400,600")

from fake_useragent import UserAgent
ua = UserAgent()
a = ua.random
user_agent = ua.random
print(user_agent)
options.add_argument(f'user-agent={user_agent}')

driver = webdriver.Chrome('chromedriver')
driver.get("https://startupbase.com.br/home/startups?q=&states=all&cities=all&segments=Construção%20Civil~Imobiliário&targets=all&phases=all&models=all&badges=all")

import time
time.sleep(3)

cookies_button = driver.find_element_by_xpath("//button[contains(text(), 'Accept')]")
cookies_button.click()
time.sleep(3)

# Lists that we will iterate to
founder_name = []
name_company = []
site_url = []
local = []
mercado = []
publico_alvo = []
modelo_receita = []
momento = []
sobre = []
fundacao = []
tamanho_time = []
linkedin_company = []
linkedin_founder = []
atualizacao = []

while True:
    time.sleep(2)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    code = soup.prettify()
    print(code)
    containers = soup.find_all("div", {"class": "search-body__item"})
    for container in containers:
        internal_page = container.find('a', href=True)
The code is still at an early stage because I'm trying to enter the child pages and I can't manage to do that.
I've already tried:
internal_page = driver.find_element_by_xpath("/html/body/app-root/ng-component/app-layout/div/div/div/div/div/app-layout-column/ng-component/div/ais-instantsearch/div/div/div/div[2]/section/ais-infinite-hits/div/div[2]/a")
internal_page.click()
Could someone give a light, please?
You can use a different approach instead of simulating clicks on all the buttons.
If you check the link of each startup, it is https://startupbase.com.br/c/startup/ followed by the startup's name with the spaces replaced by dashes.
So you can use a base URL:
base_url = 'https://startupbase.com.br/c/startup/{}'
You can get the titles of every startup using the CSS selector .org__title.sb-size-6:
titles = ['-'.join(title.text.split()) for title in driver.find_elements_by_css_selector('.org__title.sb-size-6')]
After that you can iterate through all the titles and append each name to the base URL, with dashes instead of spaces:
for title in titles:
    url = base_url.format(title)
Then run whatever request code you want with the url variable (a sketch follows the code block below).
Code:
base_url = 'https://startupbase.com.br/c/startup/{}'
titles = ['-'.join(title.text.split()) for title in driver.find_elements_by_css_selector('.org__title.sb-size-6')]

for title in titles:
    url = base_url.format(title)
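As a rough illustration of that last step, here is a minimal sketch that opens each constructed URL with the same Selenium driver and hands the page source to BeautifulSoup. The '.profile__title' selector is only a placeholder assumption; inspect a child page and swap in the real selectors for the fields you need:
import time
from bs4 import BeautifulSoup

base_url = 'https://startupbase.com.br/c/startup/{}'
titles = ['-'.join(title.text.split()) for title in driver.find_elements_by_css_selector('.org__title.sb-size-6')]

results = []
for title in titles:
    url = base_url.format(title)
    driver.get(url)   # open the child page in the same browser session
    time.sleep(2)     # crude wait for the Angular page to render
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    name = soup.select_one('.profile__title')   # placeholder selector, replace with the real one
    results.append({'url': url, 'name': name.get_text(strip=True) if name else None})
print(results)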
You can do that easily with Scrapy by calling the site's API directly, which returns a JSON response to a POST request.
CODE:
import scrapy
import json

class ScrollSpider(scrapy.Spider):
    body = '{"requests":[{"indexName":"prod_STARTUPBASE","params":"maxValuesPerFacet=100&query=&highlightPreTag=__ais-highlight__&highlightPostTag=__%2Fais-highlight__&page=0&facets=%5B%22segments.primary%22%2C%22state%22%2C%22place%22%2C%22business_target%22%2C%22business_phase%22%2C%22business_model%22%2C%22badges.name%22%5D&tagFilters=&facetFilters=%5B%5B%22segments.primary%3AConstru%C3%A7%C3%A3o%20Civil%22%2C%22segments.primary%3AImobili%C3%A1rio%22%5D%5D"},{"indexName":"prod_STARTUPBASE","params":"maxValuesPerFacet=100&query=&highlightPreTag=__ais-highlight__&highlightPostTag=__%2Fais-highlight__&page=0&hitsPerPage=1&attributesToRetrieve=%5B%5D&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&analytics=false&clickAnalytics=false&facets=segments.primary"}]}'
    name = 'scroll'

    def start_requests(self):
        yield scrapy.Request(
            url = 'https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399',
            callback = self.parse,
            method = 'POST',
            body = self.body,
            headers = {'content-type': 'application/x-www-form-urlencoded',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
        )

    def parse(self, response):
        resp = json.loads(response.body)
        hits = resp['results'][0]['hits']
        for hit in hits:
            yield {
                'Name': hit['name']
            }
Output:
{'Name': 'Constr Up'}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'Agenciou!'}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'inQuality System'}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'Constructweb - Gestão eficiente de reformas'}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'Apê Fácil'}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'Glück Imóveis '}
2021-09-04 08:07:35 [scrapy.core.scraper] DEBUG: Scraped from <200 https://fwtbnxlfs6-3.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20angular%20(9.0.7)%3B%20angular-instantsearch%20(3.0.0-beta.5)%3B%20instantsearch.js%20(4.7.0)%3B%20JS%20Helper%20(3.1.2)&x-algolia-application-id=FWTBNXLFS6&x-algolia-api-key=e5fef9eab51259b54d385c6f010cc399>
{'Name': 'ArqColab'}
... so on
Response:
downloader/response_status_count/200
As I've said in the comment I made to your question, you are selecting the wrong element. You have to select the parent element of the <a>, since that is what holds the click action, and you have to use XPath to get to the parent element.
internal_page = driver.find_element_by_xpath("/html/body/app-root/ng-component/app-layout/div/div/div/div/div/app-layout-column/ng-component/div/ais-instantsearch/div/div/div/div[2]/section/ais-infinite-hits/div/div[2]/a")
internal_page.find_element_by_xpath("./..").click()

Scrapy spider is not working when trying to iterate over crawled urls

I'm kind of a newbie with Scrapy. My spider is not working properly when I try to scrape data from a forum. When I run the spider, it only prints the URLs and stops afterwards. So I think the problem is in how the two functions parse and parse_data work together, but I may be wrong. Here is my code:
import scrapy, time

class ForumSpiderSpider(scrapy.Spider):
    name = 'forum_spider'
    allowed_domains = ['visforvoltage.org/latest_tech/']
    start_urls = ['http://visforvoltage.org/latest_tech//']

    def parse(self, response):
        for href in response.css(r"tbody a[href*='/forum/']::attr(href)").extract():
            url = response.urljoin(href)
            print(url)
            req = scrapy.Request(url, callback=self.parse_data)
            time.sleep(10)
            yield req

    def parse_data(self, response):
        for url in response.css('html').extract():
            data = {}
            data['name'] = response.css(r"div[class='author-pane-line author-name'] span[class='username']::text").extract()
            data['date'] = response.css(r"div[class='forum-posted-on']:contains('-') ::text").extract()
            data['title'] = response.css(r"div[class='section'] h1[class='title']::text").extract()
            data['body'] = response.css(r"div[class='field-items'] p::text").extract()
            yield data

        next_page = response.css(r"li[class='pager-next'] a[href*='page=']::attr(href)").extract()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse)
Here is the output:
2020-07-23 23:09:58 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'visforvoltage.org': <GET https://visforvoltage.org/forum/14521-aquired-a123-m1-cells-need-charger-and-bms>
https://visforvoltage.org/forum/14448-battery-charger-problems
https://visforvoltage.org/forum/14191-vectrix-trickle-charger
https://visforvoltage.org/forum/14460-what-epoxy-would-you-recommend-loose-magnet-repair
https://visforvoltage.org/forum/14429-importance-correct-grounding-and-well-built-plugs
https://visforvoltage.org/forum/14457-147v-charger-24v-lead-acid-charger-and-dying-vectrix-cells
https://visforvoltage.org/forum/6723-lithium-safety-e-bike
https://visforvoltage.org/forum/11488-how-does-24v-4-wire-reversible-motor-work
https://visforvoltage.org/forum/14444-new-sevcon-gen-4-80v-sale
https://visforvoltage.org/forum/14443-new-sevcon-gen-4-80v-sale
https://visforvoltage.org/forum/12495-3500w-hub-motor-question-about-real-power-and-breaker
https://visforvoltage.org/forum/14402-vectrix-vx-1-battery-pack-problem
https://visforvoltage.org/forum/14068-vectrix-trickle-charger
https://visforvoltage.org/forum/2931-drill-motors
https://visforvoltage.org/forum/14384-help-repairing-gio-hub-motor-freewheel-sprocket
https://visforvoltage.org/forum/14381-zev-charger
https://visforvoltage.org/forum/8726-performance-unite-my1020-1000w-motor
https://visforvoltage.org/forum/7012-controler-mod-veloteq
https://visforvoltage.org/forum/14331-scooter-chargers-general-nfpanec
https://visforvoltage.org/forum/14320-charging-nissan-leaf-cells-lifepo4-charger
https://visforvoltage.org/forum/3763-newber-needs-help-new-gift-kollmorgan-hub-motor
https://visforvoltage.org/forum/14096-european-bldc-controller-seller
https://visforvoltage.org/forum/14242-lithium-bms-vs-manual-battery-balancing
https://visforvoltage.org/forum/14236-mosfet-wiring-ignition-key
https://visforvoltage.org/forum/2007-ok-dumb-question-time%3A-about-golf-cart-controllers
https://visforvoltage.org/forum/10524-my-mf70-recommended-powerpoles-arrived-today
https://visforvoltage.org/forum/9460-how-determine-battery-capacity
https://visforvoltage.org/forum/7705-tricking-0-5-v-hall-effect-throttle
https://visforvoltage.org/forum/13446-overcharged-lead-acid-battery-what-do
https://visforvoltage.org/forum/14157-reliable-high-performance-battery-enoeco-bt-p380
https://visforvoltage.org/forum/2702-hands-test-48-volt-20-ah-lifepo4-pack-ping-battery
https://visforvoltage.org/forum/14034-simple-and-cheap-ev-can-bus-adaptor
https://visforvoltage.org/forum/13933-zivan-ng-3-charger-specs-and-use
https://visforvoltage.org/forum/13099-controllers
https://visforvoltage.org/forum/13866-electric-motor-werks-demos-25-kilowatt-diy-chademo-leaf
https://visforvoltage.org/forum/13796-motor-theory-ac-vs-bldc
https://visforvoltage.org/forum/6184-bypass-bms-lifepo4-good-idea-or-not
https://visforvoltage.org/forum/13763-positive-feedback-kelly-controller
https://visforvoltage.org/forum/13764-any-users-smart-battery-drop-replacement-zapino-and-others
https://visforvoltage.org/forum/13760-contactor-or-fuse-position-circuit-rules-why
https://visforvoltage.org/forum/13759-contactor-or-fuse-position-circuit-rules-why
https://visforvoltage.org/forum/12725-repairing-lithium-battery-pack
https://visforvoltage.org/forum/13752-questions-sepex-motor-theory
https://visforvoltage.org/forum/13738-programming-curtis-controller-software
https://visforvoltage.org/forum/13741-making-own-simple-controller
https://visforvoltage.org/forum/12420-idea-charging-electric-car-portably-wo-relying-electricity-infrastructure
2020-07-23 23:17:28 [scrapy.extensions.logstats] INFO: Crawled 2 pages (at 2 pages/min), scraped 0 items (at 0 items/min)
2020-07-23 23:17:28 [scrapy.core.engine] INFO: Closing spider (finished)
As I see it, the spider didn't iterate over these links and collect the data from them. What could be the reason for that?
I would really appreciate any help. Thank you!
It works for me:
import scrapy, time

class ForumSpiderSpider(scrapy.Spider):
    name = 'forum_spider'
    allowed_domains = ['visforvoltage.org/latest_tech/']
    start_urls = ['http://visforvoltage.org/latest_tech/']

    def parse(self, response):
        for href in response.css(r"tbody a[href*='/forum/']::attr(href)").extract():
            url = response.urljoin(href)
            req = scrapy.Request(url, callback=self.parse_data, dont_filter=True)
            yield req

    def parse_data(self, response):
        for url in response.css('html'):
            data = {}
            data['name'] = url.css(r"div[class='author-pane-line author-name'] span[class='username']::text").extract()
            data['date'] = url.css(r"div[class='forum-posted-on']:contains('-') ::text").extract()
            data['title'] = url.css(r"div[class='section'] h1[class='title']::text").extract()
            data['body'] = url.css(r"div[class='field-items'] p::text").extract()
            yield data

        # extract_first() returns a single string (or None), which is what urljoin() expects
        next_page = response.css(r"li[class='pager-next'] a[href*='page=']::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse)
The issue probably is that the requests are getting filtered, as they are not part of the allowed domain.
allowed_domains = ['visforvoltage.org/latest_tech/']
The new request URLs:
https://visforvoltage.org/forum/14448-battery-charger-problems
https://visforvoltage.org/forum/14191-vectrix-trickle-charger
...
Since the requests go to visforvoltage.org/forum/ and not to visforvoltage.org/latest_tech/, they get filtered.
You can remove the allowed_domains property entirely, or change it to:
allowed_domains = ['visforvoltage.org']
This will make the spider crawl those pages, and you will see a different value on this line in your log:
2020-07-23 23:17:28 [scrapy.extensions.logstats] INFO: Crawled 2 pages (at 2 pages/min), scraped 0 items (at 0 items/min)
However, the selectors in the parsing don't seem right.
This selector will select the whole page, and the extract() method will return it as a list. So you will have a list with only one string, composed of all the HTML of the page.
response.css('html').extract()
You can read more on selectors and the getall()/extract() method here.
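To make that concrete, here is a minimal sketch of how parse_data() could be tightened up with get()/getall() instead of wrapping everything in response.css('html'); the CSS classes are taken from the question's selectors and are assumed to still match the forum markup:
def parse_data(self, response):
    # get() returns the first matching string (or None), getall() returns a list of all matches
    yield {
        'title': response.css("div.section h1.title::text").get(),
        'name': response.css("div.author-pane-line.author-name span.username::text").getall(),
        'date': response.css("div.forum-posted-on::text").getall(),
        'body': response.css("div.field-items p::text").getall(),
    }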

Sequential scraping from multiple start_urls leading to error in parsing

First, highest appreciation for all of your work answering noob questions like this one.
Second, as it seems to be a quite common problem, I found (IMO) related questions such as:
Scrapy: Wait for a specific url to be parsed before parsing others
However, at my current state of understanding it is not straightforward to adapt the suggestions to my specific case, and I would really appreciate your help.
Problem outline (running on Python 3.7.1, Scrapy 1.5.1):
I want to scrape data from every link collected on pages like this
https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1
then from all links on another collection
https://www.gipfelbuch.ch/gipfelbuch/touren/seite/650
I manage to get the desired information (only two elements shown here) if I run the spider for one page (e.g. page 1 or 650) at a time. (Note that I restricted the number of links crawled per page to 2.) However, once I have multiple start_urls (setting two elements in the list [1,650] in the code below), the parsed data is no longer consistent. Apparently at least one element is not found by the XPath. I suspect some (or a lot of) incorrect logic in how I handle/pass the requests, which leads to parsing not happening in the intended order.
Code:
class SlfSpider1Spider(CrawlSpider):
    name = 'slf_spider1'
    custom_settings = { 'CONCURRENT_REQUESTS': '1' }
    allowed_domains = ['gipfelbuch.ch']
    start_urls = ['https://www.gipfelbuch.ch/gipfelbuch/touren/seite/'+str(i) for i in [1,650]]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            print('#### START REQUESTS: ', url)
            yield scrapy.Request(url, callback=self.parse_verhaeltnisse, dont_filter=True)

    def parse_verhaeltnisse(self, response):
        links = response.xpath('//td//@href').extract()
        for link in links[0:2]:
            print('##### PARSING: ', link)
            abs_link = 'https://www.gipfelbuch.ch/'+link
            yield scrapy.Request(abs_link, callback=self.parse_gipfelbuch_item, dont_filter=True)

    def parse_gipfelbuch_item(self, response):
        route = response.xpath('/html/body/main/div[4]/div[@class="col_f"]//div[@class="togglebox cont_item mt"]//div[@class="label_container"]')
        print('#### PARSER OUTPUT: ')
        key = [route[i].xpath('string(./label)').extract()[0] for i in range(len(route))]
        value = [route[i].xpath('string(div[@class="label_content"])').extract()[0] for i in range(len(route))]
        fields = dict(zip(key, value))
        print('Route: ', fields['Gipfelname'])
        print('Comments: ', fields['Verhältnis-Beschreibung'])
        print('Length of dict extracted from Route: {}'.format(len(route)))
        return
Command prompt
2019-03-18 15:42:27 [scrapy.core.engine] INFO: Spider opened
2019-03-18 15:42:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-03-18 15:42:27 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
#### START REQUESTS: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1
2019-03-18 15:42:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1> (referer: None)
#### START REQUESTS: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/650
##### PARSING: /gipfelbuch/detail/id/101559/Skitour_Snowboardtour/Beaufort
##### PARSING: /gipfelbuch/detail/id/101557/Skitour_Snowboardtour/Blinnenhorn
2019-03-18 15:42:30 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch/gipfelbuch/touren/seite/650> (referer: None)
##### PARSING: /gipfelbuch/detail/id/69022/Alpine_Wanderung/Schwaendeliflue
##### PARSING: /gipfelbuch/detail/id/69021/Schneeschuhtour/Cima_Portule
2019-03-18 15:42:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch//gipfelbuch/detail/id/101557/Skitour_Snowboardtour/Blinnenhorn> (referer: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1)
#### PARSER OUTPUT:
Route: Blinnenhorn/Corno Cieco
Comments: Am Samstag Aufstieg zur Corno Gries Hütte, ca. 2,5h ab All Acqua. Zustieg problemslos auf guter Spur. Zur Verwunderung waren wir die einzigsten auf der Hütte. Danke an Monika für die herzliche Bewirtung...
Length of dict extracted from Route: 27
2019-03-18 15:42:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch//gipfelbuch/detail/id/69021/Schneeschuhtour/Cima_Portule> (referer: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/650)
#### PARSER OUTPUT:
Route: Cima Portule
Comments: Sehr viel Schnee in dieser Gegend und viel Spirarbeit geleiset, deshalb auch viel Zeit gebraucht.
Length of dict extracted from Route: 19
2019-03-18 15:42:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch//gipfelbuch/detail/id/69022/Alpine_Wanderung/Schwaendeliflue> (referer: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/650)
#### PARSER OUTPUT:
Route: Schwändeliflue
Comments: Wege und Pfade meist schneefrei, da im Gebiet viel Hochmoor ist, z.t. sumpfig. Oberhalb 1600m und in Schattenlagen bis 1400m etwas Schnee (max.Schuhtief). Wetter sonnig und sehr warm für die Jahreszeit, T-Shirt - Wetter, Frühlingshaft....
Length of dict extracted from Route: 17
2019-03-18 15:42:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.gipfelbuch.ch//gipfelbuch/detail/id/101559/Skitour_Snowboardtour/Beaufort> (referer: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1)
#### PARSER OUTPUT:
Route: Beaufort
2019-03-18 15:42:40 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.gipfelbuch.ch//gipfelbuch/detail/id/101559/Skitour_Snowboardtour/Beaufort> (referer: https://www.gipfelbuch.ch/gipfelbuch/touren/seite/1)
Traceback (most recent call last):
File "C:\Users\Lenovo\Anaconda3\lib\site-packages\twisted\internet\defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\Lenovo\Dropbox\Code\avalanche\scrapy\slf1\slf1\spiders\slf_spider1.py", line 38, in parse_gipfelbuch_item
print('Comments: ', fields['Verhältnis-Beschreibung'])
KeyError: 'Verhältnis-Beschreibung'
2019-03-18 15:42:40 [scrapy.core.engine] INFO: Closing spider (finished)
Question:
How do I have to structure the first (for links) and second (for content) parse callbacks correctly? Why is the "PARSER OUTPUT" not in the order I would expect (first page 1, links top to bottom, then page 650, links top to bottom)?
I already tried reducing CONCURRENT_REQUESTS to 1 and setting DOWNLOAD_DELAY = 2.
I hope the question is clear enough... big thanks in advance.
If the problem is that multiple URLs are visited at the same time, you can visit them one by one using the spider_idle signal (https://docs.scrapy.org/en/latest/topics/signals.html).
The idea is the following:
1. start_requests only visits the first URL
2. when the spider gets idle, the method spider_idle is called
3. the method spider_idle deletes the first URL and visits the second URL
4. and so on...
The code would be something like this (I didn't try it):
import scrapy
from scrapy import signals          # needed for the idle-signal connection below
from scrapy.http import Request     # needed for the manual request in spider_idle
from scrapy.spiders import CrawlSpider

class SlfSpider1Spider(CrawlSpider):
    name = 'slf_spider1'
    custom_settings = { 'CONCURRENT_REQUESTS': '1' }
    allowed_domains = ['gipfelbuch.ch']
    start_urls = ['https://www.gipfelbuch.ch/gipfelbuch/touren/seite/'+str(i) for i in [1,650]]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SlfSpider1Spider, cls).from_crawler(crawler, *args, **kwargs)
        # Here you set which method the spider has to run when it gets idle
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    # Method which starts the requests by visiting the URLs specified in start_urls
    def start_requests(self):
        # the spider visits only the first provided URL
        url = self.start_urls[0]
        print('#### START REQUESTS: ', url)
        yield scrapy.Request(url, callback=self.parse_verhaeltnisse, dont_filter=True)

    def parse_verhaeltnisse(self, response):
        links = response.xpath('//td//@href').extract()
        for link in links[0:2]:
            print('##### PARSING: ', link)
            abs_link = 'https://www.gipfelbuch.ch/'+link
            yield scrapy.Request(abs_link, callback=self.parse_gipfelbuch_item, dont_filter=True)

    def parse_gipfelbuch_item(self, response):
        route = response.xpath('/html/body/main/div[4]/div[@class="col_f"]//div[@class="togglebox cont_item mt"]//div[@class="label_container"]')
        print('#### PARSER OUTPUT: ')
        key = [route[i].xpath('string(./label)').extract()[0] for i in range(len(route))]
        value = [route[i].xpath('string(div[@class="label_content"])').extract()[0] for i in range(len(route))]
        fields = dict(zip(key, value))
        print('Route: ', fields['Gipfelname'])
        print('Comments: ', fields['Verhältnis-Beschreibung'])
        print('Length of dict extracted from Route: {}'.format(len(route)))
        return

    # When the spider gets idle, it deletes the first url and visits the second, and so on...
    def spider_idle(self, spider):
        del(self.start_urls[0])
        if len(self.start_urls) > 0:
            url = self.start_urls[0]
            self.crawler.engine.crawl(Request(url, callback=self.parse_verhaeltnisse, dont_filter=True), spider)

Scrapy pagination is not working and optimized spider

Please help me optimize my Scrapy spider. In particular, next-page pagination is not working. There are a lot of pages, and each page has 50 items.
I catch the first page's 50 items (links) in parse_items, and the next pages' items should also be scraped in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent

class DiscoItem(scrapy.Item):
    release = Field()
    images = Field()

class discoSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['discogs.com']
    query = input('ENTER SEARCH MUSIC TYPE : ')
    start_urls = ['http://www.discogs.com/search?q=%s&type=release' % query]
    custome_settings = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'handle_httpstatus_list': [301, 302,],
        'download_delay': 10}

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        print('START parse \n')
        print("*****", response.url)

        #next page pagination
        next_page = response.css('a.pagination_next::attr(href)').extract_first()
        next_page = response.urljoin(next_page)
        yield scrapy.Request(url=next_page, callback=self.parse_items2)

        headers = {}
        for link in response.css('a.search_result_title ::attr(href)').extract():
            ua = UserAgent()  # random user agent
            headers['User-Agent'] = ua.random
            yield scrapy.Request(response.urljoin(link), headers=headers, callback=self.parse_items)

    def parse_items2(self, response):
        print('parse_items2 *******', response.url)
        yield scrapy.Request(url=response.url, callback=self.parse)

    def parse_items(self, response):
        print("parse_items**********", response.url)
        items = DiscoItem()
        for imge in response.css('div#page_content'):
            img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
            items['images'] = img
            release = imge.css('div.content a ::text').extract()
            items['release'] = release[4]
            yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy will filter duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method existing.
What you should do instead is specify the parse() method as the callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
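For reference, a minimal sketch of the whole parse() method with parse_items2() removed could look like this (selectors kept from the question, the random User-Agent handling left out for brevity):
def parse(self, response):
    # follow each search result to parse_items()
    for link in response.css('a.search_result_title ::attr(href)').extract():
        yield scrapy.Request(response.urljoin(link), callback=self.parse_items)

    # pagination: let the same parse() method handle the next results page
    next_page = response.css('a.pagination_next::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)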
Try this for pagination:
try:
    nextpage = response.urljoin(response.xpath("//*[contains(@rel,'next') and contains(@id,'next')]/@url")[0].extract())
    yield scrapy.Request(nextpage, callback=self.parse)
except:
    pass

Scrapy Python loop to next unscraped link

I'm trying to make my spider go over a list and scrape all the URLs it can find, follow them, scrape some data, and then continue with the next unscraped link. If I run the spider, I can see that it returns to the starting page but tries to scrape the same page again and just quits afterwards. Any code suggestions? I'm pretty new to Python.
import scrapy
import re
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
            request = scrapy.Request(item['listurl'], callback=self.parseBasicListingInfo)
            yield request

    def parseBasicListingInfo(item, response):
        item = ListResidentialItem()
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        return item
To clarify:
I'm passing [0] so it only takes the first link of the list,
but I want it to continue with the next unscraped link.
output after running the spider :
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/robots.txt> (referer: None)
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/list> (referer: None)
2016-07-18 12:11:21 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/link1> (referer: http://www.domain.com/list)
2016-07-18 12:11:21 [scrapy] DEBUG: Scraped from <200 http://www.domain.com/link1>
{'title': [u'\rlink1\r']}
This should work fine. Change the domain and the XPath and see:
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ProdItems(scrapy.Item):
    listurl = scrapy.Field()
    title = scrapy.Field()

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            list_urls = sel.xpath('//a[@id="link101"]/@href').extract()
            for url in list_urls:
                item['listurl'] = url
                yield scrapy.Request(url, callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        yield item
This is the line that's causing your problem:
item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
The "//" means "from the start of the document" which means that it scans from the very first tag and will always find the same first link. What you need to do is search relative to the start of the current tag using ".//" which means "from this tag onwards". Also your current for loop is visiting every tag in the document which is unneccesary. Try this:
def parse(self, response):
    for href in response.xpath('//a[@id="link101"]/@href').extract():
        item = ProductionItem()
        item['listurl'] = href
        yield scrapy.Request(href, callback=self.parseBasicListingInfo, meta={'item': item})
The xpath pulls the hrefs out of the links and returns them as a list you can iterate over.
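If you want to see the absolute-versus-relative difference in isolation, here is a small sketch with made-up HTML (the markup is purely illustrative):
from scrapy.selector import Selector

html = '<div class="row"><a href="/a">A</a></div><div class="row"><a href="/b">B</a></div>'
sel = Selector(text=html)
for row in sel.xpath('//div[@class="row"]'):
    # '//a' starts from the document root, so it always returns the first link ('/a')
    print(row.xpath('//a/@href').get())
    # './/a' starts from the current row, so it returns '/a' for the first row and '/b' for the second
    print(row.xpath('.//a/@href').get())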
