Scrapy extracting from javascript element - python

I am trying to extract data from the following URL using Scrapy.
import re
import json
import scrapy
import unicodedata
from collections import Counter

# command: scrapy crawl <spidername> -o <outputfile>

class PostsSpider(scrapy.Spider):
    name = "ljcmnt"
    start_urls = ['https://asperger.livejournal.com/3084959.html']

    def parse(self, response):
        comments_ids = response.xpath("//*[@id='comments']/div[5]/div/@data-tid").extract()
        print comments_ids
But since the element attribute (data-tid) is added by JavaScript, I am not able to catch it.
Any help in getting the data-tid values from the start URL would be appreciated.
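The data-tid attributes are attached to the comment nodes by LiveJournal's JavaScript after the page loads, so they are not in the HTML that Scrapy downloads. One common workaround (not part of the original question; it assumes the scrapy-splash package is installed, a Splash instance is running on localhost:8050, and the project settings are configured as in the scrapy-splash README) is to let a rendering service execute the JavaScript before parsing:

import scrapy
from scrapy_splash import SplashRequest

class PostsSpider(scrapy.Spider):
    name = "ljcmnt"
    start_urls = ['https://asperger.livejournal.com/3084959.html']

    def start_requests(self):
        for url in self.start_urls:
            # Ask Splash to render the page (and run its JavaScript) first
            yield SplashRequest(url, self.parse, args={'wait': 2})

    def parse(self, response):
        # The rendered DOM now contains the attributes added by JavaScript
        yield {'comments_ids': response.xpath("//*[@id='comments']//@data-tid").extract()}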

Related

Python web crawling using Scrapy unable to fill form

I am trying to crawl this site, which requires me to fill in a form with a postal code to reach the store info.
The website: https://www.aldi-sued.de/de/filialen.html
I have written the following code but don't know what's wrong. Please help:
from __future__ import unicode_literals
import logging
import scrapy
from scrapy.loader import ItemLoader
from ..items import StoreItem

logger = logging.getLogger(__name__)

class StoreSpider(scrapy.Spider):
    name = "aldib"
    start_urls = ["https://www.aldi-sued.de/de/filialen.html"]

    def parse(self, response):
        yield scrapy.FormRequest(url="https://www.aldi-sued.de/de/filialen.html",
                                 formdata={"search": "38644"},
                                 callback=self.parse_stores)

    def parse_stores(self, response):
        for store in response.css('div.dealer-list > div.dealer-item-content'):
            name = store.xpath("span.dealer-name > strong::text").extract()
            sl = ItemLoader(item=StoreItem(), selector=store, response=response)
            sl.add_value("Name", name)
            yield sl.load_item()
I suspect that there are two forms on the website, one for the site search and one for the store search, and that I was unable to choose the right one in the first version of the code. So I changed the request part to:
def parse(self, response):
    yield scrapy.FormRequest.from_response(response,
                                           formid="storeSearchForm",
                                           formdata={"search": "38644"},
                                           callback=self.parse_stores)
In the end I still cannot reach the names of the stores for that postal code.
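One way to see what the form submission actually returns (a debugging sketch on my part, not from the original question; the store list may well be injected by JavaScript after an XHR call, in which case it will not appear in this response at all) is to open the downloaded page in a browser and log what the selectors match:

from scrapy.utils.response import open_in_browser

def parse_stores(self, response):
    # Show the HTML Scrapy actually received for the form submission
    open_in_browser(response)
    # Log how many store containers the CSS selector matches
    stores = response.css('div.dealer-list > div.dealer-item-content')
    self.logger.info("matched %d store blocks", len(stores))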

Scrapy works in shell but not in the code

I am facing an issue while developing my first spider in Scrapy. I am able to get the proper information in scrapy shell, but it does not work when I implement it in the code. I've read similar posts here but still was not able to figure out what I'm doing wrong.
import scrapy
from scrapy.loader import ItemLoader
from ..items import ScrapingamazonItem

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    start_urls = ['https://www.amazon.com/s?k=Office+Chair&lo=grid&crid=1N60K12GUA798&qid=1601040579&sprefix=chair&ref=sr_pg_1']

    def parse(self, response):
        items = response.css('.s-asin .sg-col-inner')
        for item in items:
            loader = ItemLoader(item=ScrapingamazonItem(), selector=item)
            loader.add_css('ProductName', '.a-color-base.a-text-normal::text')
            yield loader.load_item()
I am running it with scrapy crawl amazon_spider -o file.csv. The file comes back empty.
Any help is deeply appreciated! :)
Try
for item in items:
    loader = ItemLoader(item=ScrapingamazonItem(), selector=item)
    loader.add_css('ProductName', '.a-color-base.a-text-normal::text')
    yield loader.load_item()
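If the same selectors return data in scrapy shell but the crawl writes an empty file, the likely difference is the request itself: Amazon often serves a robot-check page to Scrapy's default User-Agent instead of the search results. A common mitigation (my assumption, not part of the answer above) is to send a browser-like User-Agent and slow the crawl down, for example via the spider's custom_settings:

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon_spider'
    # Illustrative values; any realistic browser User-Agent string works
    custom_settings = {
        'USER_AGENT': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/85.0.4183.102 Safari/537.36'),
        'DOWNLOAD_DELAY': 1,
    }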

How to use Scrapy sitemap spider on sites with text sitemaps?

I tried using a generic scrapy.Spider to follow links, but it didn't work, so I hit upon the idea of simplifying the process by accessing the sitemap.txt instead, but that didn't work either!
I wrote a simple example (to help me understand the algorithm) of a spider to follow the sitemap specified on my site: https://legion-216909.appspot.com/sitemap.txt It is meant to navigate the URLs specified in the sitemap, print them to the screen and output the results into a links.txt file. The code:
import scrapy
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    name = "spyder_PAGE"
    sitemap_urls = ['https://legion-216909.appspot.com/sitemap.txt']

    def parse(self, response):
        print(response.url)
        return response.url
I ran the above spider with scrapy crawl spyder_PAGE > links.txt, but that returned an empty text file. I have gone through the Scrapy docs multiple times, but there is something missing. Where am I going wrong?
SitemapSpider is expecting an XML sitemap format, causing the spider to exit with this error:
[scrapy.spiders.sitemap] WARNING: Ignoring invalid sitemap: <200 https://legion-216909.appspot.com/sitemap.txt>
Since your sitemap.txt file is just a simple list of URLs, it would be easier to just split them with a string method.
For example:
from scrapy import Spider, Request

class MySpider(Spider):
    name = "spyder_PAGE"
    start_urls = ['https://legion-216909.appspot.com/sitemap.txt']

    def parse(self, response):
        links = response.text.split('\n')
        for link in links:
            # yield a request to get this link
            print(link)

# https://legion-216909.appspot.com/index.html
# https://legion-216909.appspot.com/content.htm
# https://legion-216909.appspot.com/Dataset/module_4_literature/Unit_1/.DS_Store
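To actually crawl the listed pages rather than just print them, each non-empty line can be turned into a Request (a small sketch of my own; parse_page is a hypothetical callback, not part of the answer above):

    def parse(self, response):
        for link in response.text.splitlines():
            if link.strip():
                # Follow each URL listed in sitemap.txt
                yield Request(link.strip(), callback=self.parse_page)

    def parse_page(self, response):
        # Hypothetical callback: record the visited URL
        yield {'url': response.url}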
You only need to override _parse_sitemap(self, response) from SitemapSpider with the following:
from scrapy import Request
from scrapy.spiders import SitemapSpider

class MySpider(SitemapSpider):
    sitemap_urls = [...]
    sitemap_rules = [...]

    def _parse_sitemap(self, response):
        # yield a request for each url in the txt file that matches your filters
        urls = response.text.splitlines()
        it = self.sitemap_filter(urls)
        for loc in it:
            for r, c in self._cbs:
                if r.search(loc):
                    yield Request(loc, callback=c)
                    break

Using Urllib with Scrapy for Pagination

Trying to scrape the next page with Scrapy (Python 3.5) using the urllib library.
import datetime
import urllib.request
import urllib.error
import urllib.parse
import socket
import scrapy
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from properties.items import PropertiesItem

class BasicSpider(scrapy.Spider):
    name = "manual"
    allowed_domains = ["web"]

    # Start on the first index page
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    def parse(self, response):
        # Get the next index URLs and yield Requests
        next_selector = response.xpath('//*[contains(@class,"next")]//@href')
        for url in next_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url))

        # Get item URLs and yield Requests
        item_selector = response.xpath('//*[@itemprop="url"]/@href')
        for url in item_selector.extract():
            yield Request(urllib.parse.urljoin(response.url, url), callback=self.parse_item)

    def parse(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        return l.load_item()
Everything works just fine without errors, but Scrapy fetches only the first page, although according to the code it should fetch all the next pages.
Here is the output:
[{
    "title": [
        "bermondsey ec kennington drive acton seven rm",
        .......
        "mary conversion borders eastham with gas"
    ]
}]
// Only page 0 titles :(
Is anything wrong with the Request or urllib call syntax?
PS: the XPath works in scrapy shell 'URL'.
Let's start with the wrong uses of Python packages.
Request is used without being imported. Fix it with:
from scrapy import Request
urljoin is used the wrong way; it lives in urllib.parse, so first import it:
from urllib.parse import urljoin
and then use urljoin directly instead of calling urllib.parse.urljoin, i.e. change
yield Request(urllib.parse.urljoin(response.url, url))
yield Request(urllib.parse.urljoin(response.url, url), callback=self.parse_item)
to
yield Request(urljoin(response.url, url))
yield Request(urljoin(response.url, url), callback=self.parse_item)
parse_item is never called because it is never defined. Define it by renaming the second parse:
def parse_item(self, response):  # replace parse with parse_item
PS: If this code is from the Learning Scrapy book, here is a complete git example for the Python 3 version:
https://github.com/Rahulsharma0810/Scrapy-Pagination-URLJOIN-Example
You seem to have two parse functions, so effectively you only have the second one, since it overrides the first.
Just rename the second one to parse_item, as the rest of your code seems to indicate.
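Putting both answers together, a corrected version of the spider might look roughly like this (a sketch, assuming the same PropertiesItem from the book example; not a verified solution):

import urllib.parse
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from properties.items import PropertiesItem

class BasicSpider(scrapy.Spider):
    name = "manual"
    # kept as in the book; if follow-up requests get filtered as offsite,
    # adjust or remove allowed_domains
    allowed_domains = ["web"]
    start_urls = (
        'http://scrapybook.s3.amazonaws.com/properties/index_00000.html',
    )

    def parse(self, response):
        # Follow the pagination ("next") links
        for url in response.xpath('//*[contains(@class,"next")]//@href').extract():
            yield Request(urllib.parse.urljoin(response.url, url))
        # Follow item links with a differently named callback
        for url in response.xpath('//*[@itemprop="url"]/@href').extract():
            yield Request(urllib.parse.urljoin(response.url, url),
                          callback=self.parse_item)

    def parse_item(self, response):
        l = ItemLoader(item=PropertiesItem(), response=response)
        l.add_xpath('title', '//*[@itemprop="name"]/text()')
        return l.load_item()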

Python, Scrapy, Selenium: how to attach webdriver to "response" passed into a function to use it for further action

I am trying to use Selenium to obtain the value of the selected option from a drop-down list in a Scrapy spider, but am unsure how to go about it. It's my first interaction with Selenium.
As you can see in the code below, I create a request in the parse function which calls the parse_page function as a callback. In parse_page I want to extract the value of the selected option. I can't figure out how to attach the webdriver to the response passed into parse_page so that I can use it in Select. I have written the (obviously wrong) code below :(
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.exceptions import CloseSpider
import logging
import scrapy
from scrapy.utils.response import open_in_browser
from scrapy.http import FormRequest
from scrapy.http import Request
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from activityadvisor.items import TruYog

logging.basicConfig()
logger = logging.getLogger()

class TrueYoga(Spider):
    name = "trueyoga"
    allowed_domains = ["trueyoga.com.sg", "trueclassbooking.com.sg"]
    start_urls = [
        "http://trueclassbooking.com.sg/frames/class-schedules.aspx",
    ]

    def parse(self, response):
        clubs = []
        clubs = Selector(response).xpath('//div[@class="club-selections"]/div/div/div/a/@rel').extract()
        clubs.sort()
        print 'length of clubs = ', len(clubs), '1st content of clubs = ', clubs
        req = []
        for club in clubs:
            payload = {'ctl00$cphContents$ddlClub': club}
            req.append(FormRequest.from_response(response, formdata=payload,
                                                 dont_click=True, callback=self.parse_page))
        for request in req:
            yield request

    def parse_page(self, response):
        driver = webdriver.Firefox()
        driver.get(response)
        clubSelect = Select(driver.find_element_by_id("ctl00_cphContents_ddlClub"))
        option = clubSelect.first_selected_option
        print option.text
Is there any way to obtain this option value in Scrapy without using Selenium? My searches on Google and Stack Overflow didn't yield any useful answers so far.
Thanks for the help!
I would recommend using Downloader Middleware to pass the Selenium response over to your spider's parse method. Take a look at the example I wrote as an answer to another question.
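The idea, roughly, is to download the page with Selenium inside a downloader middleware and hand the rendered HTML back to Scrapy as the response your callbacks receive. A minimal sketch (class and setting names are mine, not the linked example; it only replays GET requests, so form posts would need extra handling):

# settings.py (assumed):
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.SeleniumMiddleware': 543}

from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Firefox()

    def process_request(self, request, spider):
        # Load the URL in a real browser and return its rendered HTML
        # as the response that reaches the spider's callback
        self.driver.get(request.url)
        return HtmlResponse(self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)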
If you look at the response you already get, the select boxes and their options are there; one of those options has the attribute selected="selected". I think you should read that attribute to avoid using Selenium:
def parse_page(self, response):
    response.xpath("//select[@id='ctl00_cphContents_ddlClub']//option[@selected = 'selected']").extract()
