I am trying to use Python's Scrapy to extract course catalog information from a website. The thing is, each course has a link to its full page, and I need to iterate through those pages one by one to extract their information, which is later fed into an SQL database. However, I don't know how to make the spider follow those course URLs successively. Attached below is my code so far.
import scrapy

def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

class QuoteSpider(scrapy.Spider):
    name = 'courses'
    start_urls = [
        'http://catalog.aucegypt.edu/content.php?catoid=36&navoid=1738',
    ]

    def parse(self, response):
        # pages in span + a
        all_courses = response.css('.width a')
        for course in all_courses:
            courseURL = course.xpath('@href').extract()
            cleanCourseURL = find_between(str(courseURL), "['", "']")
            fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
            # iterate through urls
            QuoteSpider.start_urls += fullURL
            courseName = response.css('.block_content')
            yield {
                'courseNum': fullURL,
                'test': courseName
            }
Usually you need to yield a Request for this new URL and process it with a corresponding callback:
def parse(self, response):
    # pages in span + a
    all_courses = response.css('.width a')
    for course in all_courses:
        courseURL = course.xpath('@href').extract()
        cleanCourseURL = find_between(str(courseURL), "['", "']")
        fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
        courseName = response.css('.block_content')
        yield scrapy.Request(
            url=fullURL,
            callback=self.parse_course,
            cb_kwargs={
                'course_name': courseName,
            },
        )

def parse_course(self, response, course_name):
    # parse your course here...
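For instance, a minimal sketch of what parse_course could yield, reusing the .block_content selector from the question (the field names are placeholders, not taken from the original site):

def parse_course(self, response, course_name):
    # 'course_name' arrives via cb_kwargs from parse(); the selector below
    # is only a placeholder for the real course-page markup
    yield {
        'courseURL': response.url,
        'courseName': course_name,
        'details': response.css('.block_content ::text').extract(),
    }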
Related
def parse(self, response):
    category_names = []
    category_urls = []
    for item in response.css("#zg_browseRoot ul li"):
        category_url = item.css("a").css(self.CSS_URL).extract()
        category_name = item.css("a").css(self.CSS_TEXT).extract()
        category_url = [
            self.parse_url(category_url, 4) for category_url in category_url
        ]
        (category_url,) = category_url
        (category_name,) = category_name
        category_names.append(category_name)
        category_urls.append(category_url)

    for c_name, url in zip(category_names, category_urls):
        self.c_name = [c_name]
        yield scrapy.Request(url, callback=self.parse_categories)

def parse_url(self, url, number):
    parse = urlparse(url)
    split = parse.path.split("/")[:number]
    return f'{self.BASE_URL}{"/".join(split)}'

def parse_categories(self, response):
    sub_names = []
    sub_urls = []
    for item in response.css("#zg_browseRoot ul ul li"):
        sub_name = item.css("a").css(self.CSS_TEXT).extract()
        sub_url = item.css("a").css(self.CSS_URL).extract()
        sub_url = [self.parse_url(sub_url, 5) for sub_url in sub_url]
        (sub_url,) = sub_url
        (sub_name,) = sub_name
        sub_names.append(sub_name)
        sub_urls.append(sub_url)

    for sub_name, url in zip(sub_names, sub_urls):
        self.sub_name = [sub_name]
        # print("{}: {}, {}".format(url, self.sub_name, self.c_name))
        yield scrapy.Request(url, callback=self.parse_subcategories)

def parse_subcategories(self, response):
    url = self.parse_url(response.request.url, 5)
    print(f"{self.c_name}, {self.sub_name}, {url}")
Hello everyone,
I'm having an issue with my Scrapy approach. I'm trying to scrape a page that has categories and subcategories, and inside those are the items I want. I want to include the category and subcategory with each item scraped.
The problem is that Scrapy's callbacks are asynchronous, and zipping the URLs with the names doesn't seem to work: the for loop is processed first, the URLs end up in a generator, and the names stay behind. Can anyone help me work around this?
Thanks in advance,
Daniel.
You can pass arbitrary data along with the requests by using the cb_kwargs parameter. You can read about the details here.
Here is a simplified example:
def parse(self, response):
    rows = response.xpath('//div[@id="some-element"]')
    for row in rows:
        request_url = row.xpath('a/@href').get()
        category = row.xpath('a/text()').get()
        yield Request(
            url=request_url,
            callback=self.parse_category,
            cb_kwargs={'category': category}
        )

def parse_category(self, response, category):  # Notice the category arg in the func
    # Process here
    yield item
The data inserted in cb_kwargs is passed as a keyword argument into the callback function, so the key in the dict must match the name of the argument in the method definition.
cb_kwargs was introduced in Scrapy v1.7; if you are using an older version, you should use the meta param instead. You can read about it here; notice that its use is slightly different.
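For reference, a minimal sketch of the meta-based equivalent for Scrapy versions older than 1.7 (same hypothetical selectors as the example above):

def parse(self, response):
    rows = response.xpath('//div[@id="some-element"]')
    for row in rows:
        request_url = row.xpath('a/@href').extract_first()
        category = row.xpath('a/text()').extract_first()
        # pass the data through request.meta instead of cb_kwargs
        yield Request(url=request_url, callback=self.parse_category,
                      meta={'category': category})

def parse_category(self, response):
    # read it back from response.meta inside the callback
    category = response.meta['category']
    # ... build and yield your item here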
import scrapy

class rlgSpider(scrapy.Spider):
    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1'
    ]

    def parse(self, response):
        data = {}
        offers = response.xpath('//div[@class = "col-3-3"]')
        for offer in offers:
            for item in offer.xpath('//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position ="relative"]/h2').extract()
                yield data
Here is what I did so far; it doesn't work well. It scrapes the URL and not the h2 tag. How do I get the h2 when it's nested inside so many divs?
In order to search within an element in Scrapy you need to start your XPath with ".", otherwise you will be searching through the whole response. This is the correct way of doing it:
def parse(self, response):
    offers = response.xpath('//div[@class = "col-3-3"]')
    for offer in offers:
        for item in offer.xpath('.//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            data['name'] = item.xpath('.//h2/text()').extract_first()
            yield data
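To illustrate the difference between an absolute and a relative XPath inside a loop, here is a small self-contained snippet (the HTML fragment is made up purely for the demonstration):

from scrapy import Selector

# toy HTML, only for illustrating absolute vs. relative XPath
body = '<div class="a"><h2>first</h2></div><div class="b"><h2>second</h2></div>'
sel = Selector(text=body)
for div in sel.xpath('//div[@class="b"]'):
    # absolute path: ignores the current node and searches the whole document
    print(div.xpath('//h2/text()').extract_first())    # -> 'first'
    # relative path: searches only inside the current <div>
    print(div.xpath('.//h2/text()').extract_first())   # -> 'second'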
I'm working on a scraper using Scrapy. Here is the code:
import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_url = 'http://www.irna.ir/en/services/161'
    next_page = 162

    def start_requests(self):
        yield scrapy.Request(self.base_url, meta={'page_number': 1})

    def parse(self, response):
        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        page_number = response.meta['page_number'] + 1
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
                                 callback=self.parse, meta={'page_number': page_number})

        for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
            yield response.follow(next_article, callback=self.parse)

    def parse_article(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
        articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
        articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
        articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
        articlestamp = articleyear + articlemonth + articleday + articletime

        articlestampint = int(articlestamp)
        irnadateint = int(irnadate)
        if articlestampint <= irnadateint:
            raise CloseSpider('duplicate article')

        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
        }
I want it to scrape only links put up since the last time it was run, so every time it reads an article it compares that article's publication date to the time of the last run; if the article is older, it does not scrape it and kills the program.
The problem here is that there are multiple categories all being scraped at the same time with this code, and it is possible to reach an older article in one category before getting through all the new articles in another category.
Is it possible to raise something that kills just one instance of the function, so that the scraper can continue looking through the other categories?
edit:
import scrapy
from scrapy.exceptions import CloseSpider

class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_urls = [
        'http://www.irna.ir/en/services/161',
        'http://www.irna.ir/en/services/162',
        'http://www.irna.ir/en/services/163',
        'http://www.irna.ir/en/services/164',
    ]

    def start_requests(self):
        for base_url in self.base_urls:
            yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})

    def parse(self, response):
        with open("irnadate.txt", "rt") as in_file:
            irnadate = in_file.read()

        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            articleday = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/.*/.*'))
            articlemonth = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/(.*)/.*'))
            articleyear = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'.*/.*/(.*)'))
            articletime = ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)'))
            articlestamp = articleyear + articlemonth + articleday + articletime

            articlestampint = int(articlestamp)
            irnadateint = int(irnadate)
            if articlestampint <= irnadateint:
                break

            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

        page_number = response.meta['page_number'] + 1
        base_url = response.meta['base_url']
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                                 callback=self.parse, meta={'page_number': page_number})

    def parse_article(self, response):
        yield {
            'date': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel2"]/text()').re(r'(.*)/(.*)/(.*)')),
            'time': ''.join(response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_NofaDateLabel3"]/text()').re(r'(.*):(.*)')),
            'title': ''.join(response.xpath('//*[@id="col-3"]/div/div[1]/div/h1/text()').extract_first()),
            'text': ''.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
        }
The issue with this is that the date elements those XPaths point at only exist on the article page itself, so it looks like I am not able to determine an article's date without loading the article first.
Your spider needs some restructuring. First, you should not use

for next_article in ('/en/services/162/', '/en/services/163/', '/en/services/164/'):
    yield response.follow(next_article, callback=self.parse)

because every time you get a result page you schedule the same URLs over and over; they will just be filtered out as duplicates after the first request. Instead, put them in base_urls:
base_urls = [
    'http://www.irna.ir/en/services/161',
    'http://www.irna.ir/en/services/162',
    'http://www.irna.ir/en/services/163',
    'http://www.irna.ir/en/services/164',
]

def start_requests(self):
    for base_url in self.base_urls:
        yield scrapy.Request(base_url, meta={'page_number': 1, 'base_url': base_url})
Next, in your parse method you should read the date for each article from the result (listing) page before requesting it:
def parse(self, response):
    for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
        # get the date for this article
        # if the date is already extracted
        date_already_processed = <-Get the date from result page->
        if date_already_processed:
            break

        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)

    page_number = response.meta['page_number'] + 1
    base_url = response.meta['base_url']
    if response.css('#MoreButton'):
        yield scrapy.Request('{}/page{}'.format(base_url, page_number),
                             callback=self.parse, meta={'page_number': page_number})
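As a rough sketch of how that placeholder could be filled in, assuming (hypothetically) that each listing entry is a '.news-item' row that shows its date as "day/month/year hour:minute" in a '.news-date' element — both selectors and the format are invented here and must be adapted to the real page:

import re

def parse(self, response):
    with open("irnadate.txt", "rt") as in_file:
        last_run = int(in_file.read().strip())

    # hypothetical row/date selectors; adapt them to the real listing markup
    for row in response.css('.DataListContainer .news-item'):
        article_url = row.css('h3 a::attr(href)').extract_first()
        date_text = row.css('.news-date::text').extract_first() or ''

        # rebuild the same year+month+day+hour+minute stamp stored in irnadate.txt
        m = re.match(r'(\d+)/(\d+)/(\d+)\s+(\d+):(\d+)', date_text)
        if m:
            day, month, year, hour, minute = m.groups()
            stamp = int(year + month + day + hour + minute)
            if stamp <= last_run:
                break  # older content from here on; stop this category only

        yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)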
Edited question to link to original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as the data behind the other two links inside the table. I managed to pull data from one of them, but my question is how to go to the other link as well and append its data to the same row.
from urlparse import urljoin

import scrapy
from scrapy.item import Item, Field  # needed for the Item/Field definitions below
from texasdeath.items import DeathItem

class DeathItem(Item):
    firstName = Field()
    lastName = Field()
    Age = Field()
    Date = Field()
    Race = Field()
    County = Field()
    Message = Field()
    Passage = Field()

class DeathSpider(scrapy.Spider):
    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()
            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()
            url = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            url2 = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            if url.endswith("html"):
                request = scrapy.Request(url, meta={"item": item, "url2": url2}, callback=self.parse_details)
                yield request
            else:
                yield item

    def parse_details(self, response):
        item = response.meta["item"]
        url2 = response.meta["url2"]
        item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
        request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
        return request

    def parse_details2(self, response):
        item = response.meta["item"]
        item['Passage'] = response.xpath("//p/text()").extract_first()
        return item
I understand how we pass arguments to a request through meta, but I'm still unclear about the flow; at this point I am unsure whether this is possible or not. I have viewed several examples, including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
The output should reflect the main table, with each row also containing the data pulled from both of its links.
I'd appreciate any help or direction.
The problem in this case is in this piece of code:

if url.endswith("html"):
    yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
    yield item

if url2.endswith("html"):
    yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
    yield item
By yielding each request independently you are creating a new "thread" that takes its own course of life, so the function parse_details won't be able to see what is being done in parse_details2. The way I would do it is chain them, calling one from within the other, like this:
url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
if url.endswith("html"):
    request = scrapy.Request(url, callback=self.parse_details)
    request.meta['item'] = item
    request.meta['url2'] = url2
    yield request
elif url2.endswith("html"):
    request = scrapy.Request(url2, callback=self.parse_details2)
    request.meta['item'] = item
    yield request
else:
    yield item

def parse_details(self, response):
    item = response.meta["item"]
    url2 = response.meta["url2"]
    item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
    if url2:
        request = scrapy.Request(url2, callback=self.parse_details2)
        request.meta['item'] = item
        yield request
    else:
        yield item
This code hasn't been tested thoroughly, so comment as you test it.
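As a side note, on Scrapy 1.7+ the same chaining can be written with cb_kwargs instead of meta (see the earlier answer about cb_kwargs); a minimal sketch under that assumption:

# in parse(), pass the item and the second URL straight to the callback
yield scrapy.Request(url, callback=self.parse_details,
                     cb_kwargs={'item': item, 'url2': url2})

def parse_details(self, response, item, url2):
    item['Message'] = response.xpath(
        "//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
    if url2:
        yield scrapy.Request(url2, callback=self.parse_details2, cb_kwargs={'item': item})
    else:
        yield item

def parse_details2(self, response, item):
    item['Passage'] = response.xpath("//p/text()").extract_first()
    yield item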
I want to start scraping from one page and traverse to hundreds of pages using the next-page URL, which I have handled in the following code. During that crawl I need to go to another link, extract data, and store it in items. I can easily print all the item data to be exported, but I am not able to return it from the function as desired.
class UserLoginCrawl(CrawlSpider):
    name = "mylogin"
    allowed_domains = ['www.example.com']
    login_page = "www.example.com/user"
    start_urls = ["www.example.com/profile?page=0"]

    rules = [Rule(SgmlLinkExtractor(
        allow=('/profile\?page=\d+'),
        restrict_xpaths=('//li[@class="pager-next"]',), canonicalize=False),
        callback='parse_page',
        follow=True), ]

    # ulists = []

    def parse_page(self, response):
        self.log('XYZ, Started Crawling %s' % response.url)
        items = response.xpath("//div[@id='profile']/div")
        for temp in items:
            userurl = 'www.example.com' + temp.xpath("./div[@class='name']/a/@href").extract()[0]
            yield Request(url=userurl, callback=self.parse_profile_page)
        self.log('XYZ, Finished Crawling %s' % response.url)
        # return self.ulists

    def parse_profile_page(self, response):
        usritem = PostUsers()
        self.log('XYZ, Started Crawling user Profile %s' % response.url)
        usritem["userlink"] = response.url
        usritem["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        usritem["postlink"] = 'www.example.com' + relative_url
        usritem["history"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        # self.ulists.append(usritem)
        print(usritem)
        # return usritem
Use yield usritem at the end of your parse_profile_page method, in place of the commented-out return.
See the second example of Spider Examples
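A minimal sketch of that change (PostUsers is assumed to be the project's own item class, as in the question):

def parse_profile_page(self, response):
    usritem = PostUsers()
    usritem["userlink"] = response.url
    usritem["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
    relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
    usritem["postlink"] = 'www.example.com' + relative_url
    usritem["history"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
    yield usritem  # yield the item so Scrapy's pipelines and exporters receive it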