Creating loop to parse table data in scrapy/python - python

I have a Python script using Scrapy which scrapes data from a website, assigns it to 3 fields and then generates a .csv. It works OK but with one major problem: every field contains all of the data, rather than the data being separated out for each table row. I'm sure this is because my loop isn't working: when it finds the XPath it grabs the data for every row before moving on to the other 2 fields, instead of creating separate rows.
def parse(self, response):
    """Populate a TestBotItem from the ``tr`` rows with class "someclass".

    This is the code the question is about: it is reproduced with its
    original logic so that the described symptom still applies.
    """
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        # NOTE: the leading "//" makes each expression search the whole
        # document instead of only `div`, so every field receives the
        # values of every row.
        item['var1'] = div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('//table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('//table/tbody/tr[*]/td[4]/p/text()').extract()
        return item
The tr index marked with * increases with each entry on the website I need to crawl, and the other two paths slot in below it. How do I edit this so it grabs the first set of data for, say, //table/tbody/tr[3] only, stores it for all three fields, and then moves on to //table/tbody/tr[4], etc.?
Update
That works correctly. However, I'm now trying to add some validation to the pipelines.py file to drop any records where var1 is more than 100%. I'm certain my code below is wrong. Also, does using "yield" instead of "return" stop the pipeline from being used?
from scrapy.exceptions import DropItem
class TestbotPipeline(object):
    """Item pipeline that discards items whose ``var1`` percentage exceeds 100."""

    def process_item(self, item, spider):
        """Validate ``item['var1']`` and pass the item on or drop it.

        Args:
            item: the scraped item; ``item['var1']`` is either a string such
                as ``"42%"`` or the list of strings produced by ``.extract()``.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item`` when ``var1`` is at most 100.

        Raises:
            DropItem: when ``var1`` is above 100 or cannot be read as a number.
        """
        raw = item['var1']
        # `.extract()` stores a list of strings; validate its first entry.
        if isinstance(raw, (list, tuple)):
            raw = raw[0] if raw else ''
        try:
            percent = float(str(raw).strip().rstrip('%'))
        except ValueError:
            # The check cannot be performed, so treat the record as invalid.
            raise DropItem("var1 is not a percentage: %r" % (raw,))
        if percent > 100:
            raise DropItem("var1 exceeds 100%%: %r" % (raw,))
        return item

I think this is what you are looking for:
def parse(self, response):
    """Yield one TestBotItem per ``tr`` row with class "someclass"."""
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        # The leading "./" keeps each lookup inside the current row.
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        yield item
You loop on the trs and then use relative XPath expressions (./td...), and in each iteration you use the yield instruction.
You can also append each item to a list and return that list outside of the loop, like this (it's equivalent to the code above):
def parse(self, response):
    """Return a list with one TestBotItem per ``tr`` row with class "someclass"."""
    hxs = HtmlXPathSelector(response)
    divs = hxs.select('//tr[@class="someclass"]')
    items = []
    for div in divs:
        item = TestBotItem()
        # The leading "./" keeps each lookup inside the current row.
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        items.append(item)
    return items

You don't need HtmlXPathSelector. Scrapy already has built-in XPATH selector. Try this:
def parse(self, response):
    """Yield one TestBotItem per ``tr`` row with class "someclass".

    Uses the response's built-in ``xpath`` method, so no separate
    ``HtmlXPathSelector`` is needed.
    """
    for row in response.xpath('//tr[@class="someclass"]'):
        item = TestBotItem()
        # Paths are relative to the current row; `.get()` returns None
        # instead of raising IndexError when a cell has no match.
        item['var1'] = row.xpath('./td[2]/p/span[2]/text()').get()
        item['var2'] = row.xpath('./td[3]/p/span[2]/text()').get()
        item['var3'] = row.xpath('./td[4]/p/text()').get()
        # `yield` (not `return`) so that every row produces an item.
        yield item

Related

Scrapy doesn't save all results to csv

I'm trying to scrape a specific web page and although on the console I get all results, on the outputted csv I don't. In this case, I want both title and author of a specific search, but I only get the title. If I reverse the order of the two I get author, so it only takes the first one. Why?
import scrapy
# Pre-encoded search parameters appended to every results-page URL.
QUERY = "q=brilliant+friend&qt=results_page#x0%253Abook-%2C%2528x0%253Abook%2Bx4%253Aprintbook%2529%2C%2528x0%253Abook%2Bx4%253Adigital%2529%2C%2528x0%253Abook%2Bx4%253Alargeprint%2529%2C%2528x0%253Abook%2Bx4%253Amss%2529%2C%2528x0%253Abook%2Bx4%253Athsis%2529%2C%2528x0%253Abook%2Bx4%253Abraille%2529%2C%2528x0%253Abook%2Bx4%253Amic%2529%2Cx0%253Aartchap-%2C%2528x0%253Aartchap%2Bx4%253Achptr%2529%2C%2528x0%253Aartchap%2Bx4%253Adigital%2529format"


class Spider(scrapy.Spider):
    """Spider over WorldCat search-result pages, one start URL per offset."""

    name = 'worldcatspider'
    start_urls = [
        'https://www.worldcat.org/search?start=%s&%s' % (offset, QUERY)
        for offset in range(0, 4400, 10)
    ]

    def parse(self, response):
        """Yield every title, then every author, each as its own one-key dict."""
        title_texts = response.css('.name a > strong ::text').extract()
        for text in title_texts:
            yield {"title:": text}
        author_texts = response.css('.author ::text').extract()
        for text in author_texts:
            yield {"author:": text}
My suggestion is to put the for statement on the parent element (the class or div) that contains both fields.
I haven't checked but this should work:
def parse(self, response):
    """Yield one dict per ``.menuElem`` element holding its title and author.

    Both values are the lists returned by ``.extract()``.
    """
    for page in response.css('.menuElem'):
        title = page.css('.name a > strong ::text').extract()
        author = page.css('.author ::text').extract()
        # Both keys are plain field names (no trailing colon) so the
        # exported columns are named consistently.
        yield {
            "title": title,
            "author": author,
        }

Python Scrapy grabs all the rows into one single CSV row

I am trying to generate a CSV file with Scrapy, it is working but not as expected. I have an html table which has multiple rows, I want the same in CSV. However, the following code converts all the HTML rows into single CSV row.
code
class DemoSpider(scrapy.Spider):
    """Spider for the public directory listing pages."""

    name = "DemoSpider"

    def start_requests(self):
        """Request each directory page in ``range(1, 2)``."""
        page_urls = [
            "https://directory.easternuc.com/publicDirectory?page=%s" % page_no
            for page_no in range(1, 2)
        ]
        for page_url in page_urls:
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Yield a single TutorialItem whose fields hold all matches on the page."""
        sel = response.selector
        record = TutorialItem()
        # Each XPath is evaluated against the whole page, so every field
        # receives a list covering all rows.
        record['name'] = sel.xpath("//tr/td/h4/text()").getall()
        record['phone'] = sel.xpath("//tr/td[2]/text()").getall()
        record['mobile'] = sel.xpath("//tr/td[3]/text()").getall()
        record['email'] = sel.xpath("//tr/td[4]/text()").getall()
        yield record
If I change the getall() method to get(), I get only the first row from the website in the CSV.
Note: as a workaround, I can find the total number of rows on the page and then iterate over them. However, it seems that this used to work in older versions of Scrapy.
You will have to iterate each tr one by one and yield each record separately
def parse(self, response):
    """Yield one TutorialItem per ``//table/tr`` row."""
    # Field name -> XPath relative to the row being processed.
    cell_paths = (
        ('name', "./td/h4/text()"),
        ('phone', "./td[2]/text()"),
        ('mobile', "./td[3]/text()"),
        ('email', "./td[4]/text()"),
    )
    for row in response.xpath("//table/tr"):
        record = TutorialItem()
        for field, path in cell_paths:
            record[field] = row.xpath(path).get()
        yield record

How do I scrape the number which denotes the maximum number of pages in a website before parsing a URL in a scrapy spider?

I'm trying to scrape this website https://phdessay.com/free-essays/.
I need to find the maximum number of pages so that I can append the URLs with page numbers to the start_urls list. I'm not able to figure out how to do that.
Here's my code so far,
class PhdessaysSpider(scrapy.Spider):
    """Follow every essay link on the start page and record its title and URL."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Request each essay page linked from the listing."""
        for link in response.css('.phdessay-card-read::attr(href)').getall():
            yield scrapy.Request(link, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        """Yield a PhdEssaysItem with the essay's title and the requested URL."""
        title = response.css('.site-title::text').get()
        url = response.request.url
        record = PhdEssaysItem()
        record['essay_title'] = title
        record['essay_url'] = url
        yield record
In the above code, I'm following each essay to its individual page and scraping the URL and the title (I will also be scraping the content, which is why I'm following the individual essay URLs).
This works just fine for the starting page; but there are about 1677 pages which might change in the future. I would like to scrape this maximum_no_of_pages number and then append all links with all page numbers.
What you could do is to find the last page number and then do a range loop to yield next pages requests.
Something like this:
class PhdessaysSpider(scrapy.Spider):
    """Crawl every listing page, then every essay linked from each page."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Find the highest page number and request every listing page.

        Only purely numeric pagination labels are considered, so labels
        such as "Next" or "…" cannot break the conversion; thousands
        separators ("1,677") are tolerated.
        """
        page_numbers = []
        for label in response.css('.page-numbers::text').getall():
            digits = label.strip().replace(',', '')
            if digits.isdigit():
                page_numbers.append(int(digits))

        if not page_numbers:
            # No pagination found: treat the start page as the only page.
            for request in self.parse_page(response):
                yield request
            return

        for page_number in range(1, max(page_numbers) + 1):
            page_url = f'https://phdessay.com/free-essays/page/{page_number}/'
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        """Request each essay page linked from one listing page."""
        all_essay_urls = response.css('.phdessay-card-read::attr(href)').getall()
        for essay_url in all_essay_urls:
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        """Yield a PhdEssaysItem with the essay's title and the requested URL."""
        item = PhdEssaysItem()
        item['essay_title'] = response.css('.site-title::text').get()
        item['essay_url'] = response.request.url
        yield item

Scrapy yield only last data and merge scrapy data into one

I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped and repeat it in the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also store the whole news article. So I want to merge the article, which is stored in a list, into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
    """Item declaring the four fields filled in by the spider."""

    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
    """Spider from the question, reproduced with its original logic."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Read title, link and date of each entry and request the article page."""
        box_text = response.xpath("//ul/li/div[@class='ket']")
        # NOTE: a single ScrapedItem is created here and reused for every
        # entry, so all requests carry the same object in `meta`.
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")
            items['title'] = title
            items['source'] = source
            items['date'] = date
            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        """Attach the article's paragraph texts to the item passed via ``meta``."""
        items_old = response.meta['item']  # only last item stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # merge into single string
        yield items_old
I expect the date, title, and source in the output to be updated on each pass through the loop,
and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary and put those variables within it. Moreover, I've brought about some minor changes in your xpaths and css selectors to make them less error prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
    """Collect title, link, date and paragraphs for each listed news article."""

    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        """Build a fresh dict per entry and request the linked article."""
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            # A new dict on every iteration, so requests do not share state.
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        """Add the article's paragraph texts to the dict passed via ``meta``."""
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old

How to add scraped items into a set and execute when condition is met?

This piece of code is expected to add each extracted reviewId to a set (in order to omit duplicates). Then there is a check: when the set's length is 100, a callback is executed and a long URL string with all the ids is passed to the main extract function.
How do I do this (save all ids extracted from different callbacks into the same set and use it further), either with built-in tools or with the code I have? The problem now is that the length-check branch is never entered.
Update: I believe there are two options - pass the set as meta to each callback, or somehow use an Item for this. But I don't know how.
import scrapy
from scrapy.shell import inspect_response
class QuotesSpider(scrapy.Spider):
    """Spider from the question, reproduced with its original logic."""

    name = "tripad"
    # Class-level set shared by every callback; holds ids suffixed with "%2C".
    list= set()

    def start_requests(self):
        """Request the review pages at offsets 0, 5, ..., 495."""
        url = "https://www.tripadvisor.com/Hotel_Review-g60763-d122005-Reviews-or{}-The_New_Yorker_A_Wyndham_Hotel-New_York_City_New_York.html#REVIEWS"
        for i in range(0,500,5):
            yield scrapy.Request(url=url.format(i), callback=self.parse)

    def parse(self, response):
        """Collect review ids and, once exactly 100 are held, request them all."""
        for result in response.xpath('//div[contains(@id,"review_")]/@id').extract():
            if "review" in result[:8]:
                QuotesSpider.list.add(result[7:] +"%2C")
        # NOTE: this branch only runs when the size is exactly 100 at the
        # moment a page has been processed.
        if len(QuotesSpider.list) == 100:
            url = "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS&metaReferer=Hotel_Review&reviews="
            for i in QuotesSpider.list:
                url+=i
            yield scrapy.Request(url=url, callback=self.parse_page)
There are several ways of doing this, however I'd advise splitting your spider into two parts:
Spider that collects review ids
class CollectorSpider(Spider):
    """Sketch of a spider that emits one record per collected review id."""

    name='collect_reviews'

    def parse(self, response):
        """Yield ``{'review_id': ...}`` for each id found in the response."""
        # Placeholder left by the answer: extract the ids from `response` here.
        review_ids = ...
        for rid in review_ids:
            yield {'review_id': rid}
Spider that uses collected review ids to collect review content
class ConsumerSpider(Spider):
    """Sketch of a spider that requests reviews in batches of 100 ids.

    Reads the JSON file named by ``self.file``, which is expected to hold
    a list of ``{'review_id': ...}`` records.
    """

    name='consume_reviews'

    def start_requests(self):
        """Yield one request per batch of up to 100 review ids."""
        with open(self.file, 'r') as f:
            data = json.load(f)
        # Step through the records in consecutive, non-overlapping
        # slices; the final slice may hold fewer than 100 entries.
        for start in range(0, len(data), 100):
            batch = data[start:start + 100]
            ids = [entry['review_id'] for entry in batch]
            # make url from ids
            # TODO: placeholder left by the answer; build the real URL from `ids`.
            url = ''
            yield Request(url)

    def parse(self, response):
        """Handle one batch response."""
        # crawl 100 reviews here
        pass

Categories