I am trying to generate a CSV file with Scrapy. It works, but not as expected: I have an HTML table with multiple rows and I want the same rows in the CSV. However, the following code puts all of the HTML rows into a single CSV row.
code
class DemoSpider(scrapy.Spider):
    """Spider that reads the public directory table and emits one item per page.

    Every field of the emitted item is filled with ``getall()``, i.e. the list
    of all matching cells on the page, so a page produces a single record
    whose fields are lists rather than one record per table row.
    """

    name = "DemoSpider"

    def start_requests(self):
        """Yield one request for each page number in ``range(1, 2)``."""
        for page in range(1, 2):
            page_url = "https://directory.easternuc.com/publicDirectory?page=%s" % page
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Gather the name, phone, mobile and email columns into one item."""
        column_queries = (
            ('name', "//tr/td/h4/text()"),
            ('phone', "//tr/td[2]/text()"),
            ('mobile', "//tr/td[3]/text()"),
            ('email', "//tr/td[4]/text()"),
        )
        item = TutorialItem()
        for field, query in column_queries:
            # Absolute queries: each one matches every row of the page.
            item[field] = response.selector.xpath(query).getall()
        yield item
If I change the getall() method to get(), I get only the first row from the website in the CSV.
Note: as a workaround, I can find the total number of rows on the page and then iterate over them. However, it seems this used to work in older versions of Scrapy.
You will have to iterate over each tr one by one and yield each record separately:
def parse(self, response):
    """Yield one ``TutorialItem`` for every ``<tr>`` matched by ``//table/tr``.

    The cell queries are relative (``./``), so each one is evaluated against
    the current row only, and ``get()`` keeps the first match.
    """
    cell_queries = (
        ('name', "./td/h4/text()"),
        ('phone', "./td[2]/text()"),
        ('mobile', "./td[3]/text()"),
        ('email', "./td[4]/text()"),
    )
    for row in response.xpath("//table/tr"):
        record = TutorialItem()
        for field, query in cell_queries:
            record[field] = row.xpath(query).get()
        yield record
Related
I'm trying to scrape this website https://phdessay.com/free-essays/.
I need to find the maximum number of pages so that I can append the URLs with page numbers to the start_urls list. I'm not able to figure out how to do that.
Here's my code so far,
class PhdessaysSpider(scrapy.Spider):
    """Spider that follows every essay link on the start page."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Request each essay page linked from the listing page."""
        for essay_url in response.css('.phdessay-card-read::attr(href)').getall():
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        """Emit an item holding the essay's title and the URL that was requested."""
        items = PhdEssaysItem()
        items['essay_title'] = response.css('.site-title::text').get()
        items['essay_url'] = response.request.url
        yield items
In the above code, I'm following each essay to its individual page and scraping the URL and the title (I will also be scraping the content, which is the reason why I'm following the individual essay URL).
This works just fine for the starting page; but there are about 1677 pages which might change in the future. I would like to scrape this maximum_no_of_pages number and then append all links with all page numbers.
What you could do is to find the last page number and then do a range loop to yield next pages requests.
Something like this:
class PhdessaysSpider(scrapy.Spider):
    """Spider that walks every listing page and follows each essay link."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Read the highest page number from the pager and request every page.

        Only pager labels that are purely numeric (after removing thousands
        separators) are considered, so a trailing non-numeric label such as a
        "next" link does not raise ``ValueError``. When no numeric label is
        found the spider falls back to a single page instead of failing with
        ``IndexError``.
        """
        page_numbers = []
        for label in response.css('.page-numbers::text').getall():
            digits = label.replace(',', '').strip()
            if digits.isdigit():
                page_numbers.append(int(digits))
        max_page = max(page_numbers) if page_numbers else 1

        for page_number in range(1, max_page + 1):
            page_url = f'https://phdessay.com/free-essays/page/{page_number}/'
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        """Request each essay page linked from one listing page."""
        for essay_url in response.css('.phdessay-card-read::attr(href)').getall():
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        """Emit an item holding the essay's title and the URL that was requested."""
        items = PhdEssaysItem()
        items['essay_title'] = response.css('.site-title::text').get()
        items['essay_url'] = response.request.url
        yield items
First, if I use extract_first, Scrapy gives me only the first element of each page; if I run it as shown below, it returns all the content I want, but as a single line per page.
Second, I can't make Scrapy follow the links I just scraped and get information from inside those pages; it returns an empty CSV file.
from scrapy import Spider
from companies.items import CompaniesItem
import re
class companiesSpider(Spider):
    """Spider that lists startup links and names from the category pages.

    ``extract()`` is used for both fields, so each emitted item carries the
    list of every match found inside the ``datafetch`` container.
    """

    name = "companies"
    # allowed_domains takes bare domain names, not URLs with a scheme.
    allowed_domains = ['startup.miami']
    # Defining the list of pages to scrape
    start_urls = [
        "http://startup.miami/category/startups/page/" + str(i) + "/"
        for i in range(0, 10)
    ]

    def parse(self, response):
        """Yield one ``CompaniesItem`` per element whose id is ``datafetch``."""
        # XPath selects attributes with "@" (e.g. @id, @href).
        rows = response.xpath('//*[@id="datafetch"]')
        for row in rows:
            item = CompaniesItem()
            item['link'] = row.xpath('.//h2/a/@href').extract()
            item['name'] = row.xpath('.//header/h2/a/text()').extract()
            yield item
Your parse-method is not yielding any requests or items. In the part below we go through the pages and get the urls & names. In the parse_detail you can add additional data to the item.
Instead of hardcoding to 10 pages we check if there is a next page, and go through the parse again if it's the case.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy
class CompaniesSpider(Spider):
    """Spider that follows each startup article and paginates via the next link."""

    name = "companies"
    allowed_domains = ['startup.miami']
    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/"]

    def parse(self, response):
        """Request every article's detail page, then the next listing page.

        A partially filled ``CompaniesItem`` travels to ``parse_detail`` in
        the request ``meta``. Articles without a link are skipped, and a
        missing name is stored as ``None`` instead of raising on ``strip()``.
        """
        # get link & name and send item to parse_detail in meta
        rows = response.xpath('//*[@id="datafetch"]/article')
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            if not link:
                # Nothing to follow for this article.
                continue
            name = row.xpath(
                './/*[@class="textoCoworking"]/text()').extract_first()

            # Resolve against the page URL so a relative href is still requestable.
            link = response.urljoin(link)
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name.strip() if name else name
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # get the next page
        next_page = response.xpath(
            '//*[@class="next page-numbers"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        """Emit the item that was passed along in ``response.meta``."""
        item = response.meta['item']
        # add other details to the item here
        yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv
I am trying to scrape a website using Scrapy. Example Link is: Here.
I am able to get some data using CSS selectors. I also need to fetch all image URLs of each item. An item can have multiple colours; when we click on another colour, the browser actually fetches the images from another URL. So I need to generate manual requests (because of the multiple colours) and attach "meta" so that image URLs from the other URLs are stored in a SINGLE ITEM FIELD.
Here is my Scrapy code:
def get_image_urls(self, response):
    """Append this page's image links to the item and request the next swatch.

    The item is read from ``response.meta['item']`` and passed on in the
    ``meta`` of the follow-up request. The first swatch link is always
    indexed, so an empty result raises ``IndexError``.
    """
    item = response.meta['item']
    urls = item['image_urls'] if 'image_urls' in item else []
    urls.extend(response.css('.product-image-link::attr(href)').extract())
    item['image_urls'] = urls
    #print(item['image_urls'])
    swatch_links = response.css('.va-color .emptyswatch a::attr(href)').extract()
    yield Request(swatch_links[0], callback=self.get_image_urls, meta={'item': item})
def parse(self, response):
    """Build a ``JulesProduct`` with its name and return it immediately.

    ``get_image_urls`` is a generator function; the generator object created
    here is discarded without being iterated or yielded.
    """
    product = JulesProduct()
    product['name'] = self.get_name(response)

    # Now get the recursive img urls
    response.meta['item'] = product
    self.get_image_urls(response)

    return product
Ideally, I should return the output object with all of the required data. My question is: why am I not getting output['image_urls']? When I uncomment the print statement in the get_image_urls function, I see 3 crawled URLs and 3 print statements, with the URLs appended one after another. I need them in the parse function. I'm not sure I'm explaining my issue clearly. Can anybody help?
Your parse method is returning the output before the get_image_urls requests are done.
You should only yield or return your final item at the end of your recursive logic. Something like this should work:
def parse(self, response):
    """Create the product item and hand it to ``get_image_urls`` via ``meta``.

    The same URL is requested again with ``dont_filter=True`` so the request
    is not dropped as a duplicate; the item itself is yielded later, by
    ``get_image_urls``.
    """
    output = JulesProduct()
    output['name'] = self.get_name(response)
    # Pass the item built above; ``item`` is not defined in this method.
    yield Request(response.url, callback=self.get_image_urls, meta={'item': output}, dont_filter=True)
def get_image_urls(self, response):
    """Collect image links across swatch pages, then yield the finished item.

    The item arrives in ``response.meta['item']``. While a further swatch
    link is found, its first entry is requested with the same item attached;
    once none is found, the item is yielded.
    """
    item = response.meta['item']
    urls = item['image_urls'] if 'image_urls' in item else []
    urls.extend(response.css('.product-image-link::attr(href)').extract())
    item['image_urls'] = urls

    swatch_links = response.css('.va-color .emptyswatch a::attr(href)').extract()
    if not swatch_links:
        # No further colour to visit: the item is complete.
        yield item
    else:
        yield Request(swatch_links[0], callback=self.get_image_urls, meta={'item': item})
This piece of code is expected to add each extracted reviewId to a set (in order to omit duplicates). Then there is a check: when the set length reaches 100, a callback is executed and a long URL string containing all the ids is passed to the main extract function.
How do I do this (save all ids extracted from different callbacks into the same set and use it further), either with built-in tools or with the code I have? The problem now is that the length-check branch is never entered.
Update: I believe there are two options: pass the set as meta to each callback, or somehow use an Item for this. But I don't know how.
import scrapy
from scrapy.shell import inspect_response
class QuotesSpider(scrapy.Spider):
    """Spider that gathers review ids into a class-level set.

    When the set holds exactly 100 entries, one request is issued whose URL
    carries every collected id.
    """

    name = "tripad"
    # Shared by all callbacks; the attribute name is kept because the code
    # refers to it as ``QuotesSpider.list``.
    list = set()

    def start_requests(self):
        """Request the review listing at offsets 0, 5, ..., 495."""
        url = "https://www.tripadvisor.com/Hotel_Review-g60763-d122005-Reviews-or{}-The_New_Yorker_A_Wyndham_Hotel-New_York_City_New_York.html#REVIEWS"
        for i in range(0, 500, 5):
            yield scrapy.Request(url=url.format(i), callback=self.parse)

    def parse(self, response):
        """Add each review id on the page to the shared set."""
        # XPath selects attributes with "@" (here: @id).
        for result in response.xpath('//div[contains(@id,"review_")]/@id').extract():
            if "review" in result[:8]:
                # Drop the first 7 characters ("review_") and append the
                # URL-encoded comma that separates ids in the final URL.
                QuotesSpider.list.add(result[7:] + "%2C")
            if len(QuotesSpider.list) == 100:
                url = (
                    "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS&metaReferer=Hotel_Review&reviews="
                    + "".join(QuotesSpider.list)
                )
                yield scrapy.Request(url=url, callback=self.parse_page)
There are several ways of doing this, however I'd advise splitting your spider into two parts:
Spider that collects review ids
class CollectorSpider(Spider):
    """Skeleton spider that emits one ``{'review_id': ...}`` record per id."""

    name = 'collect_reviews'

    def parse(self, response):
        """Yield a record for every review id found on the page."""
        review_ids = ...  # placeholder: extract the ids from the response here
        for rid in review_ids:
            yield dict(review_id=rid)
Spider that uses collected review ids to collect review content
class ConsumerSpider(Spider):
    """Skeleton spider that requests review content in batches of ids.

    The ids are read from the JSON file named by ``self.file``; the file is
    expected to hold a list of ``{'review_id': ...}`` records.
    """

    name = 'consume_reviews'
    # Number of review ids sent with one request.
    batch_size = 100

    def start_requests(self):
        """Yield one request per consecutive batch of ``batch_size`` ids."""
        with open(self.file, 'r') as f:
            data = json.load(f)
        for start in range(0, len(data), self.batch_size):
            # Slice [start, start + batch_size): batches never overlap and
            # the final, possibly shorter, batch is included.
            batch = data[start:start + self.batch_size]
            ids = [entry['review_id'] for entry in batch]
            # make url from ids
            # TODO: build the request URL from ``ids``; the empty string is a
            # placeholder and is not a valid request URL.
            url = ''
            yield Request(url)

    def parse(self, response):
        """Handle the response for one batch of reviews."""
        # crawl 100 reviews here
        pass
I have a Python script using Scrapy which scrapes data from a website, allocates it to 3 fields and then generates a .csv. It works OK, but with one major problem: all fields contain all of the data, rather than it being separated out for each table row. I'm sure this is due to my loop not working: when it finds the XPath it just grabs all the data for every row before moving on to get data for the other 2 fields, instead of creating separate rows.
def parse(self, response):
    """Build a ``TestBotItem`` from the table and return it.

    The field queries start with ``//``, so each one is evaluated against
    the whole document rather than the current row, and every field receives
    the values of all rows. ``return`` ends the method on the first
    iteration.
    """
    hxs = HtmlXPathSelector(response)
    # XPath selects attributes with "@" (here: @class).
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('//table/tbody/tr[*]/td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('//table/tbody/tr[*]/td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('//table/tbody/tr[*]/td[4]/p/text()').extract()
        return item
The tr with the * increases in number with each entry on the website I need to crawl, and the other two paths slot in below. How do I edit this so it grabs the first set of data for say //table/tbody/tr[3] only, stores it for all three fields and then moves on to //table/tbody/tr[4] etc??
Update
Works correctly, however I'm trying to add some validation to the pipelines.py file to drop any records where var1 is more than 100%. I'm certain my code below is wrong, and also does "yield" instead of "return" stop the pipeline being used?
from scrapy.exceptions import DropItem
class TestbotPipeline(object):
    """Item pipeline that drops records whose ``var1`` is above 100%."""

    # Largest percentage that is still accepted.
    MAX_PERCENT = 100.0

    def process_item(self, item, spider):
        """Return ``item`` unless its ``var1`` percentage exceeds the limit.

        ``var1`` may be a number, a string such as ``"95%"``, or a list or
        tuple of such values (as produced by ``extract()``), in which case
        the first entry is checked.

        Raises:
            DropItem: if ``var1`` is above ``MAX_PERCENT`` or cannot be read
                as a number.
        """
        raw = item['var1']
        if isinstance(raw, (list, tuple)):
            raw = raw[0] if raw else ''
        try:
            percent = float(str(raw).strip().rstrip('%'))
        except ValueError:
            raise DropItem("var1 is not a percentage: %r" % (raw,))
        if percent > self.MAX_PERCENT:
            raise DropItem("var1 is above 100%%: %r" % (raw,))
        return item
I think this is what you are looking for:
def parse(self, response):
    """Yield one ``TestBotItem`` per ``<tr class="someclass">`` row.

    The field queries are relative (``./``), so each one only sees the cells
    of the row being processed.
    """
    hxs = HtmlXPathSelector(response)
    # XPath selects attributes with "@" (here: @class).
    divs = hxs.select('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        yield item
You loop on the trs and then use relative XPath expressions (./td...), and in each iteration you use the yield instruction.
You can also append each item to a list and return that list outside of the loop, like this (it's equivalent to the code above):
def parse(self, response):
    """Return a list with one ``TestBotItem`` per ``<tr class="someclass">`` row.

    The field queries are relative (``./``), so each one only sees the cells
    of the row being processed.
    """
    hxs = HtmlXPathSelector(response)
    # XPath selects attributes with "@" (here: @class).
    divs = hxs.select('//tr[@class="someclass"]')
    items = []
    for div in divs:
        item = TestBotItem()
        item['var1'] = div.select('./td[2]/p/span[2]/text()').extract()
        item['var2'] = div.select('./td[3]/p/span[2]/text()').extract()
        item['var3'] = div.select('./td[4]/p/text()').extract()
        items.append(item)
    return items
You don't need HtmlXPathSelector. Scrapy responses already have a built-in XPath selector. Try this:
def parse(self, response):
    """Yield one ``TestBotItem`` per ``<tr class="someclass">`` row.

    Each field takes the first text match inside the current row, or
    ``None`` when the cell is missing.
    """
    # XPath selects attributes with "@" (here: @class).
    divs = response.xpath('//tr[@class="someclass"]')
    for div in divs:
        item = TestBotItem()
        # Queries are relative to the row itself; extract_first() avoids the
        # IndexError that indexing an empty extract() result would raise.
        item['var1'] = div.xpath('./td[2]/p/span[2]/text()').extract_first()
        item['var2'] = div.xpath('./td[3]/p/span[2]/text()').extract_first()
        item['var3'] = div.xpath('./td[4]/p/text()').extract_first()
        # yield (not return) so every row is emitted, not only the first.
        yield item