import strings into scrapy to use as crawl urls - python

So my question is: how do I tell Scrapy to crawl URLs that differ only by one string? For example: https://www.youtube.com/watch?v=STRING
I have the strings saved in a txt file.
with open("plz_nummer.txt") as f:
cityZIP = f.read().rsplit('\n')
for a in xrange(0,len(cityZIP)):
next_url = 'http://www.firmenfinden.de/?txtPLZ=' + cityZIP[a] + '&txtBranche=&txtKunden='
pass

I would make the loading of the file with the zip codes part of the start_requests method, as a generator. Something along the lines of:
import scrapy

class ZipSpider(scrapy.Spider):
    name = "zipCodes"
    city_zip_list = []

    def start_requests(self):
        with open("plz_nummer.txt") as f:
            self.city_zip_list = f.read().rsplit('\n')
        for city_zip in self.city_zip_list:
            url = 'http://www.firmenfinden.de/?txtPLZ={}&txtBranche=&txtKunden='.format(city_zip)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Anything else you need
        # to do in here
        pass
This should give you a good starting point. Also read this article: https://doc.scrapy.org/en/1.1/intro/tutorial.html
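If you want to run this spider from a plain Python script instead of the scrapy crawl command, a minimal sketch could look like the following (my addition; it assumes the class above lives in the same file and a Scrapy version recent enough to support the FEEDS setting, and the output file name is just an example):
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # export everything parse() yields to a JSON file
    "FEEDS": {"results.json": {"format": "json"}},
})
process.crawl(ZipSpider)
process.start()  # blocks until the crawl is finished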

Related

Parse not being called

I have the code below where I want to loop through the URLs in a csv and, for each URL, run some selectors and return the data into a csv output. It seems to be looping through the start URLs, but it's raising an error saying that parse is not defined. I can't understand where I'm going wrong here. Any help appreciated!
import scrapy
import csv

class CbdSitechekerSpider(scrapy.Spider):
    name = 'cbd_sitecheker'
    start_urls = []
    for url in open('sites.csv'):
        start_urls.append(url)

def start_requests(self):
    with open('sites.csv','r') as csvf:
        for url in csvf:
            yield scrapy.Request(url, callback=self.parse_url)

def parse_url(self, response):
    links = response.xpath('//a/@href').extract_first()
    yield {'links': links}
Check your indentation. The functions start_requests and parse_url need to be indented to the right, because right now they don't belong to your class.
class CbdSitechekerSpider(scrapy.Spider):
    name = 'cbd_sitecheker'

    def start_requests(self):
        with open('sites.csv','r') as csvf:
            for url in csvf:
                yield scrapy.Request(url, callback=self.parse_url)

    def parse_url(self, response):
        links = response.xpath('//a/@href').extract_first()
        yield {'links': links}
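One extra detail worth adding here (my addition, not part of the original answer): lines read from a file keep their trailing newline, and blank lines would turn into broken requests. A small hedged variant of start_requests that strips and skips them:
    def start_requests(self):
        with open('sites.csv', 'r') as csvf:
            for url in csvf:
                url = url.strip()   # drop the trailing newline
                if not url:         # skip blank lines
                    continue
                yield scrapy.Request(url, callback=self.parse_url)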
Try this:
def start_requests(self):
    parse_url = self.parse_url
    with open('sites.csv','r') as csvf:
        for url in csvf:
            yield scrapy.Request(url, callback=parse_url)
This way the callback is bound to a local name before you pass it into the Request constructor.

How to loop through multiple URLs to scrape from a CSV file in Scrapy?

My code for scraping data from the Alibaba website:
import scrapy

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()

        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3]
            }
            yield scraped_info
Notice the start URL: it only scrapes through the given URL, but I want this code to scrape all the URLs present in my csv file. My csv file contains a large number of URLs.
Sample of the data.csv file:
'https://www.alibaba.com/showroom/shock-absorber.html',
'https://www.alibaba.com/showroom/shock-wheel.html',
'https://www.alibaba.com/showroom/shoes-fastener.html',
'https://www.alibaba.com/showroom/shoes-women.html',
'https://www.alibaba.com/showroom/shoes.html',
'https://www.alibaba.com/showroom/shoulder-long-strip-bag.html',
'https://www.alibaba.com/showroom/shower-hair-band.html',
...........
How do I import all the links from the csv file into the code at once?
To correctly loop through a file without loading all of it into memory, you should use generators, as both file objects and the start_requests method in Python/Scrapy are generators:
from scrapy import Spider, Request

class MySpider(Spider):
    name = 'csv'

    def start_requests(self):
        with open('file.csv') as f:
            for line in f:
                if not line.strip():
                    continue
                yield Request(line.strip())
To explain further:
The Scrapy engine uses start_requests to generate requests as it goes. It will keep generating requests until the concurrent request limit is reached (settings like CONCURRENT_REQUESTS).
Also worth noting that by default Scrapy crawls depth-first: newer requests take priority, so the start_requests loop will be the last to finish.
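To make those two knobs concrete, here is a hedged sketch of how they could be set per spider via custom_settings; the breadth-first values are the ones the Scrapy FAQ suggests, so check them against your Scrapy version:
class MySpider(Spider):
    name = 'csv'

    custom_settings = {
        'CONCURRENT_REQUESTS': 16,  # how many requests Scrapy keeps in flight at once
        # switch from the default depth-first order to breadth-first:
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',
    }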
You're almost there already. The only change is in start_urls, which you want to be all the URLs in the *.csv file. The following code implements that change:
with open('data.csv') as file:
    start_urls = [line.strip() for line in file]
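For completeness, a hedged sketch of where those two lines would sit in the spider from the question, assuming data.csv holds one bare URL per line (the parse method stays exactly as in the question and is only hinted at here):
import scrapy

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    allowed_domains = ['alibaba.com']

    # built once, at class-definition time
    with open('data.csv') as file:
        start_urls = [line.strip() for line in file if line.strip()]

    def parse(self, response):
        ...  # same selectors and zip() loop as in the question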
Let us assume you have stored the URL list in the form of a dataframe and you want to loop over each URL inside it. My approach, which worked for me, is given below.
import scrapy
import pandas as pd

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    #allowed_domains = ['alibaba.com']
    #start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def start_requests(self):
        # fileContainingUrls.csv is a csv file which has a column named 'URLS'
        # containing all the urls you want to loop over.
        df = pd.read_csv('fileContainingUrls.csv')
        urlList = df['URLS'].to_list()
        for i in urlList:
            yield scrapy.Request(url=i, callback=self.parse)

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()

        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3]
            }
            yield scraped_info

How to add scraped items into a set and execute when condition is met?

This piece of code is expected to add each extracted reviewId into a set (in order to omit duplicates). Then there is a check: when the set length reaches 100, a callback is executed and a long URL string with all the ids is passed to the main extract function.
How do I do this (save all the ids extracted from different callbacks into the same set and use it further), either with built-in tools or with the code I have? The problem now is that the length-check branch is never entered.
Update: I believe there are two options: pass the set as meta to each callback, or somehow use an Item for this. But I don't know how.
import scrapy
from scrapy.shell import inspect_response

class QuotesSpider(scrapy.Spider):
    name = "tripad"
    list = set()

    def start_requests(self):
        url = "https://www.tripadvisor.com/Hotel_Review-g60763-d122005-Reviews-or{}-The_New_Yorker_A_Wyndham_Hotel-New_York_City_New_York.html#REVIEWS"
        for i in range(0, 500, 5):
            yield scrapy.Request(url=url.format(i), callback=self.parse)

    def parse(self, response):
        for result in response.xpath('//div[contains(@id,"review_")]/@id').extract():
            if "review" in result[:8]:
                QuotesSpider.list.add(result[7:] + "%2C")
            if len(QuotesSpider.list) == 100:
                url = "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS&metaReferer=Hotel_Review&reviews="
                for i in QuotesSpider.list:
                    url += i
                yield scrapy.Request(url=url, callback=self.parse_page)
There are several ways of doing this; however, I'd advise splitting your spider into two parts:
A spider that collects review ids:
from scrapy import Spider

class CollectorSpider(Spider):
    name = 'collect_reviews'

    def parse(self, response):
        review_ids = ...
        for review_id in review_ids:
            yield {'review_id': review_id}
A spider that uses the collected review ids to fetch the review content:
import json
from scrapy import Spider, Request

class ConsumerSpider(Spider):
    name = 'consume_reviews'

    def start_requests(self):
        # self.file is a spider argument, e.g. passed on the command line with -a file=...
        with open(self.file, 'r') as f:
            data = json.loads(f.read())
        for i in range(0, len(data), 100):
            ids = [item['review_id'] for item in data[i:i + 100]]
            # make url from ids
            url = ''
            yield Request(url)

    def parse(self, response):
        # crawl 100 reviews here
        pass
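To connect the two parts (this chaining is my addition, reusing the URL pattern from the question): run the collector with a JSON feed, hand that file to the consumer, and build each batched URL from a group of 100 ids. A sketch of the placeholder lines, under those assumptions:
# scrapy crawl collect_reviews -o reviews.json
# scrapy crawl consume_reviews -a file=reviews.json
def start_requests(self):
    base = ("https://www.tripadvisor.com/OverlayWidgetAjax"
            "?Mode=EXPANDED_HOTEL_REVIEWS&metaReferer=Hotel_Review&reviews=")
    with open(self.file, 'r') as f:
        data = json.loads(f.read())
    for i in range(0, len(data), 100):
        ids = [item['review_id'] for item in data[i:i + 100]]
        # the question's code appends an encoded comma (%2C) after each id
        yield Request(url=base + '%2C'.join(ids), callback=self.parse)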

Crawl website from list of values using scrapy

I have a list of NPIs for which I want to scrape the provider names from npidb.org.
The NPI values are stored in a csv file.
I am able to do it manually by pasting the URLs into the code. However, I am unable to figure out how to do it when I have a list of NPIs and want the provider name for each of them.
Here is my current code:
import scrapy
from scrapy.spider import BaseSpider

class MySpider(BaseSpider):
    name = "npidb"

    def start_requests(self):
        urls = [
            'https://npidb.org/npi-lookup/?npi=1366425381',
            'https://npidb.org/npi-lookup/?npi=1902873227',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'npidb-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Assuming you have a list of NPIs from a csv file, you can simply use format to change the website address as follows (I also added the part that reads the list from the csv file; if you already have it, you can omit that part):
def start_requests(self):
    # get npis from csv file
    npis = []
    with open('test.csv', 'r') as f:
        for line in f.readlines():
            npis.append(line.strip())
    # generate the list of addresses depending on npi
    start_urls = []
    for npi in npis:
        start_urls.append('https://npidb.org/npi-lookup/?npi={}'.format(npi))
    for url in start_urls:
        yield scrapy.Request(url=url, callback=self.parse)
Well, it depends on the structure of your csv file, but if it contains the npis in separate lines, you could do something like
def start_requests(self):
    with open('npis.csv') as f:
        for line in f:
            yield scrapy.Request(
                url='https://npidb.org/npi-lookup/?npi={}'.format(line.strip()),
                callback=self.parse
            )

Scrapy - Importing Excel .csv as start_url

So I'm building a scraper that imports a .csv Excel file, which has one row of ~2,400 websites (each website in its own column), and uses these as the start_urls. I keep getting an error saying that I am passing in a list and not a string. I think this may be caused by the fact that my list basically just has one really long list in it that represents the row. How can I overcome this and basically put each website from my .csv as its own separate string within the list?
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
exceptions.TypeError: Request url must be str or unicode, got list:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import HtmlResponse
from tutorial.items import DanishItem
from scrapy.http import Request
import csv

with open('websites.csv', 'rbU') as csv_file:
    data = csv.reader(csv_file)
    scrapurls = []
    for row in data:
        scrapurls.append(row)

class DanishSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = []
    start_urls = scrapurls

    def parse(self, response):
        for sel in response.xpath('//link[@rel="icon" or @rel="shortcut icon"]'):
            item = DanishItem()
            item['website'] = response
            item['favicon'] = sel.xpath('./@href').extract()
            yield item
Thanks!
Joey
Just generating a list for start_urls does not work, as is clearly written in the Scrapy documentation.
From documentation:
You start by generating the initial Requests to crawl the first URLs, and specify a callback function to be called with the response downloaded from those requests.
The first requests to perform are obtained by calling the start_requests() method which (by default) generates Request for the URLs specified in the start_urls and the parse method as callback function for the Requests.
I would rather do it in this way:
def get_urls_from_csv():
    with open('websites.csv', 'rbU') as csv_file:
        data = csv.reader(csv_file)
        scrapurls = []
        for row in data:
            scrapurls.extend(row)  # each cell in the row is a separate URL
        return scrapurls

class DanishSpider(scrapy.Spider):
    ...

    def start_requests(self):
        return [scrapy.http.Request(url=start_url) for start_url in get_urls_from_csv()]
I find the following useful when in need:
import csv
import scrapy

class DanishSpider(scrapy.Spider):
    name = "rei"

    with open("output.csv", "r") as f:
        reader = csv.DictReader(f)
        start_urls = [item['Link'] for item in reader]

    def parse(self, response):
        yield {"link": response.url}
Try opening the .csv file inside the class (not outside, as you did before) and appending to start_urls. This solution worked for me. Hope this helps :-)
class DanishSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = []
    start_urls = []

    f = open('websites.csv', 'r')
    for i in f:
        u = i.split('\n')
        start_urls.append(u[0])
for row in data:
    scrapurls.append(row)
Here row is a list: [column1, column2, ...]. So I think you need to extract the columns and append them to your start_urls.
for row in data:
    # if every column in the row is a url string
    for column in row:
        scrapurls.append(column)
Try this way also:
filee = open("filename.csv", "r+")
r = [i for i in filee]
# remove the '\n' newline from each url
start_urls = [r[j].replace('\n', '') for j in range(len(r))]
