Parse not being called - python

I have the code below. I want to loop through the URLs in a CSV file and, for each URL, run some selectors and return the data into a CSV output. It seems to be looping through the start URLs, but it's raising an error saying that parse is not defined. I can't understand where I'm going wrong here. Any help appreciated!
import scrapy
import csv

class CbdSitechekerSpider(scrapy.Spider):
    name = 'cbd_sitecheker'

    start_urls = []
    for url in open('sites.csv'):
        start_urls.append(url)

def start_requests(self):
    with open('sites.csv','r') as csvf:
        for url in csvf:
            yield scrapy.Request(url, callback = self.parse_url)

def parse_url(self, response):
    links = response.xpath=('//a/@href').extract_first()
    yield {'links' : links}

Check your indentation. The functions start_requests and parse_url need to be indented one level to the right, because right now they don't belong to your class.
class CbdSitechekerSpider(scrapy.Spider):
    name = 'cbd_sitecheker'

    def start_requests(self):
        with open('sites.csv','r') as csvf:
            for url in csvf:
                yield scrapy.Request(url, callback = self.parse_url)

    def parse_url(self, response):
        links = response.xpath('//a/@href').extract_first()
        yield {'links' : links}

Try this:
def start_requests(self):
    parse_url = self.parse_url
    with open('sites.csv','r') as csvf:
        for url in csvf:
            yield scrapy.Request(url, callback = parse_url)
This way the callback will be defined before you pass it into the Request initialize function.

Related

What is this Scrapy error: ReactorNotRestartable?

I do not understand why my spider won't run. I tested the CSS selector separately, so I do not think it is the parsing method.
Traceback message:
ReactorNotRestartable:
class espn_spider(scrapy.Spider):
    name = "fsu2021_spider"

    def start_requests(self):
        urls = "https://www.espn.com/college-football/team/_/id/52"
        for url in urls:
            yield scrapy.Request(url = url, callback = self.parse_front)

    def parse(self, response):
        schedule_link = response.css('div.global-nav-container li > a::attr(href)')

process = CrawlerProcess()
process.crawl(espn_spider)
process.start()
urls = "https://www.espn.com/college-football/team/_/id/52"
for url in urls:
You're iterating over the characters of urls; change it to a list:
urls = ["https://www.espn.com/college-football/team/_/id/52"]
...
...
Also, you don't have a parse_front function. If you just didn't add it to the snippet, ignore this; if it was a mistake, change it to:
yield scrapy.Request(url=url, callback=self.parse)
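Putting both fixes together, a minimal sketch of the corrected spider could look like this (assuming parse is the callback you actually want and the CSS selector returns the link you are after):

import scrapy
from scrapy.crawler import CrawlerProcess

class espn_spider(scrapy.Spider):
    name = "fsu2021_spider"

    def start_requests(self):
        # a list, so the loop yields whole URLs instead of single characters
        urls = ["https://www.espn.com/college-football/team/_/id/52"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # .get() returns the first matching href (or None if nothing matches)
        schedule_link = response.css('div.global-nav-container li > a::attr(href)').get()
        yield {"schedule_link": schedule_link}

process = CrawlerProcess()
process.crawl(espn_spider)
process.start()

As for the ReactorNotRestartable traceback itself: it usually means process.start() was executed more than once in the same Python process (for example when re-running the script in an interactive session or notebook); starting a fresh interpreter for each run avoids it.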

scrapy to get into next page and download all files

I am new to Scrapy and Python. I am able to get details from a URL, and now I want to follow each link and download all the files (.htm and .txt).
My code:
import scrapy

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_page)

    def parse_page(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield {"links":targetLink}
I need to follow each link and download all the files ending with .htm and .txt. The code below is not working:
        if link.endswith('.htm'):
            link = urlparse.urljoin(base_url, link)
            req = Request(link, callback=self.save_pdf)
            yield req

    def save_pdf(self, response):
        path = response.url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.body)
Can anyone help me with this? Thanks in advance.
Try the following to get the files downloaded to your desktop, or wherever you specify within the script:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"C:\Users\WCS\Desktop\Storage"
        if not os.path.exists(dirf): os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
To be clear: you need to explicitly specify dirf = r"C:\Users\WCS\Desktop\Storage", where C:\Users\WCS\Desktop (or whatever you choose) is your desired location. The script will automatically create the Storage folder and save the files in it.
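As a side note, if you would rather not change the working directory with os.chdir, a small sketch of the same download callback using an absolute path (same hypothetical Storage folder) could be:

    def download_files(self, response):
        dirf = r"C:\Users\WCS\Desktop\Storage"
        os.makedirs(dirf, exist_ok=True)  # create the folder if it is missing
        # build the full file path instead of changing the working directory
        path = os.path.join(dirf, response.url.split('/')[-1])
        with open(path, 'wb') as f:
            f.write(response.body)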

How to loop through multiple URLs to scrape from a CSV file in Scrapy?

My code for scraping data from the Alibaba website:
import scrapy

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    allowed_domains = ['alibaba.com']
    start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()

        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3],
            }
            yield scraped_info
Notice the start URL: the spider only scrapes that given URL, but I want this code to scrape all the URLs present in my CSV file, which contains a large number of them.
Sample of the data.csv file:
'https://www.alibaba.com/showroom/shock-absorber.html',
'https://www.alibaba.com/showroom/shock-wheel.html',
'https://www.alibaba.com/showroom/shoes-fastener.html',
'https://www.alibaba.com/showroom/shoes-women.html',
'https://www.alibaba.com/showroom/shoes.html',
'https://www.alibaba.com/showroom/shoulder-long-strip-bag.html',
'https://www.alibaba.com/showroom/shower-hair-band.html',
...........
How do I import all the links from the CSV file into the code at once?
To loop through a file correctly without loading all of it into memory, you should use generators, since both file objects and the start_requests method in Python/Scrapy are generators:
from scrapy import Spider, Request

class MySpider(Spider):
    name = 'csv'

    def start_requests(self):
        with open('file.csv') as f:
            for line in f:
                if not line.strip():
                    continue
                yield Request(line)
To explain further:
The Scrapy engine uses start_requests to generate requests as it goes. It will keep generating requests until the concurrent request limit is reached (controlled by settings like CONCURRENT_REQUESTS).
Also worth noting that by default Scrapy crawls depth-first: newer requests take priority, so the start_requests loop will be the last to finish.
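For instance, if you want to cap how many of those requests are in flight at once, you can tune that limit per spider through Scrapy's standard custom_settings attribute; a minimal sketch:

class MySpider(Spider):
    name = 'csv'
    # allow at most 8 concurrent requests for this spider
    custom_settings = {'CONCURRENT_REQUESTS': 8}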
You're almost there already. The only change is in start_urls, which you want to be "all the URLs in the *.csv file". The following code easily implements that change.
with open('data.csv') as file:
    start_urls = [line.strip() for line in file]
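In context, that assignment replaces the hard-coded start_urls in the class body; a sketch, assuming the file is named data.csv and sits next to the spider:

import scrapy

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    allowed_domains = ['alibaba.com']

    # read every non-empty line of the CSV as a start URL
    with open('data.csv') as file:
        start_urls = [line.strip() for line in file if line.strip()]

    def parse(self, response):
        ...  # same parsing code as in the question

If the lines in data.csv carry quotes and trailing commas as in the sample shown in the question, you would also need to strip those, e.g. line.strip().strip(",'").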
Let us assume you have stored the URL list in the form of a DataFrame and you want to loop over each URL inside it. My approach, which worked for me, is given below.
import scrapy
import pandas as pd

class IndiamartSpider(scrapy.Spider):
    name = 'alibot'
    #allowed_domains = ['alibaba.com']
    #start_urls = ['https://www.alibaba.com/showroom/acrylic-wine-box_4.html']

    def start_requests(self):
        # fileContainingUrls.csv is a CSV file with a column named 'URLS'
        # that contains all the URLs you want to loop over.
        df = pd.read_csv('fileContainingUrls.csv')
        urlList = df['URLS'].to_list()
        for i in urlList:
            yield scrapy.Request(url=i, callback=self.parse)

    def parse(self, response):
        Title = response.xpath('//*[@class="title three-line"]/a/@title').extract()
        Price = response.xpath('//div[@class="price"]/b/text()').extract()
        Min_order = response.xpath('//div[@class="min-order"]/b/text()').extract()
        Response_rate = response.xpath('//i[@class="ui2-icon ui2-icon-skip"]/text()').extract()

        for item in zip(Title, Price, Min_order, Response_rate):
            scraped_info = {
                'Title': item[0],
                'Price': item[1],
                'Min_order': item[2],
                'Response_rate': item[3],
            }
            yield scraped_info

How to modify url before following it in scrapy?

I'm new to Scrapy and this is my second spider:
class SitenameScrapy(scrapy.Spider):
    name = "sitename"
    allowed_domains = ['www.sitename.com', 'sitename.com']
    rules = [Rule(LinkExtractor(unique=True), follow=True)]

    def start_requests(self):
        urls = ['http://www.sitename.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_cat)

    def parse_cat(self, response):
        links = LinkExtractor().extract_links(response)
        for link in links:
            if ('/category/' in link.url):
                yield response.follow(link, self.parse_cat)
            if ('/product/' in link.url):
                yield response.follow(link, self.parse_prod)

    def parse_prod(self, response):
        pass
My problem is that sometimes I have links like http://sitename.com/path1/path2/?param1=value1&param2=value2, where param1 is not important to me, and I want to remove it from the URL before response.follow. I think I could do it with a regex, but I'm not sure that is the 'right way' in Scrapy. Maybe I should use some kind of rule for this?
I think you could use the url_query_cleaner function from the w3lib library. Something like:
from w3lib.url import url_query_cleaner

...
....

def parse_cat(self, response):
    links = LinkExtractor().extract_links(response)
    for link in links:
        url = url_query_cleaner(link.url, ('param2',))
        if '/category/' in url:
            yield response.follow(url, self.parse_cat)
        if '/product/' in url:
            yield response.follow(url, self.parse_prod)
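To make the behaviour concrete, url_query_cleaner keeps only the query parameters you list and drops the rest, so for the example URL from the question:

from w3lib.url import url_query_cleaner

url = 'http://sitename.com/path1/path2/?param1=value1&param2=value2'
print(url_query_cleaner(url, ('param2',)))
# http://sitename.com/path1/path2/?param2=value2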

import strings into scrapy to use as crawl urls

So my question is: how do I tell Scrapy to crawl URLs that differ only by one string? For example: https://www.youtube.com/watch?v=STRING
I have the strings saved in a txt file.
with open("plz_nummer.txt") as f:
cityZIP = f.read().rsplit('\n')
for a in xrange(0,len(cityZIP)):
next_url = 'http://www.firmenfinden.de/?txtPLZ=' + cityZIP[a] + '&txtBranche=&txtKunden='
pass
I would make loading the file with zip codes part of the start_requests method, as a generator. Something along the lines of:
import scrapy

class ZipSpider(scrapy.Spider):
    name = "zipCodes"
    city_zip_list = []

    def start_requests(self):
        with open("plz_nummer.txt") as f:
            self.city_zip_list = f.read().rsplit('\n')
        for city_zip in self.city_zip_list:
            url = 'http://www.firmenfinden.de/?txtPLZ={}&txtBranche=&txtKunden='.format(city_zip)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Anything else you need
        # to do in here
        pass
This should give you a good starting point. Also read this article: https://doc.scrapy.org/en/1.1/intro/tutorial.html
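To try it out, you could save the spider as, say, zip_spider.py (the file name is just an example) and launch it with Scrapy's built-in runner:

scrapy runspider zip_spider.py -o results.json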
