scrapy to get into next page and download all files - python

I am new to Scrapy and Python. I am able to get details from the URL; now I want to enter each link and download all the files (.htm and .txt).
My Code
import scrapy
class legco(scrapy.Spider):
    """Spider that walks SEC EDGAR company search results and yields the
    URL of each company's 'Documents' page."""

    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        """Follow each company row in the results table."""
        # XPath selects attributes with '@'; the '#' in the posted snippet is
        # a copy/paste artifact and makes the expression invalid.
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_page)

    def parse_page(self, response):
        """Yield the absolute URL behind every 'Documents' button."""
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield {"links": targetLink}
And I need to enter each link and download all the files ending with .htm and .txt. The code below is not working:
# NOTE(review): this fragment cannot run as posted:
#  - 'urlparse.urljoin' is the Python 2 spelling (Python 3: urllib.parse.urljoin,
#    or simply response.urljoin(...) inside a Scrapy callback)
#  - 'base_url' and 'Request' are never defined in the snippet
#  - only '.htm' is checked even though the question also wants '.txt'
if link.endswith('.htm'):
link = urlparse.urljoin(base_url, link)
req = Request(link, callback=self.save_pdf)
yield req
# Writes the raw response body to a file named after the last URL path segment.
def save_pdf(self, response):
path = response.url.split('/')[-1]
with open(path, 'wb') as f:
f.write(response.body)
Can Anyone help me with this ? Thanks in Advance.

Try the following to get the files downloaded in your desktop or wherever you mention within the script:
import scrapy, os
class legco(scrapy.Spider):
    """Crawl SEC EDGAR search results and download every .htm/.txt filing
    document into a local folder."""

    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    # Destination folder for downloads; created on demand in download_files().
    dirf = r"C:\Users\WCS\Desktop\Storage"

    def parse(self, response):
        """Follow each company row in the results table."""
        # '@' (not '#') is the XPath attribute marker.
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_links)

    def parse_links(self, response):
        """Follow each 'Documents' button to the filing-detail page."""
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            yield scrapy.Request(url=response.urljoin(links), callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        """Request every .htm / .txt document listed on the filing page."""
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            # str.endswith accepts a tuple, so one call covers both extensions.
            if links.endswith((".htm", ".txt")):
                yield scrapy.Request(url=response.urljoin(links), callback=self.download_files)

    def download_files(self, response):
        """Write the response body into dirf, named after the last URL segment."""
        os.makedirs(self.dirf, exist_ok=True)
        # Build an absolute target path instead of os.chdir(): mutating the
        # process-wide working directory from a concurrent callback is fragile.
        path = os.path.join(self.dirf, response.url.split('/')[-1])
        with open(path, 'wb') as f:
            f.write(response.body)
To be clearer: you need to specify explicitly dirf = r"C:\Users\WCS\Desktop\Storage" where C:\Users\WCS\Desktop or something will be your desired location. However, the script will automatically create Storage folder to save those files within.

Related

Parse not being called

I have the code below that I want to loop through URLs in a CSV, and for each URL I want to run some selectors and return the data into a CSV output. It seems to be looping through the start URLs, but it's raising an error saying that `parse` is not defined. I can't understand where I'm going wrong here. Any help appreciated!
import scrapy
import csv
class CbdSitechekerSpider(scrapy.Spider):
    """Read start URLs from sites.csv and yield the first link found on each page."""

    name = 'cbd_sitecheker'

    # NOTE(review): this class-body loop is redundant — start_requests() below
    # overrides start_urls and reads the same file; kept for compatibility.
    start_urls = []
    for url in open('sites.csv'):
        start_urls.append(url)

    def start_requests(self):
        with open('sites.csv', 'r') as csvf:
            for url in csvf:
                yield scrapy.Request(url, callback=self.parse_url)

    def parse_url(self, response):
        # Fixed from 'response.xpath=(...)': that ASSIGNED a tuple to the
        # attribute instead of calling it. Also '@href', not '#href'.
        links = response.xpath('//a/@href').extract_first()
        yield {'links': links}
Check your indentation. The functions start_requests and parse_url need to be indented to the right, because right now they don't belong to your class.
class CbdSitechekerSpider(scrapy.Spider):
    """Properly indented spider: requests each URL line of sites.csv and
    yields the first anchor href on each page."""

    name = 'cbd_sitecheker'

    def start_requests(self):
        with open('sites.csv', 'r') as csvf:
            for url in csvf:
                yield scrapy.Request(url, callback=self.parse_url)

    def parse_url(self, response):
        # Call xpath — the original 'response.xpath=(...)' was an assignment,
        # not a call — and use '@href' (valid XPath), not '#href'.
        links = response.xpath('//a/@href').extract_first()
        yield {'links': links}
Try this:
def start_requests(self):
    """Schedule one request per URL line of sites.csv."""
    # Resolve the bound callback once, up front, as the answer suggests.
    callback = self.parse_url
    with open('sites.csv', 'r') as url_file:
        for line in url_file:
            yield scrapy.Request(line, callback=callback)
This way the callback will be defined before you pass it into the Request initialize function.

How to traverse an entire domain instead of providing individual links

Currently our spider works off a list of hard coded urls, would like to change that to just work off the main domain.
How can we change the below code to just expect the domain
https://www.example.com/shop/
If there is a good source with examples that would be great.
def start_requests(self):
    """Yield one request per hard-coded product page, parsed by self.parse."""
    product_pages = [
        # 'https://www.example.com/shop/outdoors-unknown-hart-creek-fleece-hoodie',
        'https://www.example.com/shop/adidas-unknown-essentials-cotton-fleece-3s-over-head-hoodie#repChildCatSku=111767466',
        'https://www.example.com/shop/unknown-metallic-long-sleeve-shirt#repChildCatSku=115673740',
        'https://www.example.com/shop/unknown-fleece-full-zip-hoodie#repChildCatSku=111121673',
        'https://www.example.com/shop/unknown-therma-fleece-training-hoodie#repChildCatSku=114784077',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/under-unknown-rival-fleece-full-zip-hoodie#repChildCatSku=115448841',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/adidas-unknown-essentials-3-stripe-fleece-sweatshirt#repChildCatSku=115001812',
        'https://www.example.com/shop/under-unknown-fleece-logo-hoodie#repChildCatSku=115305875',
        'https://www.example.com/shop/under-unknown-heatgear-long-sleeve-shirt#repChildCatSku=107534192',
        'https://www.example.com/shop/unknown-long-sleeve-legend-hoodie#repChildCatSku=112187421',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-sportswear-funnel-neck-hoodie-111112208#repChildCatSku=111112208',
        'https://www.example.com/shop/unknown-therma-swoosh-fleece-training-hoodie#repChildCatSku=114784481',
    ]
    for page_url in product_pages:
        yield scrapy.Request(url=page_url, callback=self.parse)
def parse(self, response):
    """Save the page's price and sku text nodes to academy-<slug>.txt.

    The filename slug is the last path segment of the response URL.
    """
    page = response.url.split("/")[-1]
    filename = 'academy-%s.txt' % page
    # '@itemprop' is the valid XPath attribute test; the posted '#itemprop'
    # is a copy/paste artifact and raises a ValueError in Scrapy.
    res2 = response.xpath("//span[@itemprop='price']/text()|//span[@itemprop='sku']/text()").extract()
    res = '\n'.join(res2)
    with open(filename, 'w') as f:
        f.write(res)
    self.log('Saved file %s' % filename)
Just for pure traversing you can make:
class MySpider(scrapy.Spider):
    """Minimal crawler: starting at /shop/, follow every anchor that stays
    within example.com."""

    name = 'my'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/shop/']

    def parse(self, response):
        # response.follow accepts an <a> selector directly and resolves
        # relative URLs against the current page.
        anchors = response.css('a')
        for anchor in anchors:
            yield response.follow(anchor)
But this task seems meaningless. Can you detail your question?

How can I open multiple links on a webpage and scrape there data?

I hope you are all doing well with your health and your R&D work.
import webbrowser
import scrapy
from urllib.request import urlopen
import re
from scrapy.selector import Selector
class QuotesSpider(scrapy.Spider):
    """Fetch a seed page, extract every quoted http(s)/ftp(s) link from its
    HTML, and save each linked page's body to disk."""

    name = "forum"

    def start_requests(self):
        # NOTE: the original had a trailing comma after the list literal,
        # which turned 'urls' into a one-element tuple containing the list.
        urls = ['https://tribune.com.pk/']  # 'https://www.siasat.pk/forum/content.php/', 'http://hamariweb.com/news/', 'https://www.urdupoint.com/pakistan/all-news/'
        for url in urls:
            website = urlopen(url)
            # webbrowser.open expects a URL string, not an HTTPResponse object.
            webbrowser.open(url)
            print("HELLO WORLD")
            # urlopen().read() returns bytes; decode so re.findall gets str.
            html = website.read().decode('utf-8', errors='replace')
            # The pattern has two groups, so each match is a (url, scheme)
            # tuple; keep only the full URL group.
            all_links = re.findall(r'"((http|ftp)s?://.*?)"', html)
            for link, _scheme in all_links:
                yield scrapy.Request(url=link, callback=self.parse)

    def parse(self, response):
        """Save the response body, named after the second-to-last URL segment."""
        page = response.url.split('/')[-2]
        filename = '%s' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I want to open a webpage that contains many other links; I want to open all of those and have Scrapy scrape all of those web pages. Please help me out.
Thanks in Advance.
I have tried with monsterindia.com, opening the page using Scrapy; that page contains multiple links. I have scraped all the data from the respective links, and we can also do pagination. The following code may be useful.
class MonsterSpider(scrapy.Spider):
    """Scrape job links from Monster India search results and follow the
    'Next' pagination link."""

    name = 'monster'
    start_urls = ['http://jobsearch.monsterindia.com/searchresult.html?day=1&jbc=22']
    # NOTE(review): class-level item shared by all callbacks — presumably
    # consumed in parse_details (not shown here); verify against that code.
    item = BotItem()
    count = 1

    def parse(self, response):
        for href in response.css('h2.seotitle > a::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_details)

        # '@' (not '#') selects attributes in XPath.
        next_page_url = response.css('ul.pager').xpath(
            '//a[contains(text(), "Next")]/@althref').extract_first()
        print(next_page_url)  # Python 3 print() — original used the Python 2 statement
        if next_page_url:
            # The real next-page number is embedded in the onclick handler as
            # a quoted argument, e.g. onclick="...('123')"; extract it.
            nextpage = response.css('ul.pager').xpath(
                '//a[contains(text(), "Next")]/@onclick').extract_first()
            searchresult_num = nextpage.split("'")[1].strip()
            next_page_url = "http://jobsearch.monsterindia.com/searchresult.html?day=1&n=" + searchresult_num
            next_page_url = response.urljoin(next_page_url)
            print(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

How can I start to write Unit test in web Scrapy using python?

class AljazeeraSpider(XMLFeedSpider):
    """Crawl aljazeera.com headline boxes and follow each headline to scrape
    its detail page into a NewsItem."""

    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # The xPath selector
        # '@class' (valid XPath), not '#class'.
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            # Alert by mail when the page structure changed and nothing matched.
            MailNotify().send_mail("Aljazeera", "Scraper Report")

        items = []
        # NOTE: the original wrote 'for titles in titles', shadowing the node
        # list with each node; the loop variable is renamed for safety.
        for title in titles:
            item = NewsItem()
            item['title'] = escape(''.join(title.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(title.select('a/@href').extract()))
            item['description'] = ''
            request = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(request)
        return items

    def parse_detail(self, response):
        """Fill in the detail fields on the item carried via request meta."""
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl the website, and I have some knowledge of unittest in Python. But how can I write a unit test to check that the link is working and whether item['location'] and item['details'] return a value or not? I have studied Scrapy contracts but cannot understand anything. So how can I write unit tests in this case?
If we are talking specifically about how to test the spiders (not pipelines, or loaders), then what we did is provided a "fake response" from a local HTML file. Sample code:
import os
from scrapy.http import Request, TextResponse
def fake_response(file_name=None, url=None):
    """Create a Scrapy fake HTTP response from a HTML file.

    file_name: path to the fixture file; a relative path is resolved against
        this module's directory. If omitted, the response body is empty.
    url: the response URL (defaults to http://www.example.com).
    Returns a TextResponse carrying the file content as its body.
    """
    if not url:
        url = 'http://www.example.com'
    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            # Relative fixture paths live next to this test module.
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        # Close the handle deterministically — the original leaked it.
        with open(file_path, 'r') as fixture:
            file_content = fixture.read()
    else:
        file_content = ''
    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase
class MyTestCase(TestCase):
    """Exercise MySpider.parse against a canned local response."""

    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        fixture = fake_response('input.html')
        item = self.spider.parse(fixture)
        self.assertEqual(item['title'], 'My Title')
        # ...
Aside from that, you should definitely start using Item Loaders with input and output processors - this would help to achieve a better modularity and, hence, isolation - spider would just yield item instances, data preparation and modification would be incapsulated inside the loader, which you would test separately.

Scrapy crawl in order

I can't figure out how to make scrapy crawl links in order
I've got a page with articles and in each one there is a title but the article doesn't match the title
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
I've got something like this:
class Getgot(Spider):
    """Collect article links from a local page into links.txt, then record
    each linked page's first iframe (or '[]') into frames.txt."""

    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    # NOTE(review): defined but never used below ('@', not '#', in XPath).
    el = '//div[@article]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        articles = hxs.select('//article')
        # 'with' guarantees the file is flushed and closed; the original
        # opened it and never closed it.
        with open("links.txt", "w") as links_file:
            for article in articles:
                hrefs = article.select('a/@href').extract()
                links_file.write(str(hrefs[0]) + '\n')
                yield Request(str(hrefs[0]), callback=self.parse_page)

    def parse_page(self, res):
        hxs = HtmlXPathSelector(res)
        frames = hxs.select('//iframe').extract()
        # Append to the same local frames.txt in both cases; the original's
        # else-branch wrote to '/frames.txt' (filesystem root), which looked
        # like a typo rather than intent.
        with open("frames.txt", "a") as frames_file:
            if frames:
                frames_file.write(str(frames[0]) + '\n')
            else:
                frames_file.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title ?
A few tips: 1) update your scrapy syntax with the latest version 2) don't write any files from the spider, write it in a pipeline or export feed. 3) if you need to transfer data from one function to the next, use the meta attribute.
def parse(self, response):
    """Follow every article link, carrying the source link along in meta."""
    # '@href' is the valid XPath attribute selector ('#href' is not XPath).
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    """Emit one item per iframe, paired with the link that led here."""
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
And then you export it to csv or json or whatever, to store the link and the frame together.

Categories