I'm using Scrapy to collect data from stox.vn. I have a urls.txt with about 800 URLs, and I pass them all to my bot. At first it crawls and scrapes fine, but then it stops scraping and only crawls.
2013-06-27 03:24:28+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA> (referer: http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA)
2013-06-27 03:24:28+0700 [stox] DEBUG: Scraped from <200 http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA>
{'chi_phi_ban_hang': u'-7453.41',
'chi_phi_khau_hao_TSCD': u'11890.11',
'chi_phi_quan_ly': u'-5913.60',
'chi_phi_tai_chinh': u'-10677.99',
'chi_phi_tien_lai_vay': u'-5672.17',
'doanh_thu_thuan': u'122008.75',
'gia_von_hang_ban': u'-90790.07',
'lai_co_dong_ct_me': u'11885.60',
'lai_gop': u'31218.69',
'lai_sau_thue': u'11885.60',
'lai_tu_hdkd': u'11376.31',
'loi_ich_CDTS': u'11885.60',
'qtime': u'20101',
'thu_nhap_tai_chinh': u'4202.63',
'thue_TNDN_hl': u'509.29',
'thue_TNDN_ht': u'0',
'ticker': 'AAA'}
.....
2013-06-27 03:24:31+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI> (referer: None)
2013-06-27 03:24:33+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT> (referer: None)
2013-06-27 03:24:36+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=164&iId=289&iIdL=279&eId=1&tId=0status=1&id=-1&cat=&ticker=ACB> (referer: None)
2013-06-27 03:24:38+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=522&iId=180&iIdL=170&eId=0&tId=2status=1&id=-1&cat=&ticker=ACC> (referer: None)
2013-06-27 03:24:40+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=486&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ACE> (referer: None)
2013-06-27 03:24:42+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=2&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ACL> (referer: None)
2013-06-27 03:24:44+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=858&iId=256&iIdL=241&eId=1&tId=2status=1&id=-1&cat=&ticker=ADC> (referer: None)
2013-06-27 03:24:47+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=556&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ADP> (referer: None)
Here is what I'm doing.
In stox/spider/test.py:
from scrapy import log
import logging
from scrapy.log import ScrapyFileLogObserver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from stox.items import StoxItem
from scrapy.http import Request, Response
from scrapy.http.cookies import CookieJar
from scrapy.contrib.exporter import CsvItemExporter
class MySpider(BaseSpider):
    name = "stox"
    allowed_domains = ["stox.vn"]
    start_urls = ["http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=FPT",
                  "http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=SSC"]
    ticker = ""
    items = []

    def __init__(self):
        # write log file here
        logfile = open('testlog.log', 'w')
        log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()  # start logging

    def start_requests(self):
        products = []
        #with open('urls.txt', 'rb') as urls:
        #    for url in urls:
        #        yield Request(url, self.parse)
        # extract url file and call parse()
        f = open("urls.txt")
        start_urls = [url.strip() for url in f.readlines()]
        f.close()
        for url in start_urls:
            yield Request(url, self.parse)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        self.ticker = "".join(hxs.select("//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        my_start_url = "http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=%s" % self.ticker
        # get the cookie of start_url
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        request = Request(my_start_url, callback=self.extractItem,
                          meta={'dont_merge_cookies': True, 'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)  # apply Set-Cookie ourselves
        yield request
    def extractItem(self, response):
        items = []
        # extract ticker from url
        pos = response.url.find('ticker=')
        l = len("ticker=")
        ticker = response.url[pos+l:]
        f = open("data/%s.csv" % ticker, 'w')
        # get the XPath
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p[@data-time]/..")
        for title in titles:
            item = StoxItem()
            item["ticker"] = ticker
            item["qtime"] = "".join(title.select("./p/@data-time").extract())
            item["doanh_thu_thuan"] = ''.join(title.select("./div[1]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["gia_von_hang_ban"] = ''.join(title.select("./div[1]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')  #.encode('utf-8')
            item["lai_gop"] = ''.join(title.select("./div[2]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thu_nhap_tai_chinh"] = ''.join(title.select("./div[2]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tai_chinh"] = ''.join(title.select("./div[2]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_tien_lai_vay"] = ''.join(title.select("./div[2]/p[4]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_ban_hang"] = ''.join(title.select("./div[2]/p[5]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_quan_ly"] = ''.join(title.select("./div[2]/p[6]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_tu_hdkd"] = ''.join(title.select("./div[3]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_ht"] = ''.join(title.select("./div[3]/p[2]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["thue_TNDN_hl"] = ''.join(title.select("./div[3]/p[3]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_sau_thue"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["loi_ich_CDTS"] = ''.join(title.select("./div[4]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["lai_co_dong_ct_me"] = ''.join(title.select("./div[5]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            item["chi_phi_khau_hao_TSCD"] = ''.join(title.select("./div[6]/p[1]/text()").extract()).strip().replace('.', '').replace(',', '.')
            items.append(item)
            # write to file
            row = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (item["ticker"],
                                                                            item["qtime"],
                                                                            item["doanh_thu_thuan"],
                                                                            item["gia_von_hang_ban"],
                                                                            item["lai_gop"],
                                                                            item["thu_nhap_tai_chinh"],
                                                                            item["chi_phi_tai_chinh"],
                                                                            item["chi_phi_tien_lai_vay"],
                                                                            item["chi_phi_ban_hang"],
                                                                            item["chi_phi_quan_ly"],
                                                                            item["lai_tu_hdkd"],
                                                                            item["thue_TNDN_ht"],
                                                                            item["thue_TNDN_hl"],
                                                                            item["lai_sau_thue"],
                                                                            item["loi_ich_CDTS"],
                                                                            item["lai_co_dong_ct_me"],
                                                                            item["chi_phi_khau_hao_TSCD"])
            f.write(row)
        # print "Item %r " % items
        f.close()
        return items
My settings.py
BOT_NAME = 'stox'
SPIDER_MODULES = ['stox.spiders']
NEWSPIDER_MODULE = 'stox.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stox (+http://www.yourdomain.com)'
#ITEM_PIPELINES = ['stox.pipelines.StoxPipeline']
DOWNLOAD_DELAY = 2
#DOWNLOAD_TIMEOUT = 180
#CONCURRENT_REQUESTS = 2
I found that when I change the CONCURRENT_REQUESTS setting, the spider stops scraping after CONCURRENT_REQUESTS responses and from then on only crawls. I think there is a problem with the concurrent processing (is it not freeing the processes?).
UPDATED
The content of urls.txt
http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA
http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI
http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT
.....
Any assistance is greatly appreciated!
Thank you.
PS: I'm very new to Scrapy, and sorry for my poor English.
For each of your 800 URLs, you are writing to a file named after the ticker.
Are the ticker names distinct across all URLs? If they are not, you might be overwriting files. Instead of writing to a file yourself, you can use the export option.
You can read the following thread to learn about exporting the data:
Scrapy : storing the data
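To make that concrete, here is a rough sketch (my own illustration, not code from the question or the linked thread) of moving the CSV writing out of extractItem into an item pipeline, so the spider only yields items and never manages file handles itself. The pipeline class name and the data/ file layout are assumptions:

# stox/pipelines.py -- sketch of a per-ticker CSV pipeline (illustrative only)
from scrapy.contrib.exporter import CsvItemExporter


class StoxCsvPipeline(object):
    def __init__(self):
        self.exporters = {}  # one (exporter, file) pair per ticker

    def process_item(self, item, spider):
        ticker = item['ticker']
        if ticker not in self.exporters:
            f = open('data/%s.csv' % ticker, 'wb')
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.exporters[ticker] = (exporter, f)
        self.exporters[ticker][0].export_item(item)
        return item

    def close_spider(self, spider):
        # flush and close every per-ticker file when the spider finishes
        for exporter, f in self.exporters.values():
            exporter.finish_exporting()
            f.close()

It would be enabled by uncommenting ITEM_PIPELINES in settings.py (for example ITEM_PIPELINES = ['stox.pipelines.StoxCsvPipeline']). If one combined file is enough, the built-in feed exporter needs no pipeline at all: scrapy crawl stox -o data/stox.csv -t csv.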
Related
I'm crawling Google search results via this link:
https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz
I disabled JavaScript in the Chrome browser and obtained the XPath value for the Next button.
xpath -> //*[@id="main"]/footer/div[1]/div/div/a
Here is my code
import scrapy
from ..items import GooglescrapyItem
from urllib.parse import urlparse, parse_qs


class GoogleBotsSpider(scrapy.Spider):
    name = 'GoogleScrapyBot'
    allowed_domains = ['google.com']
    start_urls = [f'https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz']

    def parse(self, response):
        titles = response.xpath('//*[@id="main"]/div/div/div/a/h3/div//text()').extract()
        links = response.xpath('//*[@id="main"]/div/div/div/a/@href').extract()
        next_page = response.xpath('//*[@id="main"]/footer/div/div/div/a/@href').extract()
        items = []
        for idx in range(len(titles)):
            item = GooglescrapyItem()
            item['title'] = titles[idx]
            parsed_url = urlparse(links[idx])
            query_params = parse_qs(parsed_url.query)
            item['link'] = query_params["q"][0]
            items.append(item)
        if next_page:
            next_href = next_page[0]
            next_page_url = 'https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz' + next_href
            request = scrapy.Request(url=next_page_url)
            yield request
        return items
output
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz> (referer: None)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LMrhYP3IOY6v0PEPmKGNoAg&start=10&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LsrhYIf-AdSTr7wPtt-LyA4&start=0&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LMrhYP3IOY6v0PEPmKGNoAg&start=10&sa=N)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=L8rhYJCNCI7_0gSA5qKAAg&start=10&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=LsrhYIf-AdSTr7wPtt-LyA4&start=0&sa=N)
DEBUG: Crawled (200) <GET https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=MMrhYOLRHeLFmAX2w4ioBA&start=0&sa=N> (referer: https://www.google.com/search?q=telsa+%22model3%22+intext:model3&hl=en&rlz/search?q=telsa+%22model3%22+intext:model3&hl=en&ie=UTF-8&ei=L8rhYJCNCI7_0gSA5qKAAg&start=10&sa=N)
...Skip
Question
Google search result pages advance in steps of 10, such as &start=0, &start=10, &start=20, &start=30.
However, my code keeps repeating &start=0 and &start=10 and never moves on to &start=20.
Could you please go over my code?
Try nextPage = response.xpath('//td[@role="heading"]/a/@href').get()
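That selector is probably only half the fix, though: the log above shows the relative next-page href being concatenated onto the full start URL string, which is why the crawl keeps bouncing between &start=0 and &start=10. A minimal sketch of the pagination part using response.urljoin (my adaptation, not the answerer's exact code; the spider name is made up and the item-building loop from the question is omitted):

import scrapy


class GoogleNextPageSketch(scrapy.Spider):
    # Sketch only: shows pagination with response.urljoin, not the full item logic.
    name = 'google_nextpage_sketch'  # hypothetical name
    allowed_domains = ['google.com']
    start_urls = ['https://www.google.com/search?q=telsa+"model3"+intext:model3&hl=en&rlz']

    def parse(self, response):
        # ... yield the GooglescrapyItem objects here, one by one (a trailing
        # "return items" after a yield turns parse into a generator and the
        # returned list is silently dropped) ...
        next_href = response.xpath('//td[@role="heading"]/a/@href').get()
        if next_href:
            # urljoin resolves the relative "/search?...&start=N" href against the
            # current page URL instead of appending it to the original search URL.
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)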
2021-05-07 10:07:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/robots.txt> (referer: None)
2021-05-07 10:07:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa/> (referer: None)
2021-05-07 10:07:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa?s=120> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa/)
2021-05-07 10:07:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa?s=240> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa?s=120)
This is the output I get. It seems like the spider just moves through the pages of results, by selecting the next button and performing the request in line 27.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, Request
from craig.items import CraigItem
from scrapy.selector import Selector


class PhonesSpider(scrapy.Spider):
    name = 'phones'
    allowed_domains = ['tampa.craigslist.org']
    start_urls = ['https://tampa.craigslist.org/d/cell-phones/search/moa/']

    def parse(self, response):
        phones = response.xpath('//p[@class="result-info"]')
        for phone in phones:
            relative_url = phone.xpath('a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)
            title = phone.xpath('a/text()').extract_first()
            price = phone.xpath('//*[@id="sortable-results"]/ul/li[3]/a/span').extract_first()
            yield Request(absolute_url, callback=self.parse_item, meta={'URL': absolute_url, 'Title': title, 'price': price})
        relative_next_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        absolute_next_url = "https://tampa.craigslist.org" + relative_next_url
        yield Request(absolute_next_url, callback=self.parse)

    def parse_item(self, response):
        item = CraigItem()
        item["cl_id"] = response.meta.get('Title')
        item["price"] = response.meta.get
        absolute_url = response.meta.get('URL')
        yield {'URL': absolute_url, 'Title': title, 'price': price}
It seems like in my code the for phone in phones loop doesn't run, which means parse_item is never called and the spider just keeps requesting the next URL. I am following some tutorials and reading the documentation, but I'm still having trouble grasping what I am doing wrong. I have experience coding Arduinos as a hobby when I was young, but no professional coding experience; this is my first foray into a project like this, and I have an OK grasp of the basics of loops, functions, callbacks, etc.
Any help is greatly appreciated.
UPDATE
current output
2021-05-07 15:29:32 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/robots.txt> (referer: None)
2021-05-07 15:29:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/d/cell-phones/search/moa/> (referer: None)
2021-05-07 15:29:33 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2021-05-07 15:29:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html> (referer: https://tampa.craigslist.org/d/cell-phones/search/moa/)
2021-05-07 15:29:36 [scrapy.core.scraper] DEBUG: Scraped from <200 https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html>
{'cl_id': 'postid_7309734640',
'price': '$35',
'title': 'Cut that high cable bill, switch to SPC TV and save. 1400 hd '
'channels',
'url': 'https://tampa.craigslist.org/hil/mob/d/tampa-cut-that-high-cable-bill-switch/7309734640.html'}
2021-05-07 15:29:36 [scrapy.core.engine] INFO: Closing spider (finished)
CURRENT CODE
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, Request
from craig.items import CraigItem
from scrapy.selector import Selector


class PhonesSpider(scrapy.Spider):
    name = 'phones'
    allowed_domains = ['tampa.craigslist.org']
    start_urls = ['https://tampa.craigslist.org/d/cell-phones/search/moa/']
    base_url = 'https://tampa.craigslist.org'

    def parse(self, response):
        phones = response.xpath('//div[@class="result-info"]')
        for phone in phones:
            x = response.meta.get('x')
            n = -1
            url = response.xpath('//a[@class="result-title hdrlnk"]/@href').getall()
            relative_url = phone.xpath('//a[@class="result-title hdrlnk"]/@href').get()
            absolute_url = response.urljoin(relative_url)
            title = phone.xpath('//a[@class="result-title hdrlnk"]/text()').getall()
            price = phone.xpath('//span[@class="result-price"]/text()').getall()
            cl_id = phone.xpath('//a[@class="result-title hdrlnk"]/@id').getall()
            yield Request(absolute_url, callback=self.parse_item, meta={'absolute_url': absolute_url, 'url': url, 'title': title, 'price': price, 'cl_id': cl_id, 'n': n})

    def parse_item(self, response):
        n = response.meta.get('n')
        x = n + 1
        item = CraigItem()
        item["title"] = response.meta.get('title')[x]
        item["cl_id"] = response.meta.get('cl_id')[x]
        item["price"] = response.meta.get('price')[x]
        item["url"] = response.meta.get('url')[x]
        yield item
        absolute_next_url = response.meta.get('url')[x]
        absolute_url = response.meta.get('absolute_url')
        yield Request(absolute_next_url, callback=self.parse, meta={'x': x})
I am now able to retrieve the desired content for a posting (URL, price, title and craigslist ID), but now my spider automatically closes after pulling just one result. I am having trouble understanding how to share the variables (x) and (n) between the two functions. Logically, after pulling one listing's data, as above, in the format
cl_id
Price
title
url
I would like to proceed back to the initial parse function and move on to the next item in the list of URLs retrieved by
response.xpath('//a[@class="result-title hdrlnk"]/@href').getall()
which (when run in the Scrapy shell) successfully pulls all the URLs.
How do I go about implementing this logic: start with [0] in the list, perform parse, then parse_item, output the item, then update a variable (n, which starts at 0 and needs to increase by 1 after each item), then call n in parse_item with its updated value and use it, for example item["title"] = response.meta.get('title')[x], to pick the right entry from the lists of URLs, titles and so on, and then run parse_item again, outputting one item at a time, until all the values in the URL list have been output with their related price, cl_id and title?
I know the code is messy as hell and I don't fully understand the basics yet, but I'm committed to getting this to work and learning it the hard way, rather than starting from the ground up with Python.
Class result-info is used within the div block, so you should write:
phones = response.xpath('//div[@class="result-info"]')
That being said, I didn't check/fix your spider further (it seems there are only parsing errors, not functional ones).
As a suggestion for the future, you can use Scrapy shell for quickly debugging the issues:
scrapy shell "your-url-here"
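Going one step past this answer (my own illustration, not the answerer's code): the UPDATE version selects every field with document-wide // XPaths inside the loop, so each iteration requests the same first listing and the dupefilter drops the rest, which I suspect is why only one item comes out. A sketch of the per-result approach, using relative .// lookups and yielding plain dicts instead of CraigItem so it stays self-contained:

import scrapy


class PhonesSketchSpider(scrapy.Spider):
    # Sketch only: one item per listing, fields selected relative to each result.
    name = 'phones_sketch'  # hypothetical name
    allowed_domains = ['tampa.craigslist.org']
    start_urls = ['https://tampa.craigslist.org/d/cell-phones/search/moa/']

    def parse(self, response):
        for phone in response.xpath('//div[@class="result-info"]'):
            # ".//" keeps each lookup inside the current result block, so every
            # iteration picks up that listing's own title, price, id and href.
            title = phone.xpath('.//a[@class="result-title hdrlnk"]/text()').get()
            price = phone.xpath('.//span[@class="result-price"]/text()').get()
            cl_id = phone.xpath('.//a[@class="result-title hdrlnk"]/@id').get()
            relative_url = phone.xpath('.//a[@class="result-title hdrlnk"]/@href').get()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(
                absolute_url,
                callback=self.parse_item,
                meta={'title': title, 'price': price, 'cl_id': cl_id, 'url': absolute_url},
            )

    def parse_item(self, response):
        # One item per detail page; no shared counters between callbacks needed.
        yield {
            'title': response.meta['title'],
            'price': response.meta['price'],
            'cl_id': response.meta['cl_id'],
            'url': response.meta['url'],
        }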
I have a CrawlSpider which crawls a given site up to a certain depth and downloads the PDFs on that site. Everything works fine, but along with the link of each PDF I also need the text inside the anchor tag.
For example:
<a href='../some/pdf/url/pdfname.pdf'>Project Report</a>
Consider this anchor tag: in the callback I get the response object, and along with it I need the text inside the tag, e.g. 'Project Report'.
Is there any way to get this information along with the response object? I have gone through the https://docs.scrapy.org/en/latest/topics/selectors.html page, but it is not what I am looking for.
sample code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DocumunetPipeline(scrapy.Item):
    document_url = scrapy.Field()
    name = scrapy.Field()  # name of pdf/doc file
    depth = scrapy.Field()


class MySpider(CrawlSpider):
    name = 'pdf'
    start_urls = ['http://www.someurl.com']
    allowed_domains = ['someurl.com']
    rules = (
        Rule(LinkExtractor(tags="a", deny_extensions=[]),
             callback='parse_document', follow=True),
    )

    def parse_document(self, response):
        content_type = (response.headers
                        .get('Content-Type', None)
                        .decode("utf-8"))
        url = response.url
        if content_type == "application/pdf":
            name = response.headers.get('Content-Disposition', None)
            document = DocumunetPipeline()
            document['document_url'] = url
            document['name'] = name
            document['depth'] = response.meta.get('depth', None)
            yield document
It seems like it's not documented, but the meta attribute does contain the link text. It is updated in this line.
A minimal example would be:
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
class LinkTextSpider(CrawlSpider):
name = 'linktext'
start_urls = ['https://example.org']
rules = [
Rule(LinkExtractor(), callback='parse_document'),
]
def parse_document(self, response):
return dict(
url=response.url,
link_text=response.meta['link_text'],
)
Which produces an output similar to:
2019-04-01 12:03:30 [scrapy.core.engine] INFO: Spider opened
2019-04-01 12:03:30 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-04-01 12:03:30 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-04-01 12:03:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.org> (referer: None)
2019-04-01 12:03:32 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.iana.org/domains/reserved> from <GET http://www.iana.org/domains/example>
2019-04-01 12:03:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.iana.org/domains/reserved> (referer: None)
2019-04-01 12:03:33 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.iana.org/domains/reserved>
{'url': 'https://www.iana.org/domains/reserved', 'link_text': 'More information...'}
2019-04-01 12:03:33 [scrapy.core.engine] INFO: Closing spider (finished)
I believe the best way to achieve that is not to use crawling rules, and instead use regular crawling, with your own parse_* methods handling all responses.
Then, when you yield a request that has parse_document as its callback, you can include the link text in the meta parameter of the request, and read it from response.meta in your parse_document method.
class MySpider(CrawlSpider):
    name = 'pdf'
    start_urls = ['http://www.someurl.com']
    allowed_domains = ['someurl.com']

    def parse(self, response):
        for link in response.css('a'):
            yield response.follow(
                link,
                callback=self.parse_document,
                meta={'link_text': link.xpath('text()').get()}
            )

    def parse_document(self, response):
        # …
        if content_type == "application/pdf":
            # …
            document = DocumunetPipeline()
            # …
            document['link_text'] = response.meta['link_text']
            yield document
I am submitting a FormRequest to change the page number across multiple pages of results.
When I use the Scrapy shell, the POST request goes through:
2017-05-21 22:44:19 [scrapy.core.engine] INFO: Spider opened
2017-05-21 22:44:20 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/robots.txt> (referer: None)
2017-05-21 22:44:22 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:27 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:39 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:43 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
2017-05-21 22:44:46 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
True
Using this request sequence:
>>> from scrapy.http import FormRequest
>>> url = 'http://www.australianschoolsdirectory.com.au/search-result.php'
>>> for i in range(1, 6):
... payload={'pageNum': str(i)}
... r = FormRequest(url, formdata=payload)
... fetch(r)
... view(response)
But when I implement the POST request in my Scrapy code, the POST goes back to the initial search page instead:
2017-05-21 22:58:42 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/robots.txt> (referer: None)
2017-05-21 22:58:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.australianschoolsdirectory.com.au/search-result.php> (referer: None)
2017-05-21 22:58:46 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://www.australianschoolsdirectory.com.au/search.php> (referer: http://www.australianschoolsdirectory.com.au/search-result.php)
Of course, search.php doesn't have the data I'm looking for. Why does this POST in my code get referred back to search.php when it doesn't in the shell? And how can I stop that while still moving on to the next set of results?
Scrapy code:
from scrapy.http import FormRequest
from scrapy.spiders import Spider


class Foo(Spider):
    name = "schoolsTest"
    allowed_domains = ["australianschoolsdirectory.com.au"]
    start_urls = ["http://www.australianschoolsdirectory.com.au/search-result.php"]

    def parse(self, response):
        yield FormRequest.from_response(response, formdata={'pageNum': str(5), 'search': 'true'}, callback=self.parse1)

    def parse1(self, response):
        print response.url
First of all, you don't need to use from_response here, since you are not dealing with a form: from_response takes the request URL from the page's own form action, which is most likely why your POST ends up at search.php. You can use Scrapy's start_requests method instead:
import scrapy


class Foo(scrapy.Spider):
    name = "schoolsTest"

    def start_requests(self):
        url = "http://www.australianschoolsdirectory.com.au/search-result.php"
        # Change 5 to 488 to parse all search result
        for i in range(1, 5):
            payload = {'pageNum': str(i)}
            yield scrapy.FormRequest(url, formdata=payload)

    def parse(self, response):
        # Extract all links from search page and make absolute urls
        links = response.xpath('//div[@class="listing-header"]/a/@href').extract()
        for link in links:
            full_url = response.urljoin(link)
            # Make a Request to each detail page
            yield scrapy.Request(full_url, callback=self.parse_detail)

    def parse_detail(self, response):
        print(response.url)
I'm trying to make my spider go over a list and scrape all the URLs it can find, following them, scraping some data, and then continuing with the next unscraped link. When I run the spider I can see that it returns to the starting page, but it tries to scrape the same page again and then just quits. Any code suggestions? I'm pretty new to Python.
import scrapy
import re
from production.items import ProductionItem, ListResidentialItem


class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
            request = scrapy.Request(item['listurl'], callback=self.parseBasicListingInfo)
            yield request

    def parseBasicListingInfo(item, response):
        item = ListResidentialItem()
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        return item
To clarify:
I'm passing [0] so it only takes the first link of the list,
but I want it to continue with the next unscraped link.
Output after running the spider:
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/robots.txt> (referer: None)
2016-07-18 12:11:20 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/list> (referer: None)
2016-07-18 12:11:21 [scrapy] DEBUG: Crawled (200) <GET http://www.domain.com/link1> (referer: http://www.domain.com/list)
2016-07-18 12:11:21 [scrapy] DEBUG: Scraped from <200 http://www.domain.com/link1>
{'title': [u'\rlink1\r']}
This should just work fine. Change the domain and xpath and see
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ProdItems(scrapy.Item):
    listurl = scrapy.Field()
    title = scrapy.Field()


class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["domain.com"]
    start_urls = [
        "http://domain.com/list"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            list_urls = sel.xpath('//a[@id="link101"]/@href').extract()
            for url in list_urls:
                item['listurl'] = url
                yield scrapy.Request(url, callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item['title'] = response.xpath('//span[@class="detail"]/text()').extract()
        yield item
This is the line that's causing your problem:
item['listurl'] = sel.xpath('//a[@id="link101"]/@href').extract()[0]
The "//" means "from the start of the document", so it scans from the very first tag and will always find the same first link. What you need to do is search relative to the start of the current tag using ".//", which means "from this tag onwards". Also, your current for loop visits every tag in the document, which is unnecessary. Try this:
def parse(self, response):
    for href in response.xpath('//a[@id="link101"]/@href').extract():
        item = ProductionItem()
        item['listurl'] = href
        yield scrapy.Request(href, callback=self.parseBasicListingInfo, meta={'item': item})
The xpath pulls the hrefs out of the links and returns them as a list you can iterate over.
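To see the "//" versus ".//" difference in isolation, here is a tiny self-contained example (my own, not from the answer) using Scrapy's Selector on an inline HTML snippet:

from scrapy.selector import Selector

html = ('<div><a id="link101" href="/one">first</a></div>'
        '<div><a id="link101" href="/two">second</a></div>')

for block in Selector(text=html).xpath('//div'):
    # "//" restarts from the document root, so both iterations print "/one".
    print(block.xpath('//a[@id="link101"]/@href').extract_first())
    # ".//" searches only inside the current <div>, so this prints "/one" then "/two".
    print(block.xpath('.//a[@id="link101"]/@href').extract_first())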