I can't figure out how to make Scrapy crawl links in order.
I've got a page with articles, and each one has a title, but the article scraped doesn't match its title.
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
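(Note: on recent Scrapy versions the queue module is named squeues, so the equivalent breadth-first settings would be:

DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

Even with these settings, Scrapy downloads concurrently, so they give an approximate breadth-first order, not a strict sequence.)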
I've got something like this:
class Getgot(Spider):
    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    el = '//div[@article]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        s = hxs.select('//article')
        filename = "links.txt"
        filly = open(filename, "w")
        for i in s:
            t = i.select('a/@href').extract()
            filly.write(str(t[0]) + '\n')
            yield Request(str(t[0]), callback=self.parse_page)

    def parse_page(self, res):
        hxs = HtmlXPathSelector(res)
        s = hxs.select('//iframe').extract()
        if s:
            filename = "frames.txt"
            filly = open(filename, "a")
            filly.write(str(s[0]) + '\n')
        else:
            filename = "/frames.txt"
            filly = open(filename, "a")
            filly.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title?
A few tips:
1) Update your Scrapy syntax to the latest version.
2) Don't write any files from the spider; do it in an item pipeline or with a feed export.
3) If you need to transfer data from one callback to the next, use the request's meta attribute.
def parse(self, response):
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
And then you export it to CSV or JSON, or whatever format you like, to store the link and the frame together.
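For instance, Scrapy's built-in feed exports need no extra code at all; assuming the spider above is named getem:

scrapy crawl getem -o items.csv

Each yielded item becomes one CSV row, so the link and the frame end up stored side by side.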
import scrapy

class rlgSpider(scrapy.Spider):
    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']

    def parse(self, response):
        data = {}
        offers = response.xpath('//div[@class = "col-3-3"]')
        for offer in offers:
            for item in offer.xpath('//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position ="relative"]/h2').extract()
                yield data
Here is what I did so far; it doesn't work well. It scrapes the URL and not the h2 tag. How do I get the h2 when it's nested inside so many divs?
In order to query within an element in Scrapy, you need to start your XPath with "."; otherwise you will be querying the whole response. This is the correct way of doing it:
def parse(self, response):
    offers = response.xpath('//div[@class = "col-3-3"]')
    for offer in offers:
        for item in offer.xpath('.//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            data['name'] = item.xpath('.//h2/text()').extract_first()
            yield data
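A quick standalone illustration of the difference, using only Scrapy's Selector:

from scrapy import Selector

sel = Selector(text='<div><p>first</p></div><div><p>second</p></div>')
for div in sel.xpath('//div'):
    div.xpath('//p/text()').get()   # always "first": an absolute path searches the whole document
    div.xpath('.//p/text()').get()  # "first", then "second": relative to the current div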
I want to extract the title and the PDF link of each paper from this page: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here:
class ICLRCrawler(Spider):
    name = "ICLRCrawler"
    allowed_domains = ["iclr.cc"]
    start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        titles = Selector(response).xpath('//*[@id="maincard_704"]/div[3]')
        links = Selector(response).xpath('//*[@id="maincard_704"]/div[6]/a[2]')
        for title, link in zip(titles, links):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = link.xpath('/@href').extract()[0]
            item['sup'] = ''
            yield item
However, these selectors are pinned to a single id (maincard_704), so it is not easy to get the title and link of every paper. How can I change the code to get the data for all of them?
You can use a much simpler approach:
def parse(self, response):
    for poster in response.xpath('//div[starts-with(@id, "maincard_")]'):
        item = PapercrawlerItem()
        item["title"] = poster.xpath('.//div[@class="maincardBody"]/text()[1]').get()
        item["pdf"] = poster.xpath('.//a[@title="PDF"]/@href').get()
        yield item
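A side benefit of .get() over extract()[0] is that it returns None instead of raising IndexError when a poster has no PDF link:

poster.xpath('.//a[@title="PDF"]/@href').get()         # None if the link is missing
poster.xpath('.//a[@title="PDF"]/@href').extract()[0]  # IndexError if the link is missing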
You have to replace extract()[0] with get_attribute('href') (note that get_attribute is a Selenium method; in Scrapy the equivalent is the /@href XPath with .get(), as shown above).
I am scraping a news website with the Scrapy framework, but it seems to store only the last item scraped, repeated through the loop.
I want to store the title, date, and link, which I scrape from the first page,
and also the whole news article. So I want to merge the article, which is stored as a list, into a single string.
Item code:
import scrapy

class ScrapedItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    source = scrapy.Field()
    date = scrapy.Field()
    paragraph = scrapy.Field()
Spider code:
import scrapy
from ..items import ScrapedItem

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        box_text = response.xpath("//ul/li/div[@class='ket']")
        items = ScrapedItem()
        for crawl in box_text:
            title = crawl.css("h1 a::text").extract()
            source = "https://investasi.kontan.co.id" + (crawl.css("h1 a::attr(href)").extract()[0])
            date = crawl.css("span.font-gray::text").extract()[0].replace("|", "")
            items['title'] = title
            items['source'] = source
            items['date'] = date
            yield scrapy.Request(url=source,
                                 callback=self.parseparagraph,
                                 meta={'item': items})

    def parseparagraph(self, response):
        items_old = response.meta['item']  # only the last item gets stored
        paragraph = response.xpath("//p/text()").extract()
        items_old['paragraph'] = paragraph  # want to merge into a single string
        yield items_old
I expect the date, title, and source to be updated through the loop,
and the article to be merged into a single string so it can be stored in MySQL.
I defined an empty dictionary inside the loop, so each iteration gets its own, and put those variables within it. Moreover, I've made some minor changes to your XPaths and CSS selectors to make them less error-prone. The script is working as desired now:
import scrapy

class CBNCSpider(scrapy.Spider):
    name = 'kontan'
    start_urls = [
        'https://investasi.kontan.co.id/rubrik/28/Emiten'
    ]

    def parse(self, response):
        for crawl in response.xpath("//*[@id='list-news']//*[@class='ket']"):
            d = {}
            d['title'] = crawl.css("h1 > a::text").get()
            d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
            d['date'] = crawl.css("span.font-gray::text").get().strip("|")
            yield scrapy.Request(
                url=d['source'],
                callback=self.parseparagraph,
                meta={'item': d}
            )

    def parseparagraph(self, response):
        items_old = response.meta['item']
        items_old['paragraph'] = response.xpath("//p/text()").getall()
        yield items_old
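Since the goal was to store the article as one string in MySQL, the paragraph list can be joined before yielding; a minimal tweak (the space separator is just an assumption, use whatever suits your data):

def parseparagraph(self, response):
    items_old = response.meta['item']
    # join the extracted <p> fragments into a single string
    items_old['paragraph'] = " ".join(
        p.strip() for p in response.xpath("//p/text()").getall()
    )
    yield items_old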
I am new to Scrapy and Python. I am able to get details from the URL; now I want to follow each link and download all the files (.htm and .txt).
My code:
import scrapy

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_page)

    def parse_page(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield {"links": targetLink}
I need to follow each link and download all the files that end with .htm and .txt. The code below is not working:
if link.endswith('.htm'):
    link = urlparse.urljoin(base_url, link)
    req = Request(link, callback=self.save_pdf)
    yield req

def save_pdf(self, response):
    path = response.url.split('/')[-1]
    with open(path, 'wb') as f:
        f.write(response.body)
Can anyone help me with this? Thanks in advance.
Try the following to get the files downloaded to your desktop, or wherever you specify within the script:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"C:\Users\WCS\Desktop\Storage"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
To be clearer: you need to specify dirf = r"C:\Users\WCS\Desktop\Storage" explicitly, where C:\Users\WCS\Desktop (or wherever you prefer) is your desired location. The script will automatically create the Storage folder to save those files within.
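One caveat: os.chdir changes the working directory for the whole process, which can be surprising when many callbacks run. A variant of download_files that builds the path explicitly instead (same assumed Storage location):

def download_files(self, response):
    dirf = r"C:\Users\WCS\Desktop\Storage"
    os.makedirs(dirf, exist_ok=True)  # create the folder if it doesn't exist yet
    # write to the full path rather than chdir-ing into the folder
    path = os.path.join(dirf, response.url.split('/')[-1])
    with open(path, 'wb') as f:
        f.write(response.body)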
I'm having an issue where I want to add output to a CSV file, but when populating the playerMins item the value does not start below its field name at row 2; it is placed in the next row in sequence instead. Can someone please tell me where my code is going wrong? Here it is:
class EspnSpider3(BaseSpider):
    name = "espn3.org"
    allowed_domains = ["espn3.org"]
    start_urls = [
        "http://scores.espn.go.com/nba/boxscore?gameId=310502004"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = EspnItem()
        rows = []
        playerName = []
        playerMins = []

        # player names
        p_names = hxs.select('(//table[@class="mod-data"][1]/tbody/tr)//a/text()').extract()
        for p_name in p_names:
            print p_name
            yield EspnItem(playerName=p_name)

        # minutes
        p_minutes = hxs.select('(//table[@class="mod-data"][1]/tbody/tr)/td[2]').extract()
        for p_minute in p_minutes:
            print p_minute
            yield EspnItem(playerMins=p_minute)
I was able to solve my issue after much googling and RTFM: Trying to Use an ItemExporter in Scrapy.
Here is my working code:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    player_names = hxs.select('(//table[@class="mod-data"][1]/tbody/tr)')
    for p_name in player_names:
        l = XPathItemLoader(item=EspnItem(), selector=p_name)
        l.add_xpath('playerName', 'td[1]/a/text()')
        l.add_xpath('playerMins', 'td[2]')
        yield l.load_item()
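Because each loaded item now carries both playerName and playerMins, a feed export lines them up in the same row; for example:

scrapy crawl espn3.org -o boxscore.csv

One item per table row means the two columns stay aligned instead of alternating.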