Currently our spider works off a list of hard-coded URLs; we would like to change that to work off just the main domain.
How can we change the code below to just expect the domain
https://www.example.com/shop/
If there is a good source with examples, that would be great.
def start_requests(self):
    urls = [
        # 'https://www.example.com/shop/outdoors-unknown-hart-creek-fleece-hoodie',
        'https://www.example.com/shop/adidas-unknown-essentials-cotton-fleece-3s-over-head-hoodie#repChildCatSku=111767466',
        'https://www.example.com/shop/unknown-metallic-long-sleeve-shirt#repChildCatSku=115673740',
        'https://www.example.com/shop/unknown-fleece-full-zip-hoodie#repChildCatSku=111121673',
        'https://www.example.com/shop/unknown-therma-fleece-training-hoodie#repChildCatSku=114784077',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/under-unknown-rival-fleece-full-zip-hoodie#repChildCatSku=115448841',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/adidas-unknown-essentials-3-stripe-fleece-sweatshirt#repChildCatSku=115001812',
        'https://www.example.com/shop/under-unknown-fleece-logo-hoodie#repChildCatSku=115305875',
        'https://www.example.com/shop/under-unknown-heatgear-long-sleeve-shirt#repChildCatSku=107534192',
        'https://www.example.com/shop/unknown-long-sleeve-legend-hoodie#repChildCatSku=112187421',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-sportswear-funnel-neck-hoodie-111112208#repChildCatSku=111112208',
        'https://www.example.com/shop/unknown-therma-swoosh-fleece-training-hoodie#repChildCatSku=114784481',
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
    page = response.url.split("/")[-1]
    filename = 'academy-%s.txt' % page
    res2 = response.xpath("//span[@itemprop='price']/text()|//span[@itemprop='sku']/text()").extract()
    res = '\n'.join(res2)
    with open(filename, 'w') as f:
        f.write(res)
    self.log('Saved file %s' % filename)
Just for pure traversing you can do:
class MySpider(scrapy.Spider):
    name = 'my'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/shop/']

    def parse(self, response):
        for link in response.css('a'):
            yield response.follow(link)
But this task on its own seems aimless. Can you give more detail about what you need from each page?
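If the idea is to start from https://www.example.com/shop/ and let the spider discover the product pages itself, a minimal sketch using CrawlSpider could look like the one below. The spider name and the allow pattern are assumptions; the extraction just reuses the XPath from the question.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ShopSpider(CrawlSpider):
    # Hypothetical name; adjust to your project.
    name = 'shop'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/shop/']

    rules = [
        # Follow every link under /shop/ and pass each response to parse_item.
        Rule(LinkExtractor(allow=r'/shop/'), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        # Same extraction as in the question, just driven by the crawl rules.
        page = response.url.split("/")[-1]
        filename = 'academy-%s.txt' % page
        res = '\n'.join(response.xpath(
            "//span[@itemprop='price']/text()|//span[@itemprop='sku']/text()").extract())
        with open(filename, 'w') as f:
            f.write(res)
        self.log('Saved file %s' % filename)

The CrawlSpider examples in the Scrapy documentation are a good source for tuning the allow/deny patterns.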
Hi, I would like to scrape 2 different domains in my script. I have tried my if statement, but it seems that it is not working. Any ideas, please?
Here's my code
class SalesitemSpiderSpider(scrapy.Spider):
    name = 'salesitem_spider'
    allowed_domains = ['www2.hm.com']
    start_urls = [
        'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999',
        'https://www.forever21.com/us/shop/catalog/category/f21/sale',
    ]

    def parse_start_url(response):
        if (response.url == 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'):
            parse_1(response)
        if (response.url == 'https://www.forever21.com/us/shop/catalog/category/f21/sale'):
            parse_2(response)

    def parse_1(self, response):
        for product_item in response.css('li.product-item'):
            item = {
                'title': product_item.css('h3.item-heading a.link::text').extract_first(),
                'regular-price': product_item.css('strong.item-price span.price.regular::text').extract_first(),
                'sale-price': product_item.css('strong.item-price span.price.sale::text').extract_first(),
                'photo-url': product_item.css('.image-container img::attr(data-src)').extract_first(),
                'description-url': "https://www2.hm.com/" + product_item.css('h3.item-heading a::attr(href)').extract_first(),
            }
            yield item

    def parse_2(self, response):
        # Some code getting items on domain 2
Please help, thank you.
Check your allowed_domains variable. You should add the new domain, like ['www2.hm.com', 'forever21.com'], or remove it altogether. Also, you have no parse function.
I would suggest removing your start_urls with the if statements and using start_requests instead. Your code will be more readable.
import scrapy

class SalesitemSpiderSpider(scrapy.Spider):
    name = 'salesitem_spider'
    allowed_domains = ['www2.hm.com', 'forever21.com']

    def start_requests(self):
        urls = (
            (self.parse_1, 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'),
            (self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
        )
        for cb, url in urls:
            yield scrapy.Request(url, callback=cb)

    def parse_1(self, response):
        print(111111111)

    def parse_2(self, response):
        print(2222222222)
I am new to Scrapy and Python. I am able to get details from a URL, and I want to enter each link and download all the files (.htm and .txt).
My Code
import scrapy

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_page)

    def parse_page(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield {"links": targetLink}
And I need to enter each link and download all the files that end with .htm and .txt. The code below is not working:
if link.endswith('.htm'):
    link = urlparse.urljoin(base_url, link)
    req = Request(link, callback=self.save_pdf)
    yield req

def save_pdf(self, response):
    path = response.url.split('/')[-1]
    with open(path, 'wb') as f:
        f.write(response.body)
Can anyone help me with this? Thanks in advance.
Try the following to get the files downloaded to your desktop or wherever you specify within the script:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"C:\Users\WCS\Desktop\Storage"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
To be clear: you need to explicitly specify dirf = r"C:\Users\WCS\Desktop\Storage", where C:\Users\WCS\Desktop (or wherever you like) is your desired location. The script will automatically create the Storage folder and save the files within it.
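As a side note, os.chdir changes the working directory for the whole process; a small variation of download_files (same assumed folder, using the os module already imported above) builds the full path instead:

    def download_files(self, response):
        # Assumed target folder, as in the answer above.
        dirf = r"C:\Users\WCS\Desktop\Storage"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        # Join folder and file name rather than changing the working directory.
        path = os.path.join(dirf, response.url.split('/')[-1])
        with open(path, 'wb') as f:
            f.write(response.body)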
Here's my code
import scrapy

class PvSpider(scrapy.Spider):
    name = 'pv'
    allowed_domains = ['www.piaov.com']
    start_urls = ['http://www.piaov.com/']

    def start_requests(self):
        yield scrapy.Request(url='http://www.piaov.com/list/7.html')

    def parse(self, response):
        names = response.xpath("//ul[@class='mlist']//li/a/@title").extract()
        on = response.meta.get("names", [])
        cmp_names = on + names
        for p in range(2, 7):
            yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
                                 meta={"names": cmp_names},
                                 callback=self.parse)
        yield scrapy.Request("http://www.piaov.com", meta={"names": cmp_names}, callback=self.parse_item)

    def parse_item(self, response):
        pass
When I debug my code in the parse_item function, response.meta["names"] only includes the first page's data (12 titles in this case). How can I get the data from all 6 pages in one list?
It's because you request the URL http://www.piaov.com, and Scrapy ignores duplicate URLs unless dont_filter=True is specified in the Request, like Request(url_here, dont_filter=True). Only the first of those requests goes through, so parse_item only ever sees the meta from the first page.
Also, I don't like the logic of your scraper; why are you calling parse_item at all? It is not necessary. Please see the code below and do it like that.
import scrapy

class PvSpider(scrapy.Spider):
    name = 'pv'
    allowed_domains = ['www.piaov.com']
    start_urls = ['http://www.piaov.com/']

    def start_requests(self):
        yield scrapy.Request(url='http://www.piaov.com/list/7.html')

    def parse(self, response):
        for name in response.xpath("//ul[@class='mlist']//li/a/@title").extract():
            yield {"name": name}
        for p in range(2, 7):
            yield scrapy.Request(url='http://www.piaov.com/list/7_{}.html'.format(p),
                                 callback=self.parse)
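If you really do need the titles from all six pages gathered into one list, one possible sketch (reusing the XPath above and a hypothetical spider name, not the original code) is to accumulate them on the spider and handle them in the closed hook instead of passing them through meta:

import scrapy

class PvNamesSpider(scrapy.Spider):
    # Hypothetical variant of the spider above.
    name = 'pv_names'
    allowed_domains = ['www.piaov.com']

    def start_requests(self):
        self.names = []
        yield scrapy.Request('http://www.piaov.com/list/7.html', callback=self.parse)
        for p in range(2, 7):
            yield scrapy.Request('http://www.piaov.com/list/7_{}.html'.format(p),
                                 callback=self.parse)

    def parse(self, response):
        # Collect the titles from every listing page into one list on the spider.
        self.names.extend(response.xpath("//ul[@class='mlist']//li/a/@title").extract())

    def closed(self, reason):
        # Called once after all requests have finished.
        self.log('Collected %d titles in total' % len(self.names))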
I have this code so far that extracts text from the page URLs using Scrapy:
class QuotesSpider(scrapy.Spider):
    name = "dialpad"

    def start_requests(self):
        urls = [
            'https://help.dialpad.com/hc/en-us/categories/201278063-User-Support',
            'https://www.domo.com/',
            'https://www.zenreach.com/',
            'https://www.trendkite.com/',
            'https://peloton.com/',
            'https://ting.com/',
            'https://www.cedar.com/',
            'https://tophat.com/',
            'https://www.bambora.com/en/ca/',
            'https://www.hoteltonight.com/'
        ]
        for url in urls:
            BASE_URL = url
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[2]
        filename = 'quotes-thing-{}.csv'.format(page)
        BASE_URL = response.url
        # with open(filename, 'wb') as f:
        #     f.write(response.body)
        # # with open(filename, 'r') as f:
        with open(filename, 'w') as f:
            for selector in response.css('body').xpath('.//text()'):
                selector = selector.extract()
                f.write(selector)
How can I also extract data from the links on those pages and write them to that filename that I create?
You could use CrawlSpider to extract each link and scrape them; your code could look like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class QuotesSpider(CrawlSpider):
    name = "dialpad"
    start_urls = [
        'https://help.dialpad.com/hc/en-us/categories/201278063-User-Support',
        'https://www.domo.com/',
        'https://www.zenreach.com/',
        'https://www.trendkite.com/',
        'https://peloton.com/',
        'https://ting.com/',
        'https://www.cedar.com/',
        'https://tophat.com/',
        'https://www.bambora.com/en/ca/',
        'https://www.hoteltonight.com/'
    ]

    rules = [
        Rule(
            LinkExtractor(
                allow=(r'url patterns here to follow'),
                deny=(r'other url patterns to deny'),
            ),
            callback='parse_item',
            follow=True,
        )
    ]

    def parse_item(self, response):
        page = response.url.split("/")[2]
        filename = 'quotes-thing-{}.csv'.format(page)
        with open(filename, 'w') as f:
            for selector in response.css('body').xpath('.//text()'):
                selector = selector.extract()
                f.write(selector)
However, I recommend creating a different spider for each website and using the allow and deny parameters to choose which links you want extracted from each website.
It would also be much better to use Scrapy Items; see the sketch below.
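For illustration, a minimal item for this kind of scrape might look like the following (the class and field names are assumptions, not part of the original code):

import scrapy

class PageTextItem(scrapy.Item):
    # Hypothetical item holding the text scraped from one page.
    url = scrapy.Field()
    text = scrapy.Field()

parse_item could then yield PageTextItem(url=response.url, text=...) and you could collect everything with a feed export (scrapy crawl dialpad -o pages.csv) instead of opening a file per page.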
I can't figure out how to make Scrapy crawl links in order.
I've got a page with articles, and each one has a title, but the article doesn't match the title.
Also in settings.py I added:
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
I've got something like this:
class Getgot(Spider):
    name = "getem"
    allowed_domains = ["somesite.us"]
    start_urls = ["file:local.html"]
    el = '//div[@article]'

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        s = hxs.select('//article')
        filename = ("links.txt")
        filly = open(filename, "w")
        for i in s:
            t = i.select('a/@href').extract()
            filly.write(str(t[0]) + '\n')
            yield Request(str(t[0]), callback=self.parse_page)

    def parse_page(self, res):
        hxs = HtmlXPathSelector(res)
        s = hxs.select('//iframe').extract()
        if s:
            filename = ("frames.txt")
            filly = open(filename, "a")
            filly.write(str(s[0]) + '\n')
        else:
            filename = ("/frames.txt")
            filly = open(filename, "a")
            filly.write('[]\n')
I'm not sure I understand how your question and your code are related. Where is the title?
A few tips: 1) update your Scrapy syntax to the latest version; 2) don't write any files from the spider, write them in a pipeline or use a feed export; 3) if you need to transfer data from one function to the next, use the meta attribute.
def parse(self, response):
    for link in response.xpath("//article/a/@href").extract():
        yield Request(link, callback=self.parse_page, meta={'link': link})

def parse_page(self, response):
    for frame in response.xpath("//iframe").extract():
        item = MyItem()
        item['link'] = response.meta['link']
        item['frame'] = frame
        yield item
And then you export it to csv or json or whatever, to store the link and the frame together.
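For completeness, a sketch of the MyItem referenced above (the field names are assumptions chosen to match the snippet):

import scrapy

class MyItem(scrapy.Item):
    # Pairs each article link with an iframe found on its page.
    link = scrapy.Field()
    frame = scrapy.Field()

With the items yielded from the spider, a feed export such as scrapy crawl getem -o frames.csv writes them out without any manual file handling in the spider.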