How can I scrape links on all my webpages? - python

I have this code so far that extracts text from the page URLs, using scrapy:
class QuotesSpider(scrapy.Spider):
    name = "dialpad"

    def start_requests(self):
        urls = [
            'https://help.dialpad.com/hc/en-us/categories/201278063-User-Support',
            'https://www.domo.com/',
            'https://www.zenreach.com/',
            'https://www.trendkite.com/',
            'https://peloton.com/',
            'https://ting.com/',
            'https://www.cedar.com/',
            'https://tophat.com/',
            'https://www.bambora.com/en/ca/',
            'https://www.hoteltonight.com/'
        ]
        for url in urls:
            BASE_URL = url
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[2]
        filename = 'quotes-thing-{}.csv'.format(page)
        BASE_URL = response.url
        # with open(filename, 'wb') as f:
        #     f.write(response.body)
        # # with open(filename, 'r') as f:
        with open(filename, 'w') as f:
            for selector in response.css('body').xpath('.//text()'):
                selector = selector.extract()
                f.write(selector)
How can I also extract data from the links on those pages and write it to the file that I create?

You could use CrawlSpider to extract each link and scrape it; your code could look like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class QuotesSpider(CrawlSpider):
    name = "dialpad"

    start_urls = [
        'https://help.dialpad.com/hc/en-us/categories/201278063-User-Support',
        'https://www.domo.com/',
        'https://www.zenreach.com/',
        'https://www.trendkite.com/',
        'https://peloton.com/',
        'https://ting.com/',
        'https://www.cedar.com/',
        'https://tophat.com/',
        'https://www.bambora.com/en/ca/',
        'https://www.hoteltonight.com/'
    ]

    rules = [
        Rule(
            LinkExtractor(
                allow=(r'url patterns here to follow'),
                deny=(r'other url patterns to deny'),
            ),
            callback='parse_item',
            follow=True,
        )
    ]

    def parse_item(self, response):
        page = response.url.split("/")[2]
        filename = 'quotes-thing-{}.csv'.format(page)
        with open(filename, 'w') as f:
            for selector in response.css('body').xpath('.//text()'):
                selector = selector.extract()
                f.write(selector)
Though I recommend creating a separate spider for each website, and using the allow and deny parameters to choose which links should be extracted on each one.
It would also be much better to use Scrapy Items.
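As a sketch of that suggestion, a minimal Item might look like this (the field names below are placeholders, not part of the original answer):

import scrapy

class PageTextItem(scrapy.Item):
    # hypothetical item; rename the fields to whatever you actually extract
    url = scrapy.Field()
    text = scrapy.Field()

parse_item would then yield PageTextItem(url=response.url, text=...) and leave the writing to a feed export or an item pipeline instead of opening files by hand.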

Related

Scrapy: How to store scraped data in different json files within one crawler run?

I'm using generic spiders with a list of multiple urls in the start_urls field.
Is it possible to export one json file for each URL?
As far as I know, it's only possible to set one path to one specific output file.
Any ideas on how to solve this will be rewarded!
EDIT: This is my spider class:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class MySpider(CrawlSpider):
    name = 'my_spider'
    start_urls = ['www.domain1.com', 'www.domain2.com', 'www.domain3.com']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DEPTH_LIMIT': '1',
        'FEED_URI': 'file:///C:/path/to/result.json',
    }
    rules = (
        Rule(LinkExtractor(allow=r"abc"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        all_text = response.xpath("//p/text()").getall()
        yield {
            "text": " ".join(all_text),
            "url": response.url,
        }
First option
You can save the items in the spider itself, as in the Scrapy tutorial, for example:
import scrapy
import json

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = "mydomain"
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        filename = DICT[response.url]
        with open(filename, 'w') as fp:
            json.dump({"content": response.body.decode("utf-8")}, fp)
The DICT variable is just for specifying the JSON filename but you can use the domain as the filename too.
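A minimal sketch of that alternative, assuming the host name is enough to tell the outputs apart (the helper name filename_for is just an illustration):

from urllib.parse import urlparse

def filename_for(url):
    # e.g. 'https://quotes.toscrape.com/page/1/' -> 'quotes.toscrape.com.json'
    return '{}.json'.format(urlparse(url).netloc)

Note that in the toscrape example both start URLs share a domain, so you would need to include part of the path to keep the two files distinct.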
Second option
You can try using process_item in pipelines.py as follows:
from scrapy.exporters import JsonItemExporter

class SaveJsonPipeline:
    def process_item(self, item, spider):
        filename = item['filename']
        del item['filename']
        JsonItemExporter(open(filename, "wb")).export_item(item)
        return item
item['filename'] stores the filename for each start_url. You need to set up items.py too, for example:
import scrapy

class MydomainItem(scrapy.Item):
    filename = scrapy.Field()
    content = scrapy.Field()
your spider:
import scrapy
from ..items import MydomainItem

DICT = {
    'https://quotes.toscrape.com/page/1/': 'domain1.json',
    'https://quotes.toscrape.com/page/2/': 'domain2.json',
}

class MydomainSpider(scrapy.Spider):
    name = 'mydomain'
    allowed_domains = ['mydomain.com']
    start_urls = [
        'https://quotes.toscrape.com/page/1/',
        'https://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        item = MydomainItem()
        item["filename"] = DICT[response.url]
        item["content"] = response.body.decode("utf-8")
        yield item
Before running, you need to add the pipeline to your settings:
ITEM_PIPELINES = {
    'myproject.pipelines.SaveJsonPipeline': 300,
}
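With the item, pipeline, and settings in place, running the crawl should produce one file per start URL (domain1.json and domain2.json, per the DICT mapping above):

scrapy crawl mydomain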

Scrapy to print results in real time rather than waiting for crawl to finish

Is it possible for Scrapy to print results in real time rather than waiting for the crawl to finish? I'm planning to crawl large sites and fear that if my VPN connection cuts out, the crawl effort will be wasted since it won't print any results.
I'm currently using a VPN with rotating user agents, and I know it's ideal to use rotating proxies instead of a VPN, but that will be a future script upgrade.
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

results = open('results.csv', 'w')

class TestSpider(CrawlSpider):
    name = "test"

    with open("domains.txt", "r") as d:
        allowed_domains = [url.strip() for url in d.readlines()]

    with open("urls.txt", "r") as f:
        start_urls = [url.strip() for url in f.readlines()]

    rules = (Rule(LinkExtractor(allow=('/'), deny=('9', '10')), follow=True, callback='parse_item'),)

    def parse_item(self, response):
        for pattern in ['Albert Einstein', 'Bob Marley']:
            result = re.findall(pattern, response.text)
            print(response.url, ">", pattern, '>', len(result), file=results)
Many thanks in advance.
Updates
The script from harada works perfectly without any changes apart from the save file. All I needed to do was make some modifications to the existing files, as shown below, for everything to work.
spider - defined items
import scrapy
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import TestItem

class TestSpider(CrawlSpider):
    name = "test"

    with open("domains.txt", "r") as d:
        allowed_domains = [url.strip() for url in d.readlines()]

    with open("urls.txt", "r") as f:
        start_urls = [url.strip() for url in f.readlines()]

    rules = (Rule(LinkExtractor(allow=('/'), deny=('9', '10')), follow=True, callback='parse_item'),)

    def parse_item(self, response):
        items = TestItem()
        for pattern in ['Albert Einstein', 'Bob Marley']:
            result = re.findall(pattern, response.text)
            url = response.url
            count = len(result)
            items['url'] = url
            items['pattern'] = pattern
            items['count'] = count
            yield items
items.py - added items as fields
import scrapy

class TestItem(scrapy.Item):
    url = scrapy.Field()
    pattern = scrapy.Field()
    count = scrapy.Field()
settings.py - uncommented ITEM_PIPELINES
ITEM_PIPELINES = {
    'test.pipelines.TestPipeline': 300,
}
You can add logic to your pipeline that saves the data you have at that point to a file. Add a counter to the pipeline as a variable, and when it reaches a certain threshold (say, every 1000 items yielded), write to a file. The code would look something like this; I tried to make it as general as possible.
class MyPipeline:
    def __init__(self):
        # variable that keeps track of the total number of items yielded
        self.total_count = 0
        self.data = []

    def process_item(self, item, spider):
        self.data.append(item)
        self.total_count += 1
        if self.total_count % 1000 == 0:
            # write to your file of choice....
            # I'm not sure how your data is stored throughout the crawling process.
            # If it's a variable of the pipeline like self.data,
            # then just write that to the file.
            with open("test.txt", "w") as myfile:
                myfile.write(f'{self.data}')
        return item
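A variation on the same idea, as a sketch: append each batch as JSON lines and clear the buffer, so earlier output survives even if the crawl dies mid-run. The filename results.jsonl and the batch size are assumptions, not part of the original answer.

import json

class JsonLinesFlushPipeline:
    """Hypothetical variant: flush buffered items to a JSON Lines file every N items."""

    def __init__(self):
        self.batch = []

    def process_item(self, item, spider):
        self.batch.append(dict(item))
        if len(self.batch) >= 1000:  # assumed batch size
            self._flush()
        return item

    def close_spider(self, spider):
        # write whatever is left when the crawl ends (or is stopped cleanly)
        self._flush()

    def _flush(self):
        # append mode, so earlier batches are never overwritten
        with open("results.jsonl", "a", encoding="utf-8") as f:  # assumed filename
            for row in self.batch:
                f.write(json.dumps(row) + "\n")
        self.batch = []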

How to traverse an entire domain instead of providing individual links

Currently our spider works off a list of hard-coded URLs; we would like to change that to just work off the main domain.
How can we change the code below to just expect the domain
https://www.example.com/shop/
If there is a good source with examples, that would be great.
def start_requests(self):
    urls = [
        # 'https://www.example.com/shop/outdoors-unknown-hart-creek-fleece-hoodie',
        'https://www.example.com/shop/adidas-unknown-essentials-cotton-fleece-3s-over-head-hoodie#repChildCatSku=111767466',
        'https://www.example.com/shop/unknown-metallic-long-sleeve-shirt#repChildCatSku=115673740',
        'https://www.example.com/shop/unknown-fleece-full-zip-hoodie#repChildCatSku=111121673',
        'https://www.example.com/shop/unknown-therma-fleece-training-hoodie#repChildCatSku=114784077',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/under-unknown-rival-fleece-full-zip-hoodie#repChildCatSku=115448841',
        'https://www.example.com/shop/under-unknown-rival-fleece-crew-sweater#repChildCatSku=114636980',
        'https://www.example.com/shop/adidas-unknown-essentials-3-stripe-fleece-sweatshirt#repChildCatSku=115001812',
        'https://www.example.com/shop/under-unknown-fleece-logo-hoodie#repChildCatSku=115305875',
        'https://www.example.com/shop/under-unknown-heatgear-long-sleeve-shirt#repChildCatSku=107534192',
        'https://www.example.com/shop/unknown-long-sleeve-legend-hoodie#repChildCatSku=112187421',
        'https://www.example.com/shop/unknown-element-1-2-zip-top#repChildCatSku=114794996',
        'https://www.example.com/shop/unknown-sportswear-funnel-neck-hoodie-111112208#repChildCatSku=111112208',
        'https://www.example.com/shop/unknown-therma-swoosh-fleece-training-hoodie#repChildCatSku=114784481',
    ]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
    page = response.url.split("/")[-1]
    filename = 'academy-%s.txt' % page
    res2 = response.xpath("//span[@itemprop='price']/text()|//span[@itemprop='sku']/text()").extract()
    res = '\n'.join(res2)
    with open(filename, 'w') as f:
        f.write(res)
    self.log('Saved file %s' % filename)
Just for pure traversal you can do:
class MySpider(scrapy.Spider):
    name = 'my'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/shop/']

    def parse(self, response):
        for link in response.css('a'):
            yield response.follow(link)
But this task seems meaningless. Can you detail your question?
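If the goal is to keep the original price/SKU extraction while traversing the whole /shop/ section, a CrawlSpider sketch might look like this (the allow pattern and the yielded field names are assumptions based on the question's URLs, not a confirmed solution):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ShopSpider(CrawlSpider):
    name = 'shop'
    allowed_domains = ['example.com']
    start_urls = ['https://www.example.com/shop/']

    # Follow only URLs under /shop/ (pattern is an assumption).
    rules = (
        Rule(LinkExtractor(allow=r'/shop/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Same extraction as the original parse(), but yielded as an item
        # instead of being written to a per-page file.
        yield {
            'url': response.url,
            'sku': response.xpath("//span[@itemprop='sku']/text()").get(),
            'price': response.xpath("//span[@itemprop='price']/text()").get(),
        }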

How can I open multiple links on a webpage and scrape their data?

I hope you guys are doing well with your health and R&D work.
import webbrowser
import scrapy
from urllib.request import urlopen
import re
from scrapy.selector import Selector

class QuotesSpider(scrapy.Spider):
    name = "forum"

    def start_requests(self):
        urls = ['https://tribune.com.pk/']  # 'https://www.siasat.pk/forum/content.php/', 'http://hamariweb.com/news/', 'https://www.urdupoint.com/pakistan/all-news/'
        for url in urls:
            website = urlopen(url)
            webbrowser.open(website)
            print("HELLO WORLD")
            html = website.read()
            all_links = re.findall('"((http|ftp)s?://.*?)"', html)
            for link in all_links:
                yield scrapy.Request(url=link, callback=self.parse)

    def parse(self, response):
        page = response.url.split('/')[-2]
        filename = '%s' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
I want to open a webpage that contains many other links; I want to open all of those and have Scrapy scrape all of those web pages. Please help me out.
Thanks in advance.
I have tried this with monsterindia.com, opening the page using Scrapy; that page contains multiple links. I scraped all the data from each respective link, and we can also do pagination. The following code may be useful.
import scrapy
from ..items import BotItem  # assumed import; BotItem is presumably defined in the project's items.py

class MonsterSpider(scrapy.Spider):
    name = 'monster'
    start_urls = ['http://jobsearch.monsterindia.com/searchresult.html?day=1&jbc=22']
    item = BotItem()
    count = 1

    def parse(self, response):
        for href in response.css('h2.seotitle > a::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url=url, callback=self.parse_details)

        next_page_url = response.css('ul.pager').xpath('//a[contains(text(), "Next")]/@althref').extract_first()
        print(next_page_url)
        if next_page_url:
            nextpage = response.css('ul.pager').xpath('//a[contains(text(), "Next")]/@onclick').extract_first()
            searchresult_num = nextpage.split("'")[1].strip()
            next_page_url = "http://jobsearch.monsterindia.com/searchresult.html?day=1&n=" + searchresult_num
            next_page_url = response.urljoin(next_page_url)
            print(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
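The parse_details callback referenced above is not shown in the original answer; a minimal placeholder to add inside MonsterSpider might look like this (field names and selectors are assumptions):

    def parse_details(self, response):
        # Hypothetical callback: yields the job page URL and its <title> text.
        yield {
            'url': response.url,
            'title': response.css('title::text').get(),
        }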

Using Regex to Select Index Page Only

Is there a regex to select the index page when crawling a specific website? I'm selecting certain pages but also need just the index page on top of those.
I can't seem to figure out the proper way to put it. Basically, I want to crawl the index page, contact page, about page, and advertise page to look for contact information.
Here is the code.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"
    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)), callback='parse_item', follow=True)
    ]
    ### r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

    start_urls = []
    allowed_domains = []

    with open('C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[0].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath("//meta[@name='description']/@content").extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
What you need to do is call the parse_item() callback from parse_start_url(); this way you would also parse the URLs coming from start_urls, which I am assuming are index pages:
class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
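If you still want a regex-based rule, the commented-out pattern from the question could be added to the allow list, for example (a sketch; keep in mind that CrawlSpider rules only apply to links extracted from pages that are crawled, not to the start_urls responses themselves, which is why overriding parse_start_url is the more reliable fix):

rules = [
    Rule(SgmlLinkExtractor(allow=(
        r'/contact', r'/about', r'/advertise',
        r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',  # index page pattern from the question
    )), callback='parse_item', follow=True)
]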
See also:
Scrapy CrawlSpider doesn't crawl the first landing page