I am working through this book and tried to extract links with the crawler, but I don't know why nothing happens. I followed the code on pages 55 and 57, yet no links come out the way the author shows.
Here is the code:
File name: linkextractCrawler.py
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

# Create a list of words to ignore
class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        pass

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        return None

    # Index an individual page
    def addtoindex(self, url, soup):
        print 'Indexing %s' % url

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        return None

    # Separate the words by any non-whitespace character
    def separatewords(self, text):
        return None

    # Return true if this url is already indexed
    def isindexed(self, url):
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self, pages, depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

print("kk")

def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = set()
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except:
                print "Could not open %s" % page
                continue
            soup = BeautifulSoup(c.read())
            self.addtoindex(page, soup)

            links = soup('a')
            for link in links:
                if ('href' in dict(link.attrs)):
                    url = urljoin(page, link['href'])
                    if url.find("'") != -1: continue
                    url = url.split('#')[0]  # remove location portion
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
            self.dbcommit()
        pages = newpages
        print("kk")
        print(pages)
On console:
>>> import linkextractCrawler
>>> p = ['https://en.wikipedia.org/wiki/Perl.html']
>>> crawler=linkextractCrawler.crawler('')
>>> crawler.crawl(p)
>>>
I'm trying to solve an exercise: basically, I have to parse a JSON page and search for an object. If the object is not found, I have to search the next page for it. If the person I'm looking for is on the first page I pass the test, but I fail if they are on another page.
I checked, and each page is parsed correctly, but the return value is always undefined if the person is not on the first page.
This is my code:
import urllib.request
import json

class Solution:
    def __new__(self, character):
        url = 'https://challenges.hackajob.co/swapi/api/people/'
        numberOfFilms = 0
        #
        # Some work here; return type and arguments should be according to the problem's requirements
        #
        numberOfFilms = self.search(self, character, url)
        return numberOfFilms

    def search(self, character, url):
        numberOfFilms = 0
        found = False
        with urllib.request.urlopen(url) as response:
            data = response.read()
            jsonData = json.loads(data.decode('utf-8'))
            for r in jsonData['results']:
                if r['name'] == character:
                    return len(r['films'])
            if (jsonData['next']):
                nextPage = jsonData['next']
                self.search(self, character, nextPage)
Change the last line to return self.search(self, character, nextPage).
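For reference, here is a minimal sketch of how the class reads with that fix applied (same names and URL as in the question; only the recursive call changes, so the result from a later page is propagated back to the caller):

import urllib.request
import json

class Solution:
    def __new__(self, character):
        url = 'https://challenges.hackajob.co/swapi/api/people/'
        return self.search(self, character, url)

    def search(self, character, url):
        # Fetch one page of results and look for the character by name.
        with urllib.request.urlopen(url) as response:
            jsonData = json.loads(response.read().decode('utf-8'))
        for r in jsonData['results']:
            if r['name'] == character:
                return len(r['films'])
        # Not on this page: recurse into the next page and return its result;
        # without this return, the caller sees None for anyone beyond page one.
        if jsonData['next']:
            return self.search(self, character, jsonData['next'])
        return None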
I'm a newbie with Python, and therefore with Scrapy (a tool to crawl websites, written in Python) too; I hope someone can shed some light on my way... I just wrote a spider consisting of two parsing functions:
- the first parsing function parses the start page I'm crawling, which contains chapters and sub-chapters nested up to 7 levels, with some of the chapters at various levels pointing to articles or lists of articles
- the second parsing function parses the articles or lists of articles and is invoked as the callback of scrapy.Request(...)
The objective of this spider is to create a sort of big DOM of the entire content, with the chapters, sub-chapters, articles, and their content.
I have a problem in the second function, which sometimes seems to receive responses that do not correspond to the content located at the URL used when invoking scrapy.Request. The problem disappears when I set CONCURRENT_REQUESTS to 1. I initially thought this was due to some multi-threading / non-re-entrant function problem, but I found no re-entrancy issue and read afterwards that Scrapy is actually not multi-threaded... so I cannot figure out where my problem comes from.
Here is a snippet of my code:
#---------------------------------------------
# Init part:
#---------------------------------------------
import scrapy
from scrapy import signals
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from scrapy.exceptions import CloseSpider

top = Element('top')
curChild = top

class mytest(scrapy.Spider):
    name = 'lfb'
    #
    # This is what makes my code work, but I don't know why!!!
    # Ideally I would like to benefit from the speed of having several concurrent
    # requests when crawling & parsing
    #
    custom_settings = {
        'CONCURRENT_REQUESTS': '1',
    }

    #
    # This section is just here to be able to do something when the spider closes.
    # In this case I want to print the DOM I've created.
    #
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(mytest, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        print ("Spider closed - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # This is to print the DOM created at the end
        print tostring(top)

    def parse(self, response):
        pass

    def start_requests(self):
        level = 0
        print "Start parsing legifrance level set to %d" % level
        # This is to print the DOM, which is empty (or almost - just the top element in there)
        print tostring(top)
        yield scrapy.Request("<Home Page>", callback=self.parse)

    #----------------------------------------------
    # First parsing function - Parsing the Home page - this one works fine (I think)
    #----------------------------------------------
    def parse(self, response):
        for sel in response.xpath('//span'):
            cl = sel.xpath("@class").extract()
            desc = sel.xpath('text()').extract()
            #
            # Do some stuff here depending on the class (cl) of 'span', which corresponds
            # either to one of the 7 levels of chapters & sub-chapters or to a list of
            # articles attached to a sub-chapter. To simplify, I'm just putting here the
            # code corresponding to the handling of lists of articles (cl == codeLienArt)
            # ...
            # ...
            if cl == [unicode('codeLienArt')]:
                art_plink = sel.css('a::attr("href")').extract()
                artLink = "<Base URL>" + str(unicode(art_plink[0]))
                #
                # curChild points to the element in the DOM to which the list of articles
                # should be attached. Pass it in the request meta, in order for the second
                # parsing function to place the articles & their content at the right place
                # in the DOM
                #
                thisChild = curChild
                #
                # print for debug - thisChild.text contains the heading of the sub-chapter
                # to which the list of articles that will be processed by parse1 should be
                # attached.
                #
                print "follow link cl:%s art:%s for %s" % (cl, sel.xpath('a/text()').extract(), thisChild.text)
                #
                # Get the list of articles following artLink & pass the response to the second
                # parsing function (I know, it's called parse1 :-)
                #
                yield scrapy.Request(artLink, callback=self.parse1, meta={'element': thisChild})

    #-------------------
    # This is the second parsing function that parses lists of articles & their content.
    # The format is basically one or several articles, each being presented (simplified) as
    # <div class="Articles">
    #   <div class="titreArt"> Title here</div>
    #   <div class="corpsArt"> Sometimes some text and often a list of paragraphs <p>sentences</p></div>
    # </div>
    #-------------------
    def parse1(self, resp):
        print "enter parse1"
        numberOfArticles = 0
        for selArt in resp.xpath('//div[@class="article"]'):
            #
            # This is where I see the problem when CONCURRENT_REQUESTS > 1: sometimes
            # the response points to a page that is not the page that was requested in
            # the previous parsing function...
            #
            clArt = selArt.xpath('.//div[@class="titreArt"]/text()').extract()
            print clArt
            numberOfArticles += 1
            childArt = SubElement(resp.meta['element'], 'Article')
            childArt.text = str(unicode("%s" % clArt[0]))
            corpsArt = selArt.xpath('.//div[@class="corpsArt"]/text()').extract()
            print "corpsArt=%s" % corpsArt
            temp = ''
            for corpsItem in corpsArt:
                if corpsItem != '\n':
                    temp += corpsItem
            if temp != '':
                childCorps = SubElement(childArt, 'p')
                childCorps.text = temp
                print "corpsArt is not empty %s" % temp
            for paraArt in selArt.xpath('.//div[@class="corpsArt"]//p/text()').extract():
                childPara = SubElement(childArt, 'p')
                childPara.text = paraArt
                print "childPara.text=%s" % childPara.text
        print "link followed %s (%d)" % (resp.url, numberOfArticles)
        print "leave parse1"
        yield
As some of you may have gathered, I'm learning scrapy to scrape some data off of Google Scholar for a research project that I am running. I have a file that contains many article titles for which I am scraping citations. I read in the file using pandas, generate the URLs that need scraping, and start scraping.
One problem that I face is 503 errors. Google shuts me off fairly quickly, and many entries remain unscraped. This is a problem that I am working on using some middleware provided by Crawlera.
Another problem I face is that when I export my scraped data, I have a hard time matching the scraped data to what I was trying to look for. My input data is a CSV file with three fields -- 'Authors','Title','pid' where 'pid' is a unique identifier.
I use pandas to read in the file and generate URLs for scholar based off the title. Each time a given URL is scraped, my spider goes through the scholar webpage, and picks up the title, publication information and cites for each article listed on that page.
Here is how I generate the links for scraping:
class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    # generate a var to store links
    links = []
    # create the URLs to crawl
    for entry in queries:
        links.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = links
For example, one title from my data file could be the paper 'Elephants Don't Play Chess' by Rodney Brooks with 'pid' 5067. The spider goes to
http://scholar.google.com/scholar?q=allintitle%3Aelephants+don%27t+play+chess
Now on this page, there are six hits. The spider gets all six hits, but they need to be assigned the same 'pid'. I know I need to insert a line somewhere that reads something like item['pid'] = data.pid.apply("something") but I can't figure out exactly how I would do that.
Below is the rest of the code for my spider. I am sure the way to do this is pretty straightforward, but I can't think of how to get the spider to know which entry of data.pid it should look for if that makes sense.
def parse(self, response):
    # initialize something to hold the data
    items = []
    sel = Selector(response)
    # get each 'entry' on the page
    # an entry is a self contained div
    # that has the title, publication info
    # and cites
    entries = sel.xpath('//div[@class="gs_ri"]')
    # a counter for the entry that is being scraped
    count = 1
    for entry in entries:
        item = ScholarscrapeItem()
        # get the title
        title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
        # the title is messy
        # clean up
        item['title'] = "".join(title)
        # get publication info
        # clean up
        author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
        item['authors'] = "".join(author)
        # get the portion that contains citations
        cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
        # find the part that says "Cited by"
        match = re.search("Cited by \d+", str(cite_string))
        # if it exists, note the number
        if match:
            cites = re.search("\d+", match.group()).group()
        # if not, there is no citation info
        else:
            cites = None
        item['cites'] = cites
        item['entry'] = count
        # iterate the counter
        count += 1
        # append this item to the list
        items.append(item)
    return items
I hope this question is well-defined, but please let me know if I can be more clear. There is really not much else in my scraper except some lines at the top importing things.
Edit 1: Based on suggestions below, I have modified my code as follows:
# test-case: http://scholar.google.com/scholar?q=intitle%3Amigratory+birds
import re
from pandas import *
import urllib
from scrapy.spider import Spider
from scrapy.selector import Selector
from scholarscrape.items import ScholarscrapeItem

class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    pid = data.pid
    # generate a var to store links
    urls = []
    # create the URLs to crawl
    for entry in queries:
        urls.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = (
        (urls, pid),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            item['pid'] = response.meta['pid']
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
You need to populate your list start_urls with tuples (url, pid).
Now redefine the method make_requests_from_url(url):
class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print '!!!!!!!!!!!', pid, '!!!!!!!!!!!!'
        pass
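Applied to the spider in the question, building that start_urls list of (url, pid) pairs from the CSV could look something like the sketch below (the column names and file path are taken from the question; pairing each generated URL with its row's pid via zip is my assumption about how you want them matched):

import urllib
from pandas import read_csv

data = read_csv("../../data/master_jeea.csv")
queries = data.Title.apply(urllib.quote)

# Pair each generated Scholar URL with the pid of the row it came from,
# so make_requests_from_url can attach that pid to the request meta.
start_urls = [
    ("http://scholar.google.com/scholar?q=allintitle%3A" + title, pid)
    for title, pid in zip(queries, data.pid)
]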
I have written Python code that fetches the web page corresponding to a given URL and parses all the links on that page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links has been fetched.
Here is the code:
import BeautifulSoup
import urllib2
import itertools
import random

class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every links
        self.soup = BeautifulSoup.BeautifulSoup(html_code)
        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception:  # Magnificent exception handling
            pass

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
This code does not fetch internal links (only absolute, fully formed hyperlinks).
How can I fetch internal links that start with '/', '#', or '.'?
Well, your code kind of already tells you what's going on. In your lambda you are only grabbing absolute links that start with http:// (which means you are not grabbing https links, FWIW). You should grab all of the links and check whether they start with http or not. If they don't, they are relative links, and since you know what current_page is, you can use it to create an absolute link.
Here's a modification to your code. Excuse my Python as it's a little rusty, but I ran it and it worked in Python 2.7 for me. You'll want to clean it up and add some edge/error detection, but you get the gist:
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse

class Crawler(object):
    """docstring for Crawler"""

    def __init__(self):
        self.soup = None                              # Beautiful Soup object
        self.current_page = "http://www.python.org/"  # Current page's address
        self.links = set()                            # Queue with every links fetched
        self.visited_links = set()
        self.counter = 0                              # Simple counter for debug purpose

    def open(self):
        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every links
        self.soup = BeautifulSoup(html_code)
        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    page_links.append(link)
                    print "Adding link" + link + "\n"
                elif link.startswith('/'):
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"
        except Exception, ex:  # Magnificent exception handling
            print ex

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):
        # Crawl 3 webpages (or stop if all url has been fetched)
        while len(self.visited_links) < 3 or (self.visited_links == self.links):
            self.open()
        for link in self.links:
            print link

if __name__ == '__main__':
    C = Crawler()
    C.run()
Change the condition in the lambda:
page_links = itertools.ifilter(  # Deal with absolute and relative links
    lambda href: 'http://' in href or href.startswith('/') or href.startswith('#') or href.startswith('.'),
    (a.get('href') for a in self.soup.findAll('a')))
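Note that, unlike the modified code above, this only lets the relative hrefs through the filter; before urllib2 can open them they still have to be resolved against the page they were found on, for example with urlparse.urljoin. A small standalone sketch (the sample hrefs here are made up for illustration):

import urlparse

current_page = "http://www.python.org/"
hrefs = ['/about/', '#content', './download.html', 'http://docs.python.org/']

# Resolve every href against the page it was found on, so relative links
# like '/about/', '#content' or './download.html' become absolute URLs.
absolute_links = [urlparse.urljoin(current_page, href) for href in hrefs]
# ['http://www.python.org/about/', 'http://www.python.org/#content',
#  'http://www.python.org/download.html', 'http://docs.python.org/']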
Here's the problem:
Users register for a site and can pick one of 8 job categories, or choose to skip this step. I want to classify the users who've skipped that step into job categories, based on the domain name in their email address.
Current setup:
Using a combination of Beautiful Soup and nltk, I scrape the homepage and look for links to pages on the site that contain the word "about". I scrape that page, too. I've copied the bit of code that does the scraping at the end of this post.
The issue:
I'm not getting enough data to get a good learning routine in place. I'd like to know if my scraping algorithm is set up for success--in other words, are there any gaping holes in my logic, or any better way to ensure that I have a good chunk of text that describes what kind of work a company does?
The (relevant) code:
import bs4 as bs
import httplib2 as http
import nltk

# Only these characters are valid in a url
ALLOWED_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;="

class WebPage(object):
    def __init__(self, domain):
        """
        Constructor

        :param domain: URL to look at
        :type domain: str
        """
        self.url = 'http://www.' + domain
        try:
            self._get_homepage()
        except:  # Catch specific here?
            self.homepage = None
        try:
            self._get_about_us()
        except:
            self.about_us = None

    def _get_homepage(self):
        """
        Open the home page, looking for redirects
        """
        import re
        web = http.Http()
        response, pg = web.request(self.url)

        # Check for redirects:
        if int(response.get('content-length', 251)) < 250:
            new_url = re.findall(r'(https?://\S+)', pg)[0]
            if len(new_url):  # otherwise there's not much I can do...
                self.url = ''.join(x for x in new_url if x in ALLOWED_CHARS)
                response, pg = web.request(self.url)

        self.homepage = self._parse_html(nltk.clean_html(pg))
        self._raw_homepage = pg

    def _get_about_us(self):
        """
        Soup-ify the home page, find the "About us" page, and store its contents in a
        string
        """
        soup = bs.BeautifulSoup(self._raw_homepage)
        links = [x for x in soup.findAll('a') if x.get('href', None) is not None]
        about = [x.get('href') for x in links if 'about' in x.get('href', '').lower()]

        # need to find about or about-us
        about_us_page = None
        for a in about:
            bits = a.strip('/').split('/')
            if len(bits) == 1:
                about_us_page = bits[0]
            elif 'about' in bits[-1].lower():
                about_us_page = bits[-1]

        # otherwise assume shortest string is top-level about pg.
        if about_us_page is None and len(about):
            about_us_page = min(about, key=len)

        self.about_us = None
        if about_us_page is not None:
            self.about_us_url = self.url + '/' + about_us_page
            web = http.Http()
            response, pg = web.request(self.about_us_url)
            if int(response.get('content-length', 251)) > 250:
                self.about_us = self._parse_html(nltk.clean_html(pg))

    def _parse_html(self, raw_text):
        """
        Clean html coming from a web page. Gets rid of
            - all '\n' and '\r' characters
            - all zero length words
            - all unicode characters that aren't ascii (i.e., &...)
        """
        lines = [x.strip() for x in raw_text.splitlines()]
        all_text = ' '.join([x for x in lines if len(x)])  # zero length strings
        return [x for x in all_text.split(' ') if len(x) and x[0] != '&']
It is outside of what you are asking, but I would look at calling an external data source that has already collected this information. A good place to find such a service would be on the Programmable Web (for instance Mergent Company Fundamentals). Not all the data on Programmable Web is up-to-date but it seems like a lot of API providers are out there.