I'm a newbie to Python and therefore to Scrapy (a tool for crawling websites, written in Python), so I hope someone can shed some light on my problem. I just wrote a spider consisting of two parsing functions:
- the first parsing function parses the start page I'm crawling, which contains chapters and sub-chapters nested up to 7 levels deep, with some chapters at various levels pointing to articles or lists of articles
- the second parsing function parses the articles or lists of articles and is invoked as the callback of scrapy.Request(...)
The objective of this spider is to build a sort of big DOM of the entire content, with the chapters, sub-chapters, articles and their content.
I have a problem in the second function, which sometimes seems to receive responses that do not correspond to the content located at the URL used when invoking scrapy.Request. The problem disappears when I set CONCURRENT_REQUESTS to 1. I initially thought this was due to some multi-threading / non-re-entrant function issue, but I found no re-entrancy problems, and I later read that Scrapy is actually not multi-threaded... so I cannot figure out where my problem comes from.
Here is a snippet of my code:
#---------------------------------------------
# Init part:
#---------------------------------------------
import scrapy
from scrapy import signals
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from scrapy.exceptions import CloseSpider

top = Element('top')
curChild = top

class mytest(scrapy.Spider):
    name = 'lfb'
    #
    # This is what makes my code work, but I don't know why!
    # Ideally I would like to benefit from the speed of having several concurrent
    # requests when crawling & parsing
    #
    custom_settings = {
        'CONCURRENT_REQUESTS': '1',
    }

    #
    # This section is just here to be able to do something when the spider closes.
    # In this case I want to print the DOM I've created.
    #
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(mytest, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        print ("Spider closed - !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # this is to print the DOM created at the end
        print tostring(top)

    def parse(self, response):
        pass

    def start_requests(self):
        level = 0
        print "Start parsing legifrance level set to %d" % level
        # This is to print the DOM, which is empty (or almost - just the top element in there)
        print tostring(top)
        yield scrapy.Request("<Home Page>", callback=self.parse)
    #----------------------------------------------
    # First parsing function - Parsing the Home page - this one works fine (I think)
    #----------------------------------------------
    def parse(self, response):
        for sel in response.xpath('//span'):
            cl = sel.xpath("@class").extract()
            desc = sel.xpath('text()').extract()
            #
            # Do some stuff here depending on the class (cl) of 'span', which corresponds
            # to either one of the 7 levels of chapters & sub-chapters or to a list of
            # articles attached to a sub-chapter. To simplify, I'm only showing the
            # code that handles lists of articles (cl == codeLienArt)
            # ...
            # ...
            if cl == [unicode('codeLienArt')]:
                art_plink = sel.css('a::attr("href")').extract()
                artLink = "<Base URL>" + str(unicode(art_plink[0]))
                #
                # curChild points to the element in the DOM to which the list of articles
                # should be attached. Pass it in the request meta, in order for the second
                # parsing function to place the articles & their content at the right place
                # in the DOM
                #
                thisChild = curChild
                #
                # print for debug - thisChild.text contains the heading of the sub-chapter
                # to which the list of articles that will be processed by parse1 should be
                # attached.
                #
                print "follow link cl:%s art:%s for %s" % (cl, sel.xpath('a/text()').extract(), thisChild.text)
                #
                # get the list of articles following artLink & pass the response to the
                # second parsing function (I know it's called parse1 :-)
                #
                yield scrapy.Request(artLink, callback=self.parse1, meta={'element': thisChild})
    #-------------------
    # This is the second parsing function that parses lists of articles & their content.
    # The format is basically one or several articles, each presented (simplified) as:
    # <div class="Articles">
    #   <div class="titreArt"> Title here</div>
    #   <div class="corpsArt"> Sometimes some text and often a list of paragraphs <p>sentences</p></div>
    # </div>
    #-------------------
    def parse1(self, resp):
        print "enter parse1"
        numberOfArticles = 0
        for selArt in resp.xpath('//div[@class="article"]'):
            #
            # This is where I see the problem when CONCURRENT_REQUESTS > 1: sometimes
            # the response points to a page that is not the page that was requested in
            # the previous parsing function...
            #
            clArt = selArt.xpath('.//div[@class="titreArt"]/text()').extract()
            print clArt
            numberOfArticles += 1
            childArt = SubElement(resp.meta['element'], 'Article')
            childArt.text = str(unicode("%s" % clArt[0]))
            corpsArt = selArt.xpath('.//div[@class="corpsArt"]/text()').extract()
            print "corpsArt=%s" % corpsArt
            temp = ''
            for corpsItem in corpsArt:
                if corpsItem != '\n':
                    temp += corpsItem
            if temp != '':
                childCorps = SubElement(childArt, 'p')
                childCorps.text = temp
                print "corpsArt is not empty %s" % temp
            for paraArt in selArt.xpath('.//div[@class="corpsArt"]//p/text()').extract():
                childPara = SubElement(childArt, 'p')
                childPara.text = paraArt
                print "childPara.text=%s" % childPara.text
        print "link followed %s (%d)" % (resp.url, numberOfArticles)
        print "leave parse1"
        yield
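A note on debugging this: one way to make any mismatch visible is to pass the requested URL along in the request meta (a purely hypothetical extra key, requested_url, used here only for illustration) and print it next to resp.url in the callback. This is a sketch, not part of the spider above:

# In parse(), remember which URL the element was requested with:
yield scrapy.Request(artLink, callback=self.parse1,
                     meta={'element': thisChild, 'requested_url': artLink})

# In parse1(), compare what was asked for with what actually arrived:
def parse1(self, resp):
    print("requested: %s" % resp.meta['requested_url'])
    print("received : %s" % resp.url)
    print("attach to: %s" % resp.meta['element'].text)
    # ... rest of parse1 unchanged ...

Meta is copied onto the response's request, so each callback sees the values set by the request that produced it, regardless of how many requests are in flight.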
I'm scraping the content of articles from a site like this where there is no 'Next' button to follow. ItemLoader is passed from parse_issue in the response.meta object as well as some additional data like section_name. Here is the function:
def parse_article(self, response):
    self.logger.info('Parse function called parse_article on {}'.format(response.url))
    acrobat = response.xpath('//div[@class="txt__lead"]/p[contains(text(), "Plik do pobrania w wersji (pdf) - wymagany Acrobat Reader")]')
    limiter = response.xpath('//p[@class="limiter"]')
    if not acrobat and not limiter:
        loader = ItemLoader(item=response.meta['periodical_item'].copy(), response=response)
        loader.add_value('section_name', response.meta['section_name'])
        loader.add_value('article_url', response.url)
        loader.add_xpath('article_authors', './/p[@class="l doc-author"]/b')
        loader.add_xpath('article_title', '//div[@class="cf txt "]//h1')
        loader.add_xpath('article_intro', '//div[@class="txt__lead"]//p')
        article_content = response.xpath('.//div[@class=" txt__rich-area"]//p').getall()
        # check for pagination
        next_page_url = response.xpath('//span[@class="pgr_nrs"]/span[contains(text(), 1)]/following-sibling::a[1]/@href').get()
        if next_page_url:
            # I'm not sure what should be here... Something like this: (???)
            yield response.follow(next_page_url, callback=self.parse_article, meta={
                'periodical_item': loader.load_item(),
                'article_content': article_content
            })
        else:
            loader.add_xpath('article_content', article_content)
            yield loader.load_item()
The problem is in the parse_article function: I don't know how to combine the content of the paragraphs from all the pages into one item. Does anybody know how to solve this?
Your parse_article looks good. If the issue is just adding the article_content to the loader, you only need to fetch it from response.meta.
I would update this line:
article_content = response.meta.get('article_content', []) + response.xpath('.//div[@class=" txt__rich-area"]//p').getall()
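Putting the two pieces together, a hedged sketch of the whole callback could look like the code below. It keeps the names from the question (periodical_item, section_name, article_content), drops the author/title/intro fields for brevity, and also forwards section_name in meta so follow-up pages can still read it; treat it as an illustration, not a tested fix.

# assumes: from scrapy.loader import ItemLoader
def parse_article(self, response):
    loader = ItemLoader(item=response.meta['periodical_item'].copy(), response=response)
    loader.add_value('section_name', response.meta.get('section_name'))
    loader.add_value('article_url', response.url)
    # paragraphs collected on earlier pages plus this page's paragraphs
    article_content = (response.meta.get('article_content', [])
                       + response.xpath('.//div[@class=" txt__rich-area"]//p').getall())
    next_page_url = response.xpath(
        '//span[@class="pgr_nrs"]/span[contains(text(), 1)]/following-sibling::a[1]/@href').get()
    if next_page_url:
        # not finished yet: carry the partial item and the growing content forward
        yield response.follow(next_page_url, callback=self.parse_article, meta={
            'periodical_item': loader.load_item(),
            'section_name': response.meta.get('section_name'),
            'article_content': article_content,
        })
    else:
        # last page: attach everything collected so far and emit a single item
        loader.add_value('article_content', article_content)
        yield loader.load_item()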
Just set the next-page URL and iterate over however many pages there are.
I noticed that this article had 4 pages, but some could have more.
They are simply distinguished by adding /2 or /3 to the end of the URL, e.g.
https://www.gosc.pl/doc/791526.Zaloz-zbroje/
https://www.gosc.pl/doc/791526.Zaloz-zbroje/2
https://www.gosc.pl/doc/791526.Zaloz-zbroje/3
I don't use scrapy, but when I need multiple pages I normally just iterate.
When you first scrape the page, find the maximum number of pages for that article. On that site, for example, it says 1/4, so you know you will need 4 pages in total.
url = "https://www.gosc.pl/doc/791526.Zaloz-zbroje/"
data_store = ""
for i in range(1, 5):
actual_url = "{}{}".format(url, I)
scrape_stuff = content_you_want
data_store += scrape_stuff
# format the collected data
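A more concrete, self-contained version of that loop might look like the sketch below. requests and parsel are my choices here (the answer does not name a library), and the paragraph XPath is copied from the question, so treat it as an illustration rather than verified code for that exact site:

import requests
from parsel import Selector

base_url = "https://www.gosc.pl/doc/791526.Zaloz-zbroje/"
paragraphs = []

for page in range(1, 5):  # the example article shows 1/4, i.e. four pages
    page_url = "{}{}".format(base_url, page)  # same /1 ... /4 pattern as above
    html = requests.get(page_url).text
    sel = Selector(text=html)
    # paragraph XPath copied from the question; adjust if the markup differs
    paragraphs += sel.xpath('//div[@class=" txt__rich-area"]//p//text()').getall()

print("\n".join(paragraphs))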
I'm trying to gather statistics about one web page. This page has categories and products on it. I don't download information about the products, I only count them.
The point is that I'm getting either a MemoryError or just a message like "Script ends with code -1073741819" (the number is exact).
I've tried to print the size of the variable category_urls after each loop, and it does not increase.
EDIT:
The memory error is raised when the category being counted is too big (about 60,000 URLs).
The main loop is simple:
for category in categories:
    count_category(category)
I suppose that after each iteration the memory should be released, but I can't see any release when I look at Task Manager -> Memory tab (python.exe); the memory consumption just gets higher and higher.
In case it helps to solve the problem:
def count_category(url):
    category_urls = list(get_category_urls(url))
    mLib.printToFile('database/count.txt', str(len(category_urls)))
    set_spracovanie_kategorie(url)  # This fnc just writes category url into text file

def get_category_urls(url):
    log('Getting category urls: {}'.format(url))
    urls = []
    next_url = url
    i = 1
    while next_url:
        root = load_root(next_url)
        urls.extend(get_products_on_page(root))
        for x in urls:
            if 'weballow' in x:
                yield x
        next_url = next_page(root, url)  # next_page is defined below
        # if next_url == False:
        #     return urls
        i += 1

def get_products_on_page(root):
    hrefs = root.xpath('//div[@id="product-contain"]//h2/a/@href')
    return hrefs
AND LXML LOADING FUNCTIONS:
class RedirectException(Exception):
    pass

def load_url(url):
    r = requests.get(url, allow_redirects=False)
    if r.status_code == 301:
        raise RedirectException
    html = r.text
    return html

def load_root(url):
    html = load_url(url)
    return etree.fromstring(html, etree.HTMLParser())
NEXT PAGE:
def next_page(root, url):
    next = root.xpath('//a[@class="next"]/@href')
    if len(next) > 0:
        return urljoin(url, next[0])
    return False
Could you give me any advice on what to do?
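One way to make the memory behaviour easier to reason about is to count lazily instead of materialising every URL in a list. The sketch below reuses the helpers from the question (load_root, get_products_on_page, next_page, mLib.printToFile, set_spracovanie_kategorie) unchanged and is only an illustration of the idea, not a tested fix:

def get_category_urls(url):
    # yield product URLs page by page instead of accumulating them in a list
    next_url = url
    while next_url:
        root = load_root(next_url)
        for x in get_products_on_page(root):
            if 'weballow' in x:
                yield x
        next_url = next_page(root, url)

def count_category(url):
    # consume the generator; only one page's URLs exist in memory at a time
    count = sum(1 for _ in get_category_urls(url))
    mLib.printToFile('database/count.txt', str(count))
    set_spracovanie_kategorie(url)  # writes the category url into a text file

Note that this also avoids re-yielding the same URLs on every pass of the while loop, which the original urls.extend(...) plus for x in urls pattern does.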
I am using this book and tried to download links using the crawler, but I don't know why nothing is happening. I followed the code on pages 55 and 57, but no links come out the way they do for the author.
Here is the code:
File name: linkextractCrawler.py
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

# Create a list of words to ignore
class crawler:
    # Initialize the crawler with the name of database
    def __init__(self, dbname):
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        pass

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self, table, field, value, createnew=True):
        return None

    # Index an individual page
    def addtoindex(self, url, soup):
        print 'Indexing %s' % url

    # Extract the text from an HTML page (no tags)
    def gettextonly(self, soup):
        return None

    # Separate the words by any non-whitespace character
    def separatewords(self, text):
        return None

    # Return true if this url is already indexed
    def isindexed(self, url):
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        pass

    # Starting with a list of pages, do a breadth
    # first search to the given depth, indexing pages
    # as we go
    def crawl(self, pages, depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

    ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
    print("kk");

    def crawl(self, pages, depth=2):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)
                links = soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1: continue
                        url = url.split('#')[0]  # remove location portion
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages
        print("kk");
        print(pages);
On console:
>>> import linkextractCrawler
>>> p = ['https://en.wikipedia.org/wiki/Perl.html']
>>> crawler=linkextractCrawler.crawler('')
>>> crawler.crawl(p)
>>>
As some of you may have gathered, I'm learning scrapy to scrape some data off of Google Scholar for a research project that I am running. I have a file that contains many article titles for which I am scraping citations. I read in the file using pandas, generate the URLs that need scraping, and start scraping.
One problem that I face is 503 errors. Google shuts me off fairly quickly, and many entries remain unscraped. This is a problem that I am working on using some middleware provided by Crawlera.
Another problem I face is that when I export my scraped data, I have a hard time matching the scraped data to what I was trying to look for. My input data is a CSV file with three fields -- 'Authors','Title','pid' where 'pid' is a unique identifier.
I use pandas to read in the file and generate URLs for scholar based off the title. Each time a given URL is scraped, my spider goes through the scholar webpage, and picks up the title, publication information and cites for each article listed on that page.
Here is how I generate the links for scraping:
class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    # generate a var to store links
    links = []
    # create the URLs to crawl
    for entry in queries:
        links.append("http://scholar.google.com/scholar?q=allintitle%3A"+entry)
    # give the URLs to scrapy
    start_urls = links
For example, one title from my data file could be the paper 'Elephants Don't Play Chess' by Rodney Brooks with 'pid' 5067. The spider goes to
http://scholar.google.com/scholar?q=allintitle%3Aelephants+don%27t+play+chess
Now on this page, there are six hits. The spider gets all six hits, but they need to be assigned the same 'pid'. I know I need to insert a line somewhere that reads something like item['pid'] = data.pid.apply("something") but I can't figure out exactly how I would do that.
Below is the rest of the code for my spider. I am sure the way to do this is pretty straightforward, but I can't think of how to get the spider to know which entry of data.pid it should look for, if that makes sense.
def parse(self, response):
    # initialize something to hold the data
    items = []
    sel = Selector(response)
    # get each 'entry' on the page
    # an entry is a self contained div
    # that has the title, publication info
    # and cites
    entries = sel.xpath('//div[@class="gs_ri"]')
    # a counter for the entry that is being scraped
    count = 1
    for entry in entries:
        item = ScholarscrapeItem()
        # get the title
        title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
        # the title is messy
        # clean up
        item['title'] = "".join(title)
        # get publication info
        # clean up
        author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
        item['authors'] = "".join(author)
        # get the portion that contains citations
        cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
        # find the part that says "Cited by"
        match = re.search("Cited by \d+", str(cite_string))
        # if it exists, note the number
        if match:
            cites = re.search("\d+", match.group()).group()
        # if not, there is no citation info
        else:
            cites = None
        item['cites'] = cites
        item['entry'] = count
        # iterate the counter
        count += 1
        # append this item to the list
        items.append(item)
    return items
I hope this question is well-defined, but please let me know if I can be more clear. There is really not much else in my scraper except some lines at the top importing things.
Edit 1: Based on suggestions below, I have modified my code as follows:
# test-case: http://scholar.google.com/scholar?q=intitle%3Amigratory+birds
import re
from pandas import *
import urllib
from scrapy.spider import Spider
from scrapy.selector import Selector
from scholarscrape.items import ScholarscrapeItem

class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]
    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    pid = data.pid
    # generate a var to store links
    urls = []
    # create the URLs to crawl
    for entry in queries:
        urls.append("http://scholar.google.com/scholar?q=allintitle%3A"+entry)
    # give the URLs to scrapy
    start_urls = (
        (urls, pid),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            item['pid'] = response.meta['pid']
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
You need to populate your list start_urls with tuples (url, pid).
Now redefine the method make_requests_from_url(url):
class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print '!!!!!!!!!!!', pid, '!!!!!!!!!!!!'
        pass
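For completeness, here is a hedged sketch of how those (url, pid) tuples could be built from the same dataframe used in the question, reusing data, urllib.quote and the allintitle query format from the question's own code; this part is my illustration, not taken from the answer:

# one (url, pid) tuple per title, so make_requests_from_url() can pass the pid along
queries = data.Title.apply(urllib.quote)
start_urls = [
    ("http://scholar.google.com/scholar?q=allintitle%3A" + title, pid)
    for title, pid in zip(queries, data.pid)
]

Scrapy then calls make_requests_from_url once per tuple, and every item produced in parse can read its own identifier back from response.meta['pid'].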
So I'm interested in this theory that if you go to a random Wikipedia article and repeatedly click the first link not inside parentheses, in 95% of cases you will end up on the article about Philosophy.
I wanted to write a script in Python that does the link fetching for me and in the end, print a nice list of which articles were visited (linkA -> linkB -> linkC) etc.
I managed to get the HTML DOM of the web pages, and managed to strip out some unnecessary links and the top description bar which leads to disambiguation pages. So far I have concluded that:
The DOM begins with the table which you see on the right on some pages, for example in Human. We want to ignore these links.
The valid link elements all have a <p> element somewhere as their ancestor (most often the parent or grandparent, if it's inside a <b> tag or similar). The top bar which leads to disambiguation pages does not seem to contain any <p> elements.
Invalid links contain some special words followed by a colon, e.g. Wikipedia:
So far, so good. But it's the parentheses that get me. In the article about Human for example, the first link not inside parentheses is "/wiki/Species", but the script finds "/wiki/Taxonomy" which is inside them.
I have no idea how to go about this programmatically, since I have to look for text in some combination of parent/child nodes which may not always be the same. Any ideas?
My code can be seen below, but it's something I made up really quickly and I'm not very proud of it. It's commented, however, so you can see my line of thought (I hope :) ).
"""Wikipedia fun"""
import urllib2
from xml.dom.minidom import parseString
import time
def validWikiArticleLinkString(href):
""" Takes a string and returns True if it contains the substring
'/wiki/' in the beginning and does not contain any of the
"special" wiki pages.
"""
return (href.find("/wiki/") == 0
and href.find("(disambiguation)") == -1
and href.find("File:") == -1
and href.find("Wikipedia:") == -1
and href.find("Portal:") == -1
and href.find("Special:") == -1
and href.find("Help:") == -1
and href.find("Template_talk:") == -1
and href.find("Template:") == -1
and href.find("Talk:") == -1
and href.find("Category:") == -1
and href.find("Bibcode") == -1
and href.find("Main_Page") == -1)
if __name__ == "__main__":
visited = [] # a list of visited links. used to avoid getting into loops
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')] # need headers for the api
currentPage = "Human" # the page to start with
while True:
infile = opener.open('http://en.wikipedia.org/w/index.php?title=%s&printable=yes' % currentPage)
html = infile.read() # retrieve the contents of the wiki page we are at
htmlDOM = parseString(html) # get the DOM of the parsed HTML
aTags = htmlDOM.getElementsByTagName("a") # find all <a> tags
for tag in aTags:
if "href" in tag.attributes.keys(): # see if we have the href attribute in the tag
href = tag.attributes["href"].value # get the value of the href attribute
if validWikiArticleLinkString(href): # if we have one of the link types we are looking for
# Now come the tricky parts. We want to look for links in the main content area only,
# and we want the first link not in parentheses.
# assume the link is valid.
invalid = False
# tables which appear to the right on the site appear first in the DOM, so we need to make sure
# we are not looking at a <a> tag somewhere inside a <table>.
pn = tag.parentNode
while pn is not None:
if str(pn).find("table at") >= 0:
invalid = True
break
else:
pn = pn.parentNode
if invalid: # go to next link
continue
# Next we look at the descriptive texts above the article, if any; e.g
# This article is about .... or For other uses, see ... (disambiguation).
# These kinds of links will lead into loops so we classify them as invalid.
# We notice that this text does not appear to be inside a <p> block, so
# we dismiss <a> tags which aren't inside any <p>.
pnode = tag.parentNode
while pnode is not None:
if str(pnode).find("p at") >= 0:
break
pnode = pnode.parentNode
# If we have reached the root node, which has parentNode None, we classify the
# link as invalid.
if pnode is None:
invalid = True
if invalid:
continue
###### this is where I got stuck:
# now we need to look if the link is inside parentheses. below is some junk
# for elem in tag.parentNode.childNodes:
# while elem.firstChild is not None:
# elem = elem.firstChid
# print elem.nodeValue
print href # this will be the next link
newLink = href[6:] # except for the /wiki/ part
break
# if we have been to this link before, break the loop
if newLink in visited:
print "Stuck in loop."
break
# or if we have reached Philosophy
elif newLink == "Philosophy":
print "Ended up in Philosophy."
break
else:
visited.append(currentPage) # mark this currentPage as visited
currentPage = newLink # make the the currentPage we found the new page to fetch
time.sleep(5) # sleep some to see results as debug
I found a Python script on GitHub (http://github.com/JensTimmerman/scripts/blob/master/philosophy.py) to play this game.
It uses BeautifulSoup for HTML parsing, and to cope with the parentheses issue it simply removes the text between brackets before parsing the links.
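The linked script's source isn't reproduced here, but the bracket-stripping idea itself can be sketched roughly as follows (my own illustration, operating on the raw HTML string and skipping parentheses that appear inside tags, e.g. in href values such as /wiki/Human_(disambiguation)):

def strip_parenthesized(html):
    """Drop everything that sits inside parentheses in the text, while
    ignoring parentheses that occur inside tags (e.g. in attribute values)."""
    out = []
    depth = 0      # current parenthesis nesting level in the visible text
    in_tag = False # True while we are between '<' and '>'
    for ch in html:
        if ch == '<':
            in_tag = True
        elif ch == '>':
            in_tag = False
        if not in_tag and ch == '(':
            depth += 1
        if depth == 0:
            out.append(ch)  # keep only characters outside any parentheses
        if not in_tag and ch == ')' and depth > 0:
            depth -= 1
    return ''.join(out)

For example, 'Human (<a href="/wiki/Taxonomy">taxonomy</a>) is a <a href="/wiki/Species">species</a>' becomes 'Human  is a <a href="/wiki/Species">species</a>', so if the article HTML is run through this before the existing link-extraction loop, the first surviving /wiki/ link is already the first link outside parentheses.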