I have been working on a small project which is a web-crawler template. I'm having an issue in PyCharm where I am getting the warning: Unresolved attribute reference 'domain' for class 'Scraper'.
from abc import abstractmethod
import requests
import tldextract
class Scraper:
    """Base class and registry for per-domain scrapers.

    Each subclass is expected to define a class attribute ``domain``;
    creating the subclass registers it in ``scrapers`` under that value.
    """

    # Maps a domain string (the subclass's ``domain``) to the scraper class.
    scrapers = {}

    def __init_subclass__(scraper_class, **kwargs):
        """Register the newly created subclass under its ``domain``."""
        super().__init_subclass__(**kwargs)
        Scraper.scrapers[scraper_class.domain] = scraper_class  # Unresolved attribute reference 'domain' for class 'Scraper'

    @classmethod
    def for_url(cls, url):
        """Return an instance of the scraper registered for *url*'s domain.

        Raises:
            KeyError: no scraper is registered for the extracted domain.
        """
        k = tldextract.extract(url)
        # The registry stores classes, so calling the entry builds an instance.
        return cls.scrapers[k.registered_domain](url)

    # NOTE: Scraper does not use ABCMeta, so this decorator marks intent only;
    # instantiating a subclass without scrape() is not rejected.
    @abstractmethod
    def scrape(self):
        pass
class BBCScraper(Scraper):
    """Scraper registered for the 'bbc.co.uk' domain."""

    domain = 'bbc.co.uk'

    def __init__(self, url):
        self.url = url

    def scrape(self):
        """Fetch ``self.url`` and return a label plus the first 20 characters."""
        # Annotate the local rather than chain-assign: the original
        # `rep = requests.Response = ...` rebound requests.Response itself.
        rep: requests.Response = requests.get(self.url)
        return "Scraped BBC News" + rep.text[:20]  # first 20 characters of the HTML
class SydsvenskanScraper(Scraper):
    """Scraper registered for the 'sydsvenskan.se' domain."""

    domain = 'sydsvenskan.se'

    def __init__(self, url):
        self.url = url

    def scrape(self):
        """Fetch ``self.url`` and return a label plus the first 20 characters."""
        # Annotate the local rather than chain-assign: the original
        # `rep = requests.Response = ...` rebound requests.Response itself.
        rep: requests.Response = requests.get(self.url)
        return "Scraped Sydsvenskan News" + rep.text[:20]  # first 20 characters of the HTML
if __name__ == "__main__":
    URLS = ['https://www.sydsvenskan.se/', 'https://www.bbc.co.uk/']
    # Pick the scraper registered for each URL's domain and print its result.
    for url in URLS:
        scraper = Scraper.for_url(url)
        print(scraper.scrape())
Of course I could ignore it, since the code works, but I do not like ignoring warnings: I believe PyCharm is smart, and I would rather resolve the warning than suppress it. What is the reason it warns me about this?
There are a few different levels on how you can remove this warning:
Assign a default value:
class Scraper:
    scrapers = {}
    domain = None  # Or a sensible value if one exists
You can, in addition or alternatively, annotate the type.
from typing import ClassVar
class Scraper:
    # The registry holds scraper classes (not instances), hence type[...].
    scrapers: ClassVar[dict[str, type['Scraper']]] = {}
    # Declared only; each subclass supplies the actual value.
    domain: ClassVar[str]
Note that ClassVar is required because otherwise it is assumed that they are instance attributes.
To ignore it, put
# noinspection PyUnresolvedReferences
on the line above the line causing the warning.
Just tell your Scraper class that this attribute exists:
class Scraper:
    scrapers = {}
    # Bare annotation: declares the attribute without assigning a value.
    domain: str

    def __init_subclass__(scraper_class):
        """Register each new subclass under its ``domain``."""
        Scraper.scrapers[scraper_class.domain] = scraper_class
Related
I want to access an object attribute variable in Robot Framework tests and then validate the values of these attributes.
I have the following Python class:
class TestingClass(object):
    """Holds OAuth client settings and records the results of its requests."""

    def __init__(self, client_id=None, redirect_uri=None, state=None):
        self.client_id = client_id
        # The parameter is named redirect_uri; the bare name `uri` was undefined.
        self.uri = redirect_uri
        self.state = state

    def url_authorize(self):
        """Build the OAuth session, then store and return the authorization URL."""
        self.oauthsession = OAuth2Session(client_id=self.client_id, uri=self.uri, state=self.state)
        self.authorize_endpoint_url = self.oauthsession.authorization_url(
            url="http://localhost:8080/authorize")[0]
        return self.authorize_endpoint_url

    def authorize(self):
        """GET http://localhost:8080 and keep parts of the response on ``self``."""
        request = requests.get("http://localhost:8080")
        self.status_code = request.status_code
        self.content = request.content
        # NOTE(review): this stores the `json` attribute without calling it —
        # confirm whether parsed data was intended.
        self.json = request.json
I want to be able to grab any one of the attributes created in the Authorize (authorize method) and validate it. So I want to do something like this:
*** Variables ***
${CLIENT_ID}    to
${URI}          http://127.0.0.1:8080/callback
${STATE}        random

*** Settings ***
Documentation    Tests Suite
Library          Utilities.TestingClass    ${CLIENT_ID}    ${URI}
...              ${STATE}

*** Keywords ***

*** Test Cases ***
Authorize Test 1
    [Documentation]    Test that the user is redirected to the authorize
    ...                endpoint
    ...                when accessing the OAuth Client
    Url Authorize
    Authorize
    Should Be Equal As Integers    Utilities.TestingClass.status_code    200
However this gives: Utilities.TestingClass.status_code != 200 which is no surprise to me.
How do I grab this attribute to compare it? Would I need to return all the attributes created by the authorize() method in a list, dictionary, or some other collection and then access them by index? Or is there a more straightforward way to do this with Robot Framework?
You have the following two choices, both will involve the usage of extended variable syntax. I would prefer the second option.
You can use the Get Library Instance keyword to get the library instance in the test. Then you can access its member variables using the extended variable syntax. Here is an example based on your code; I just replaced the return values with constants.
class TestingClass(object):
    """Stand-in library whose methods store constants instead of doing I/O."""

    def __init__(self, client_id=None, redirect_uri=None, state=None):
        self.client_id = client_id
        self.uri = redirect_uri
        self.state = state

    def url_authorize(self):
        """Store and return the fixed authorize endpoint URL."""
        self.oauthsession = None
        self.authorize_endpoint_url = "http://localhost:8080/authorize"
        return self.authorize_endpoint_url

    def authorize(self):
        """Record constant response values as instance attributes."""
        self.status_code = 200
        self.content = 'content'
        self.json = {"A": ["StringA1", "StringA2"], "B": ["StringB1", "StringB2"]}
*** Settings ***
Library    Utilities.TestingClass    to    http://127.0.0.1:8080/callback    random

*** Test Cases ***
Test
    Url Authorize
    Authorize
    ${TestingClass}=    Get Library Instance    Utilities.TestingClass
    Log    ${TestingClass.status_code}
    Log    ${TestingClass.content}
    Log    ${TestingClass.json}
The other option is to modify the authorize method, so it will return what requests.get("http://localhost:8080") returns. Then you could access the status code, content and JSON in the same way as above, using the extended variable syntax.
class DummyRequestReturnValue():
    """Constant stand-in for the object returned by the HTTP request."""

    def __init__(self):
        self.status_code = 200
        self.content = 'content'
        self.json = {"A": ["StringA1", "StringA2"], "B": ["StringB1", "StringB2"]}
class TestingClass(object):
    """Stand-in library whose ``authorize`` returns the response object."""

    def __init__(self, client_id=None, redirect_uri=None, state=None):
        self.client_id = client_id
        self.uri = redirect_uri
        self.state = state

    def url_authorize(self):
        """Store and return the fixed authorize endpoint URL."""
        self.oauthsession = None
        self.authorize_endpoint_url = "http://localhost:8080/authorize"
        return self.authorize_endpoint_url

    def authorize(self):
        """Return the response object so the caller can read its attributes."""
        request = DummyRequestReturnValue()
        return request
*** Settings ***
Library    Utilities.TestingClass    to    http://127.0.0.1:8080/callback    random

*** Test Cases ***
Test
    Url Authorize
    ${response}=    Authorize
    Log    ${response.status_code}
    Log    ${response.content}
    Log    ${response.json}
I am creating a Flask website and I want to display different logout links based on the current page, i.e.:
If we're on the home page and logged in, have this link be wrapped in h2 tags
If we're on a different page and logged in, have this link be wrapped in underline tags
If we're logged in, have this link wrapped in strong tags
This is what I have tried so far.
class HtmlLinks():
    """Base link object holding an HTML string."""

    # Class-level default; set_html() shadows it on the instance.
    html = ""

    def set_html(self, html):
        """Store *html* on the instance. Returns None."""
        self.html = html

    def get_html(self):
        """Return the current HTML string."""
        return self.html

    def render(self):
        """Print the current HTML string."""
        print(self.html)
class LogoutLink(HtmlLinks):
    """Plain, undecorated logout link."""

    def __init__(self):
        self.html = "Logout"
class LogoutLinkH2Decorator(HtmlLinks):
# Intended to wrap another link's HTML in <h2> tags.
def __init__(self, logout_link):
self.logout_link = logout_link
# NOTE(review): .format() is applied to the return value of set_html(),
# not to the template string passed into it.
self.set_html("<h2> {0} </h2>").format(self.logout_link.get_html())
def call(self, name, args):
# NOTE(review): this calls an attribute literally named `name`; the `name`
# argument itself is not used — confirm the intent.
self.logout_link.name(args[0])
class LogoutLinkUnderlineDecorator(HtmlLinks):
# Intended to wrap another link's HTML in <u> tags.
def __init__(self, logout_link):
self.logout_link = logout_link
# NOTE(review): .format() is applied to the return value of set_html(),
# not to the template string passed into it.
self.set_html("<u> {0} </u>").format(self.logout_link.get_html())
def call(self, name, args):
# NOTE(review): this calls an attribute literally named `name`; the `name`
# argument itself is not used — confirm the intent.
self.logout_link.name(args[0])
class LogoutLinkStrongDecorator(HtmlLinks):
# Intended to wrap another link's HTML in <strong> tags.
def __init__(self, logout_link):
self.logout_link = logout_link
# NOTE(review): .format() is applied to the return value of set_html(),
# not to the template string passed into it.
self.set_html("<strong> {0} </strong>").format(self.logout_link.get_html())
def call(self, name, args):
# NOTE(review): this calls an attribute literally named `name`; the `name`
# argument itself is not used — confirm the intent.
self.logout_link.name(args[0])
logout_link = LogoutLink()
is_logged_in = 0
in_home_page = 0

if is_logged_in:
    logout_link = LogoutLinkStrongDecorator(logout_link)

# Kept as a separate top-level test: with both flags at 0 the reported error
# can only come from the underline branch below.
if in_home_page:
    logout_link = LogoutLinkH2Decorator(logout_link)
else:
    logout_link = LogoutLinkUnderlineDecorator(logout_link)

logout_link.render()
I am getting an AttributeError:
AttributeError: 'NoneType' object has no attribute 'format'
What am I doing wrong, and how can I fix it? Please help.
So you have a few lines that looks like this:
self.set_html("<h2> {0} </h2>").format(self.logout_link.get_html())
You probably want them to look like:
self.set_html("<h2> {0} </h2>".format(self.logout_link.get_html()))
set_html returns nothing (None), but you are calling the format method on its return value. For example:
self.set_html("<strong> {0} </strong>").format(self.logout_link.get_html())
I am using pickle to save an object graph by dumping the root. When I load the root it has all the instance variables and connected object nodes. However I am saving all the nodes in a class variable of type dictionary. The class variable is full before being saved but after I unpickle the data it is empty.
Here is the class I am using:
class Page():
    """A crawled page together with the pages reachable from it."""

    # Class-level map of url -> Page, shared by every instance.
    __crawled = {}

    def __init__(self, title='', link='', relatedURLs=None):
        self.__title = title
        self.__link = link
        # A fresh list per instance; a `[]` default would be shared by all pages.
        self.__relatedURLs = [] if relatedURLs is None else relatedURLs
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, url):
        """Link the page at *url* to this page, downloading it on first sight."""
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            self.__related.append(newPage)
            self.__crawled[url] = newPage
        else:
            # Already downloaded: reuse the stored Page object.
            self.__related.append(self.__crawled[url])

    def crawlRelated(self):
        """Crawl every URL listed in this page's related URLs."""
        for link in self.__relatedURLs:
            self.crawl(link)
I save it like such:
# Pickle data is binary, so the file is opened in binary mode.
with open('medTwiceGraph.dat', 'wb') as outf:
    pickle.dump(root, outf)
and I load it like such:
def loadGraph(filename):  # returns root
    """Unpickle and return the object stored in *filename*.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    # Pickle data is binary, so the file is opened in binary mode.
    with open(filename, 'rb') as inf:
        return pickle.load(inf)
root = loadGraph('medTwiceGraph.dat')
All the data loads except for the class variable __crawled.
What am I doing wrong?
Python doesn't really pickle class objects. It simply saves their names and where to find them. From the documentation of pickle:
Similarly, classes are pickled by named reference, so the same
restrictions in the unpickling environment apply. Note that none of
the class’s code or data is pickled, so in the following example the
class attribute attr is not restored in the unpickling environment:
class Foo:
    attr = 'a class attr'

picklestring = pickle.dumps(Foo)
These restrictions are why picklable functions and classes must be
defined in the top level of a module.
Similarly, when class instances are pickled, their class’s code and
data are not pickled along with them. Only the instance data are
pickled. This is done on purpose, so you can fix bugs in a class or
add methods to the class and still load objects that were created with
an earlier version of the class. If you plan to have long-lived
objects that will see many versions of a class, it may be worthwhile
to put a version number in the objects so that suitable conversions
can be made by the class’s __setstate__() method.
In your example you could fix your problems changing __crawled to be an instance attribute or a global variable.
By default pickle will only use the contents of self.__dict__ and not use self.__class__.__dict__ which is what you think you want.
I say "what you think you want" because unpickling an instance should not mutate class-level state.
If you want to change this behavior then look at __getstate__ and __setstate__ in the docs
For anyone interested, what I did was make a superclass Graph which contained an instance variable __crawled and moved my crawling functions into Graph. Page now only contains attributes describing the page and its related pages. I pickle my instance of Graph which contains all my instances of Page. Here is my code.
from urllib import urlopen
#from bs4 import BeautifulSoup
import re
import pickle
###################CLASS GRAPH####################
class Graph(object):
    """Owns the root pages and the url -> Page map built while crawling."""

    def __init__(self, roots=None, crawled=None):
        # Fresh containers per instance; `[]` / `{}` defaults would be shared
        # by every Graph created without arguments.
        self.__roots = [] if roots is None else roots
        self.__crawled = {} if crawled is None else crawled

    @property
    def roots(self):
        return self.__roots

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, page, url):
        """Attach the page at *url* to *page*.related, downloading it if new."""
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            page.related.append(newPage)
            self.__crawled[url] = newPage
        else:
            # Already downloaded: reuse the stored Page object.
            page.related.append(self.__crawled[url])

    def crawlRelated(self, page):
        """Crawl every URL in *page*.relatedURLs."""
        for link in page.relatedURLs:
            self.crawl(page, link)

    def crawlAll(self, obj, limit=2, i=0):
        """Recursively crawl from *obj*, stopping once depth *i* exceeds *limit*."""
        # Single-argument print() produces the same output on Python 2 and 3.
        print('number of crawled pages: %d' % len(self.crawled))
        i += 1
        if i > limit:
            return
        for rel in obj.related:
            print('crawling %s' % (rel.title,))
            self.crawlRelated(rel)
        for rel2 in obj.related:
            self.crawlAll(rel2, limit, i)

    def loadGraph(self, filename):
        """Unpickle and return the object stored in *filename*.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(filename, 'rb') as inf:  # pickle data is binary
            return pickle.load(inf)

    def saveGraph(self, obj, filename):
        """Pickle *obj* into *filename*."""
        with open(filename, 'wb') as outf:  # pickle data is binary
            pickle.dump(obj, outf)
###################CLASS PAGE#####################
class Page(Graph):
    """Attributes describing one page and the pages related to it."""

    def __init__(self, title='', link='', relatedURLs=None):
        self.__title = title
        self.__link = link
        # A fresh list per instance; a `[]` default would be shared by all pages.
        self.__relatedURLs = [] if relatedURLs is None else relatedURLs
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        return self.__related
####################### MAIN ######################
def main(seed):
    """Download *seed*, crawl outward from it, save the graph and return it."""
    # Single-argument print() produces the same output on Python 2 and 3.
    print('doing some work...')
    webpage = urlopen(seed).read()
    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile('<li><a href="([^"]*)"')
    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)
    print('found the webpage %s' % (findPatTitle,))
    G = Graph([Page(findPatTitle, findPatLink, findPatRelated)])
    print('crawling related...')
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])
    print('now saving...')
    G.saveGraph(G, 'medTwiceGraph.dat')
    print('done')
    return G
#####################END MAIN######################
#'http://medtwice.com/am-i-pregnant/'
#'medTwiceGraph.dat'
#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled
def loadGraph(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    # Pickle data is binary, so the file is opened in binary mode.
    with open(filename, 'rb') as inf:
        return pickle.load(inf)
# Same spelling as the name used when saving; the original 'MedTwiceGraph.dat'
# is a different file on case-sensitive filesystems.
G = loadGraph('medTwiceGraph.dat')
print(G.roots[0].title)
print(G.roots[0].related)
print(G.crawled)
for key in G.crawled:
    print(G.crawled[key].title)
Using dill can solve this problem.
dill package: https://pypi.python.org/pypi/dill
reference: https://stackoverflow.com/a/28543378/6301132
Applied to the asker's code, it becomes:
#notice:open the file in binary require
#save
with open('medTwiceGraph.dat', 'wb') as outf:
    dill.dump(root, outf)
#load
def loadGraph(filename):  # returns root
    """Load and return the object stored in *filename* using dill."""
    with open(filename, 'rb') as inf:
        return dill.load(inf)
root = loadGraph('medTwiceGraph.dat')
I wrote another example:
#Another example (with Python 3.x)
import dill
import os
class Employee:
    """An employee record: a name plus a dict of contact details."""

    def __init__(self, name='', contact=None):
        self.name = name
        # A fresh dict per instance; a `{}` default would be shared by everyone.
        self.contact = {} if contact is None else contact

    def print_self(self):
        """Print the name and the contact dict."""
        print(self.name, self.contact)
#save
def save_employees():
    """Write the module-level ``emp`` list to employees.dat."""
    global emp
    with open('employees.dat', 'wb') as fh:
        dill.dump(emp, fh)
#load
def load_employees():
    """Replace the module-level ``emp`` with the saved list, if the file exists."""
    global emp
    if os.path.exists('employees.dat'):
        with open('employees.dat', 'rb') as fh:
            emp = dill.load(fh)
#---
emp = []
load_employees()
print('loaded:')
for tmpe in emp:
    tmpe.print_self()

# Add one more employee on every run; which one depends on how many were loaded.
e = Employee()  # new employee
if len(emp) == 0:
    e.name = 'Jack'
    e.contact = {'phone': '+086-12345678'}
elif len(emp) == 1:
    e.name = 'Jane'
    e.contact = {'phone': '+01-15555555', 'email': 'a@b.com'}
else:
    e.name = 'sb.'
    e.contact = {'telegram': 'x'}
emp.append(e)
save_employees()
I am trying to get my head around Pyramid traversal with this very simple example. What I haven't quite grasped yet is where to "inject" an Article object from the db.
As it is, /Article correctly finds and renders the article_view but that's fairly useless. How/when/where do I use the next part of the URL to query for a particular Article from the db? eg. /Article/5048230b2485d614ecec341d.
Any clues would be great!
init.py
from pyramid.config import Configurator
from pyramid.events import subscriber
from pyramid.events import NewRequest
import pymongo
from otk.resources import Root
def main(global_config, **settings):
    """ This function returns a WSGI application.
    """
    config = Configurator(settings=settings, root_factory=Root)
    config.add_static_view('static', 'otk:static')

    # MongoDB
    def add_mongo_db(event):
        """Expose the configured database on each new request as ``request.db``."""
        settings = event.request.registry.settings
        db_name = settings['mongodb.db_name']
        event.request.db = settings['mongodb_conn'][db_name]

    db_uri = settings['mongodb.url']
    MongoDB = pymongo.Connection
    if 'pyramid_debugtoolbar' in set(settings.values()):
        # Subclass that adds an __html__ representation of the connection.
        class MongoDB(pymongo.Connection):
            def __html__(self):
                return 'MongoDB: <b>{}></b>'.format(self)
    conn = MongoDB(db_uri)
    # Stored in the settings so add_mongo_db can find it on every request.
    config.registry.settings['mongodb_conn'] = conn
    config.add_subscriber(add_mongo_db, NewRequest)
    config.include('pyramid_jinja2')
    config.include('pyramid_debugtoolbar')
    config.scan('otk')
    return config.make_wsgi_app()
resources.py
class Root(object):
    """Traversal root: resolves the first URL path segment."""

    __name__ = None
    __parent__ = None

    def __init__(self, request):
        self.request = request

    def __getitem__(self, key):
        """Return an Article for the 'Article' segment.

        Raises:
            KeyError: for any other segment; the key is included in the error.
        """
        if key == 'Article':
            return Article(self.request)
        raise KeyError(key)
class Article:
    """Traversal resource returned by Root for the 'Article' segment."""

    __name__ = ''
    __parent__ = Root

    def __init__(self, request):
        # Was misspelled `self.reqeust`, leaving `self.request` undefined.
        self.request = request
        # so I guess in here I need to update the Article with
        # the document I get from the db. How?

    def __getitem__(self, key):
        """No child resources: every further path segment is rejected."""
        raise KeyError
views.py
from pyramid.view import view_config
from otk.resources import *
from pyramid.response import Response
@view_config(context=Root, renderer='templates/index.jinja2')
def index(request):
    """View for the Root context; returns the values passed to the template."""
    return {'project': 'OTK'}
@view_config(context=Article, renderer='templates/view/article.jinja2')
def article_view(context, request):
    """View for an Article context; currently passes nothing to the template."""
    # I end up with an instance of Article here as the context.. but
    # at the moment, the Article is empty
    return {}
You'd generally return a Article object from the id part of the URL traversal.
What happens with traversal is that for each element in the URL path, an object is looked up and made the new current object for the next path element lookup.
So for Article, the root object is asked for something matching that name, and the result of that lookup is made the new "current" object, and 5048230b2485d614ecec341d is then looked up on that new object.
So, what you are looking for is a dispatcher object, something that looks up articles based on the longer id you are passed, and that returns your Article instances:
class Root(object):
    """Traversal root: hands the 'articles' segment to an ArticleDispatcher."""

    __name__ = None
    __parent__ = None

    def __init__(self, request):
        self.request = request

    def __getitem__(self, key):
        """Return the dispatcher for 'articles'; raise KeyError(key) otherwise."""
        if key == 'articles':
            dispatch = ArticleDispatcher(self.request)
            # Record the resource's position in the tree.
            dispatch.__name__ = key
            dispatch.__parent__ = self
            return dispatch
        raise KeyError(key)
class ArticleDispatcher(object):
    """Looks up an Article by the id given in the next URL path segment."""

    __name__ = None
    __parent__ = None

    def __init__(self, request):
        self.request = request

    def __getitem__(self, key):
        """Return the Article stored under *key*; raise KeyError(key) if absent."""
        # Get a hold of the database here:
        db = findDatabase(self.request)
        if db.exists(key):
            data = db.load(key)
            art = Article(data)
            # Record the resource's position in the tree.
            art.__name__ = key
            art.__parent__ = self
            return art
        raise KeyError(key)
class Article(object):
    """Leaf resource carrying the data loaded for one article."""

    __name__ = None
    __parent__ = None

    def __init__(self, data):
        self.data = data
Note how I returned a ArticleDispatcher when you use the /articles URL path, and also how I set the __name__ and __parent__ variables; you'll need those to be able to generate URLs for those instances.
The Article object returned now contains the actual article data, and the view can access that information when rendering.
You really want to go and study the Pyramid Traversal tutorial which explains this all in more detail.
I'm not sure if this is effective or not. It works, but sometimes I feel... weird about it. Can you please tell me whether this is a good approach or not?
I threw the code on pastebin, because i think it's a bit too much to put here: http://pastebin.com/662TiQLq
EDIT
I edited the title to make it more objective.
I'm just guessing that the questioner is asking about creating a dictionary of functions in the __init__ method of the handlers, and then using this dict in the "get" method to look up specific functions. If this is the question, then IMHO a clearer approach would be to set up separate handlers for each different function. For example:
class QuotesView(webapp.RequestHandler):
    """Super class for quotes that can accommodate common functionality"""
    pass


class QuotesViewSingle(QuotesView):
    def get(self):
        pass  # ... show a single quote ...


class QuotesViewRandom(QuotesView):
    def get(self):
        pass  # ... show a random quote ...


class QuotesViewAll(QuotesView):
    def get(self):
        pass  # ... show all quotes ...
def main():
    """Build the WSGI application with one handler per quotes URL."""
    application = webapp.WSGIApplication([('/quote/new', NewQuote),
                                          (r'/quotes/single', QuotesViewSingle),
                                          (r'/quotes/all', QuotesViewAll),
                                          (r'/quotes/random', QuotesViewRandom),
                                          # ... further routes go here ...
                                          ('/', MainHandler)],
                                         debug=True)
BTW. A lot of people use the regex in the WSGIApplication calls to parse out arguments for the get functions. There's nothing particularly wrong with it. I'm not a big fan of that feature, and prefer to parse the arguments in the get functions. But that's just me.
For completeness here's the original code:
class Quote(db.Model):
    """Datastore model for a quote: its author and its text."""

    author = db.StringProperty()
    string = db.StringProperty()
class MainHandler(webapp.RequestHandler):
    """Renders quotery.html with all quotes and the current user."""

    def get(self):
        user = users.get_current_user()
        quotes = Quote.all()
        # The template sits next to this source file.
        path = os.path.join(os.path.dirname(__file__), 'quotery.html')
        template_values = {'quotes': quotes, 'user': user, 'login_url': users.create_login_url('/')}
        self.response.out.write(template.render(path, template_values))
class QuoteHandler(webapp.RequestHandler):
    """Dispatches /quote/<key> requests to the matching fetch action."""

    def __init__(self):
        self.actions = {'fetch': self.fetch, 'random': self.fetch_random}
        # Memcache the number of quotes in the datastore, to minimize datastore calls
        self.quote_count = memcache.get('quote_count')
        # Compare with None so a cached count of 0 is not recounted every time.
        if self.quote_count is None:
            self.quote_count = self.cache_quote_count()

    def cache_quote_count(self):
        """Count the quotes, cache the count for an hour and return it."""
        count = Quote.all().count()
        # set() overwrites an existing entry; add() would leave a stale count
        # in place after a new quote is stored.
        memcache.set(key='quote_count', value=count, time=3600)
        return count

    def get(self, key):
        """Run the action registered for *key*; unknown keys do nothing."""
        action = self.actions.get(key)
        if action is not None:
            action()

    def fetch(self):
        """Print every quote."""
        # Single-argument print() produces the same output on Python 2 and 3.
        for quote in Quote.all():
            print('Quote!')
            print('Author:  %s' % (quote.author,))
            print('String:  %s' % (quote.string,))
            print('')

    def fetch_random(self):
        """Return a list holding one quote at a random offset, or [] if none exist."""
        max_offset = self.quote_count - 1
        if max_offset < 0:
            # No quotes stored: random.randint(0, -1) would raise ValueError.
            return []
        random_offset = random.randint(0, max_offset)
        query = db.GqlQuery("SELECT * FROM Quote")
        return query.fetch(1, random_offset)
class NewQuote(webapp.RequestHandler):
    """Stores a quote submitted through the form, then redirects to '/'."""

    def post(self):
        author = self.request.get('quote_author')
        string = self.request.get('quote_string')
        # Both fields are required; otherwise nothing is stored.
        if not author or not string:
            return False
        quote = Quote()
        quote.author = author
        quote.string = string
        quote.put()
        # Recount so the cached quote count includes the new quote.
        QuoteHandler().cache_quote_count()
        self.redirect("/")
class QuotesView(webapp.RequestHandler):
    """Dispatches /quotes/<key> requests to the matching view method."""

    def __init__(self):
        self.actions = {'all': self.view_all, 'random': self.view_random, 'get': self.view_single}

    def get(self, key):
        """Run the view registered for *key*; anything else shows all quotes."""
        action = self.actions.get(key, self.view_all)
        action()

    def view_all(self):
        print('view all')

    def view_random(self):
        """Render base_view.html with the quote returned by fetch_random."""
        quotes = QuoteHandler().fetch_random()
        template_data = {}
        for quote in quotes:
            template_data['quote'] = quote
        template_path = os.path.join(os.path.dirname(__file__), 'base_view.html')
        self.response.out.write(template.render(template_path, template_data))

    def view_single(self):
        print('view single')
def main():
    """Build the WSGI application from the route table and run it."""
    routes = [
        ('/quote/new', NewQuote),
        (r'/quotes/(.*)', QuotesView),
        (r'/quote/(.*)', QuoteHandler),
        ('/', MainHandler),
    ]
    application = webapp.WSGIApplication(routes, debug=True)
    util.run_wsgi_app(application)