Trouble loading a function from a Python file - python

I am having trouble loading a function from a Python file that I created. The entire code in the Python file runs without any problems in a Jupyter Notebook. Now that I have put the code, 1:1, into a Python file, I get the error "cannot import name 'get' from 'connector'", where connector is my Python file and get is the function nested inside.
I suspect that moving the code from a Jupyter Notebook into a regular Python file changes the way the file is read somehow?
I wanted to move the code into a Python file, as this function is useful for multiple projects I am doing in Python when scraping webpages. I can call the ratelimit function, but I can't seem to figure out what is wrong with my get command.
# Imports
import scraping_class
import pandas as pd
import requests,os,time
logfile="trustpilot.txt"
Connector = scraping_class.Connector(logfile)
def ratelimit(x):
    "A function that handles the rate of your calls."
    time.sleep(x) # sleep x seconds.
class Connector():
    def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
        """This class implements a method for reliable connection to the internet and monitoring.
        It handles simple errors due to connection problems, and logs a range of information for basic quality assessments.

        Keyword arguments:
        logfile -- path to the logfile
        overwrite_log -- bool, defining if the logfile should be cleared (rarely the case).
        connector_type -- use the 'requests' module or 'selenium'. Behaves differently since the selenium webdriver does not have a similar response object when using the get method, and monitoring the behavior cannot be automated in the same way.
        session -- requests.session object. For defining custom headers and proxies.
        path2selenium -- str, sets the path to the geckodriver needed when using selenium.
        n_tries -- int, defines the number of retries the *get* method will try to avoid random connection errors.
        timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
        """
        ## Initialization function defining parameters.
        self.n_tries = n_tries # For avoiding trivial errors, e.g. connection errors, this defines how many times it will retry.
        self.timeout = timeout # Defining the maximum time to wait for a server to respond.
        ## not implemented here, if you use selenium.
        if connector_type=='selenium':
            assert path2selenium!='', "You need to specify the path to your geckodriver if you want to use Selenium"
            from selenium import webdriver
            ## HINT: download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases
            assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium, the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
            self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.
        self.connector_type = connector_type # set the connector_type
        if session: # set the custom session
            self.session = session
        else:
            self.session = requests.session()
        self.logfilename = logfile # set the logfile path
        ## define header for the logfile
        header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
        if os.path.isfile(logfile):
            if overwrite_log==True:
                self.log = open(logfile,'w')
                self.log.write(';'.join(header))
            else:
                self.log = open(logfile,'a')
        else:
            self.log = open(logfile,'w')
            self.log.write(';'.join(header))
        ## load log
        with open(logfile,'r') as f: # open file
            l = f.read().split('\n') # read and split file by newlines.
        ## set id
        if len(l)<=1:
            self.id = 0
        else:
            self.id = int(l[-1][0])+1
    def get(self,url,project_name):
        """Method for connecting reliably to the internet, with multiple tries and simple error handling, as well as a default logging function.
        Input the url and the project name for the log (i.e. is it part of mapping the domain, or is it part of the final stage in the data collection).

        Keyword arguments:
        url -- str, url
        project_name -- str, name used for analyzing the log. Use case could be 'Mapping of domain', 'Meta_data_collection', 'main data collection'.
        """
        project_name = project_name.replace(';','-') # make sure the default csv separator is not in the project_name.
        if self.connector_type=='requests': # Determine connector method.
            for _ in range(self.n_tries): # for loop defining number of retries with the requests method.
                ratelimit()
                t = time.time()
                try: # error handling
                    response = self.session.get(url,timeout = self.timeout) # make get call
                    err = '' # define python error variable as empty, assuming success.
                    success = True # define success variable
                    redirect_url = response.url # log current url, after potential redirects
                    dt = t - time.time() # define delta-time waiting for the server and downloading content.
                    size = len(response.text) # define variable for size of html content of the response.
                    response_code = response.status_code # log status code.
                    ## log...
                    call_id = self.id # get current unique identifier for the call
                    self.id+=1 # increment call id
                    #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
                    row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row to be written in the log.
                    self.log.write('\n'+';'.join(map(str,row))) # write log.
                    return response,call_id # return response and unique identifier.
                except Exception as e: # define error condition
                    err = str(e) # python error
                    response_code = '' # blank response code
                    success = False # call success = False
                    size = 0 # content is empty.
                    redirect_url = '' # redirect url empty
                    dt = t - time.time() # define delta t
                    ## log...
                    call_id = self.id # define unique identifier
                    self.id+=1 # increment call_id
                    row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
                    self.log.write('\n'+';'.join(map(str,row))) # write row to log.
        else:
            t = time.time()
            ratelimit()
            self.browser.get(url) # use selenium get method
            ## log
            call_id = self.id # define unique identifier for the call.
            self.id+=1 # increment the call_id
            err = '' # blank error message
            success = '' # success blank
            redirect_url = self.browser.current_url # redirect url.
            dt = t - time.time() # get time for get method ... NOTE: not necessarily the complete load time.
            size = len(self.browser.page_source) # get size of content ... NOTE: not necessarily correct, since selenium works in the background and could still be loading.
            response_code = '' # empty response code.
            row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err] # define row
            self.log.write('\n'+';'.join(map(str,row))) # write row to log file.
            # Using selenium it will not return a response object; instead you should call the browser object of the connector.
            ## connector.browser.page_source will give you the html.
            return call_id
logfile="trustpilot.txt" ## name your log file.
connector = Connector(logfile)
I expected to be able to load my 'get' function from 'connector.py'

The following line may be the issue, due to a naming conflict between your Connector variable and your Connector class:
Connector = scraping_class.Connector(logfile)
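For example, a minimal sketch of the fix (assuming the class definition lives in connector.py and the module-level instance is removed or renamed so it no longer shadows the class; the file name some_scraper.py is just illustrative). The get method is then reached through an instance of the class rather than imported as a top-level function:
# connector.py -- keep only the class definition at module level;
# drop or rename the "Connector = scraping_class.Connector(logfile)" line
# so it no longer shadows the class defined below.

# some_scraper.py -- import the class, not a top-level get function
from connector import Connector

connector = Connector("trustpilot.txt")
response, call_id = connector.get("https://example.com", "my_project")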

Related

Python code not working in Lambda. Gives an invalid parameter error

I have the code below that powers a DB cluster on or off based on its tags. I use this code with a Lambda function. The code is meant to run on a schedule I have defined in CloudFormation. However, although the Lambda function is invoked, it does not power the DB cluster off or on.
It fails each time.
tag_name = "AutoPower"
tag_value = "true"
#Get configuration variable
identifier = os.environ.get("db_identifier")
#Clients
rds = boto3.client("rds")
#Function to start the cluster.
def start(identifier):
rds.start_db_cluster(
DBClusterIdentifier=identifier
)
#Function to stop the cluster.
def stop(identifier):
rds.stop_db_cluster(
DBClusterIdentifier=identifier
)
def handler(event, context):
#Call AWS' "describe_db_clusters"; retrieve specific cluster info.
resp = rds.describe_db_clusters(
DBClusterIdentifier=identifier
)
# Isolate the one entry in the 'array' (with one result) we want.
raw = resp["DBClusters"][0]
# Pull tag info out of the dict above.
tag_info = raw["TagList"]
for tag in tag_info:
# If tag is 'AutoPower'
if tag["Key"] == tag_name:
# and Value is 'true'
if tag["Value"] == tag_value:
status = raw["Status"]
# and the DB is running
if (status == 'available'):
# Stop the DB
stop(identifier)
# and the DB is off
elif (status == 'stopped'):
# Start the DB
start(identifier)
I run the code using a Lambda function. However, each time I run it the Lambda function fails with the message I have posted below. Can anyone see what the issue is with the code?
{
  "errorMessage": "Parameter validation failed:\nInvalid type for parameter DBClusterIdentifier, value: None, type: <class 'NoneType'>, valid types: <class 'str'>",
  "errorType": "ParamValidationError",
  "requestId": "62767449-e633-4594-b5df-0086e385ebeb",
  "stackTrace": [
    "  File \"/var/task/index.py\", line 34, in handler\n    resp = rds.describe_db_clusters(\n",
Based on the comments above, you need identifier to have a value when describe_db_clusters is called. Right now it's set as a global, but since handler is being used as a callback, the global context likely isn't carried in.
Perhaps try determining the value inside handler. This may, or may not, work depending on the environment that handler is being run in:
def handler(event, context):
    # Call AWS' "describe_db_clusters"; retrieve specific cluster info.
    identifier = os.environ.get("db_identifier")
    resp = rds.describe_db_clusters(
        DBClusterIdentifier=identifier
    )
    ...
or you could try specifying identifier as part of the args for handler:
def handler(event, context, identifier):
....
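As a small defensive sketch (assuming db_identifier is the environment variable you intend to configure on the function), reading the value inside the handler and failing with a clear message when it is missing makes the problem easier to diagnose than the ParamValidationError above:
import os
import boto3

rds = boto3.client("rds")

def handler(event, context):
    # Read the identifier at invocation time and fail loudly if the
    # environment variable was never configured on the Lambda function.
    identifier = os.environ.get("db_identifier")
    if not identifier:
        raise ValueError("Environment variable 'db_identifier' is not set")
    resp = rds.describe_db_clusters(DBClusterIdentifier=identifier)
    # ... continue with the tag/status logic from the question ...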

pubsub publisher retry settings failing with: TypeError: All attributes being published to Pub/Sub must be sent as text strings

I have a Cloud Function which errors when using the Pub/Sub publisher retry settings on the publisher client or publish requests.
https://cloud.google.com/pubsub/docs/samples/pubsub-publisher-retry-settings#pubsub_publisher_retry_settings-python
When I run my code in JupyterLab the Python code runs successfully, but as soon as I move the code to Cloud Functions I get a TypeError: All attributes being published to Pub/Sub must be sent as text strings.
I have now tried a new, simple Cloud Function, copying the code directly from the example in the link above, but I still get the same error; any suggestions are much appreciated.
from google.cloud import pubsub_v1
from google import api_core

# TODO(developer)
GCP_PROJECT_ID='test_project'
SIT_EVENT_TOPIC = ('test_project1')
topic_id=SIT_EVENT_TOPIC
project_id=GCP_PROJECT_ID
# project_id = "your-project-id"
# topic_id = "your-topic-id"

# Configure the retry settings. Defaults shown in comments are values applied
# by the library by default, instead of default values in the Retry object.
def test():
    custom_retry = api_core.retry.Retry(
        initial=0.250,  # seconds (default: 0.1)
        maximum=90.0,  # seconds (default: 60.0)
        multiplier=1.45,  # default: 1.3
        deadline=300.0,  # seconds (default: 60.0)
        predicate=api_core.retry.if_exception_type(
            api_core.exceptions.Aborted,
            api_core.exceptions.DeadlineExceeded,
            api_core.exceptions.InternalServerError,
            api_core.exceptions.ResourceExhausted,
            api_core.exceptions.ServiceUnavailable,
            api_core.exceptions.Unknown,
            api_core.exceptions.Cancelled,
        ),
    )
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)
    # for n in range(1, 10):
    #     data = "Message number {}".format(n)
    data = "test message"
    # Data must be a bytestring
    data = data.encode("utf-8")
    print(type(data))
    future = publisher.publish(topic=topic_path, data=data, retry=custom_retry)
    print(future.result())
    print(f"Published messages with retry settings to {topic_path}.")

def main(event, context):
    """
    Call the main function, sets the order in which to run functions.
    """
    test()
    return 'Script has run without errors !!'

if (__name__ == "__main__"):
    main()
output
..., line 40, in test
    future = publisher.publish(topic=topic_path, data=data, retry=custom_retry)
  File "/layers/google.python.pip/pip/lib/python3.8/site-packages/google/cloud/pubsub_v1/publisher/client.py", line 195, in publish
    raise TypeError(
TypeError: All attributes being published to Pub/Sub must be sent as text strings.
This issue was caused by running outdated python packages in the requirements.txt file.
 
Original requirements list
google-cloud-pubsub==0.34.0
google-cloud-storage==1.13.1
google-cloud-bigquery==1.8.1
google-cloud-core==0.29.1
ndjson==0.3.1
 
New requirements list
google-cloud-pubsub==2.2.0
google-cloud-core==1.5.0
google-api-core==1.24.1
google-cloud-storage==1.35.0
google-cloud-bigquery==2.6.2
ndjson==0.3.1
Updating the list allowed the Cloud Function to run. Another issue I found with the example code is that you need to convert the data to a string; I used the following prior to publishing:
data=str(data)
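For reference, with the newer google-cloud-pubsub versions listed above, the message payload is passed as bytes while any additional attributes must be text strings; a minimal sketch (the project ID, topic ID, and the origin attribute are placeholders):
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path("your-project-id", "your-topic-id")

data = "test message".encode("utf-8")  # the payload itself must be a bytestring
future = publisher.publish(
    topic_path,
    data,
    origin="cloud-function",  # attribute values must be text strings
)
print(future.result())  # message ID once the publish succeeds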

How to convert suds object to xml string

This is a duplicate of this question:
How to convert suds object to xml
But the question has not been answered: "totxt" is not an attribute on the Client class.
Unfortunately I lack the reputation to add comments. So I ask again:
Is there a way to convert a suds object to its xml?
I ask this because I already have a system that consumes wsdl files and sends data to a webservice. But now the customers want to alternatively store the XML as files (to import them later manually). So all I need are 2 methods for writing data: One writes to a webservice (implemented and tested), the other (not implemented yet) writes to files.
If only I could make something like this:
xml_as_string = My_suds_object.to_xml()
The following code is just an example and does not run. And it's not elegant; that doesn't matter. I hope you get the idea of what I want to achieve:
I have the function "write_customer_obj_webservice" that works. Now I want to write the function "write_customer_obj_xml_file".
import suds

def get_customer_obj():
    wsdl_url = r'file:C:/somepathhere/Customer.wsdl'
    service_url = r'http://someiphere/Customer'
    c = suds.client.Client(wsdl_url, location=service_url)
    customer = c.factory.create("ns0:CustomerType")
    return customer

def write_customer_obj_webservice(customer):
    wsdl_url = r'file:C:/somepathhere/Customer.wsdl'
    service_url = r'http://someiphere/Customer'
    c = suds.client.Client(wsdl_url, location=service_url)
    response = c.service.save(someparameters, None, None, customer)
    return response

def write_customer_obj_xml_file(customer):
    output_filename = r'C\temp\testxml'
    # The following line is the problem. "to_xml" does not exist and I can't find a way to do it.
    xml = customer.to_xml()
    fo = open(output_filename, 'a')
    try:
        fo.write(xml)
    except:
        raise
    else:
        response = 'All ok'
    finally:
        fo.close()
    return response

# Get the customer object always from the wsdl.
customer = get_customer_obj()
# Since customer is an object, setting its attributes is very easy. There are very complex objects in this system.
customer.name = "Doe J."
customer.age = 42
# Write the new customer to a webservice or store it in a file for later processing.
if later_processing:
    response = write_customer_obj_xml_file(customer)
else:
    response = write_customer_obj_webservice(customer)
I found a way that works for me. The trick is to create the Client with the option "nosend=True".
In the documentation it says:
nosend - Create the soap envelope but don't send. When specified, method invocation returns a RequestContext instead of sending it.
The RequestContext object has the attribute envelope. This is the XML as string.
Some pseudo code to illustrate:
c = suds.client.Client(url, nosend=True)
customer = c.factory.create("ns0:CustomerType")
customer.name = "Doe J."
customer.age = 42
response = c.service.save(someparameters, None, None, customer)
print response.envelope # This prints the XML string that would have been sent.
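Based on that, write_customer_obj_xml_file could be sketched roughly as follows (the WSDL/service URLs, someparameters, and the output path are placeholders, just as in the question):
def write_customer_obj_xml_file(customer):
    wsdl_url = r'file:C:/somepathhere/Customer.wsdl'
    service_url = r'http://someiphere/Customer'
    # nosend=True: build the SOAP envelope but return a RequestContext
    # instead of sending the request to the webservice.
    c = suds.client.Client(wsdl_url, location=service_url, nosend=True)
    request_context = c.service.save(someparameters, None, None, customer)
    with open(r'C:\temp\test.xml', 'w') as fo:
        fo.write(request_context.envelope)  # the XML as a string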
You have some issues in the write_customer_obj_xml_file function:
Fix bad path:
output_filename = r'C:\temp\test.xml'
The following line is the problem. "to_xml" does not exist and I can't find a way to do it.
What's the type of customer? type(customer)?
xml = customer.to_xml() # to be continued...
Why mode='a'? ('a' => append, 'w' => create + write)
Use a with statement (file context manager).
with open(output_filename, 'w') as fo:
    fo.write(xml)
You don't need to return a response string: use an exception handler instead. The exception to catch can be EnvironmentError.
Analysis
The following call:
customer = c.factory.create("ns0:CustomerType")
constructs a CustomerType on the fly and returns a CustomerType instance, customer.
I think you can introspect your customer object, try the following:
vars(customer) # display the object attributes
help(customer) # display an extensive help about your instance
Another way is to try the WSDL URLs by hand, and see the XML results.
You may obtain the full description of your CustomerType object.
And then?
Then, with the attributes list, you can create your own XML. Use an XML template and fill it with the object attributes.
You may also find a magic function (to_xml) which does the job for you. But I'm not sure the XML format would match your need.
client = Client(url)
client.factory.create('somename')
# The last XML request by client
client.last_sent()
# The last XML response from Web Service
client.last_received()

Dnspython: Setting query timeout/lifetime

I have a small script that checks a large list of domains for their MX records, everything works fine but when the script finds a domain with no record, it takes quite a long time to skip to the next one.
I have tried adding:
query.lifetime = 1.0
or
query.timeout = 1.0
but this doesn't seem to do anything. Does anyone know how this setting is configured?
My script is below, thanks for your time.
import dns.resolver
from dns.exception import DNSException
import dns.query
import csv

domains = csv.reader(open('domains.csv', 'rU'))
output = open('output.txt', 'w')

for row in domains:
    try:
        domain = row[0]
        query = dns.resolver.query(domain,'MX')
        query.lifetime = 1.0
    except DNSException:
        print "nothing here"
    for rdata in query:
        print domain, " ", rdata.exchange, 'has preference', rdata.preference
        output.writelines(domain)
        output.writelines(",")
        output.writelines(rdata.exchange.to_text())
        output.writelines("\n")
You're setting the timeout after you've already performed the query. So that's not gonna do anything!
What you want to do instead is create a Resolver object, set its timeout, and then call its resolve() method (query() in older dnspython versions). dns.resolver.query() is just a convenience function that instantiates a default Resolver object and invokes its query method, so you need to do that manually if you don't want a default Resolver.
resolver = dns.resolver.Resolver()
resolver.timeout = 1
resolver.lifetime = 1
Then use this in your loop:
try:
    domain = row[0]
    query = resolver.resolve(domain,'MX')
except:
    # etc.
You should be able to use the same Resolver object for all queries.
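Putting it together, a minimal sketch of the loop with the Resolver configured up front (written for Python 3 and dnspython 2.x, where resolve() is available; on 1.x use resolver.query() instead):
import csv
import dns.resolver
from dns.exception import DNSException

resolver = dns.resolver.Resolver()
resolver.timeout = 1   # per-nameserver timeout, in seconds
resolver.lifetime = 1  # total time allowed for the whole lookup

with open('domains.csv', 'r') as f, open('output.txt', 'w') as output:
    for row in csv.reader(f):
        domain = row[0]
        try:
            answers = resolver.resolve(domain, 'MX')
        except DNSException:
            print("nothing here")
            continue  # skip domains with no usable MX record
        for rdata in answers:
            print(domain, rdata.exchange, 'has preference', rdata.preference)
            output.write("%s,%s\n" % (domain, rdata.exchange.to_text()))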

How to halt, kill, stop, or close a PycURL request on a stream (example given using the Twitter stream)

I'm currently cURLing the Twitter API stream (http://stream.twitter.com/1/statuses/sample.json), so I am constantly receiving data. I wish to stop cURLing the stream once I have retrieved X number of objects from it (in the example I give 10 as an arbitrary number).
You can see how I have attempted to close the connection in the code below. The code below curling.perform() never executes, due to the fact that it is a continuous stream of data. So I attempted to close the stream in the body_callback; however, because perform() is currently running, I cannot invoke close().
Any help would be appreciated.
Code:
# Imports
import pycurl # Used for doing cURL requests
import base64 # Used to encode username and API Key
import json # Used to break down the json objects

# Settings to access stream and API
userName = 'twitter_username' # My username
password = 'twitter_password' # My API Key
apiURL = 'http://stream.twitter.com/1/statuses/sample.json' # the twitter api
tweets = [] # An array of Tweets

# Methods to do with the tweets array
def how_many_tweets():
    print 'Collected: ', len(tweets)
    return len(tweets)

class Tweet:
    def __init__(self):
        self.raw = ''
        self.id = ''
        self.content = ''
    def decode_json(self):
        return True
    def set_id(self):
        return True
    def set_content(self):
        return True
    def set_raw(self, data):
        self.raw = data

# Class to print out the stream as it comes from the API
class Stream:
    def __init__(self):
        self.tweetBeingRead = ''
    def body_callback(self, buf):
        # This gets whole Tweets, and adds them to an array called tweets
        if(buf.startswith('{"in_reply_to_status_id_str"')): # This is the start of a tweet
            # Add Tweet to the global array tweets
            print 'Added:' # Printing output to console
            print self.tweetBeingRead # Printing output to console
            theTweetBeingProcessed = Tweet() # Create a new Tweet Object
            theTweetBeingProcessed.set_raw(self.tweetBeingRead) # Set its raw value to tweetBeingRead
            tweets.append(theTweetBeingProcessed) # Add it to the global array of tweets
            # Start processing a new tweet
            self.tweet = buf # Start a new tweet from scratch
        else:
            self.tweetBeingRead = self.tweetBeingRead+buf
        if(how_many_tweets()>10):
            try:
                curling.close() # This is where the problem lies. I want to close the stream
            except Exception as CurlError:
                print ' Tried closing stream: ', CurlError

# Used to initiate the cURLing of the Data Sift streams
datastream = Stream()
curling = pycurl.Curl()
curling.setopt(curling.URL, apiURL)
curling.setopt(curling.HTTPHEADER, ['Authorization: '+base64.b64encode(userName+":"+password)])
curling.setopt(curling.WRITEFUNCTION, datastream.body_callback)
curling.perform() # This is where cURLing starts
print 'I cant reach here.'
curling.close() # This never gets called. :(
You can abort the transfer from the write callback by returning a number that isn't the same as the amount of data passed in to it. (Returning 'None' is treated the same as returning the number of bytes that were passed in.)
When you abort it, the entire transfer will be considered done and your perform() call returns properly.
That transfer will then return an error as the transfer was aborted.
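A minimal sketch of that approach (counting callback chunks rather than parsed tweets, for brevity; the URL and the threshold of 10 mirror the question):
import pycurl

class Stream:
    def __init__(self):
        self.count = 0
    def body_callback(self, buf):
        self.count += 1
        if self.count > 10:
            # Returning a number different from len(buf) signals a write
            # error to libcurl, which aborts the transfer and lets
            # perform() return (raising pycurl.error).
            return 0
        # Returning None means "all bytes handled", so the stream continues.

datastream = Stream()
curling = pycurl.Curl()
curling.setopt(pycurl.URL, 'http://stream.twitter.com/1/statuses/sample.json')
curling.setopt(pycurl.WRITEFUNCTION, datastream.body_callback)
try:
    curling.perform()  # returns once the callback aborts the transfer
except pycurl.error as e:
    print('Transfer aborted:', e)
finally:
    curling.close()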
