Printing values from dictionary in specific form - python

I have a dictionary with keys relating to various reactions and their data ie. exponentn, comment etc. I want to search and print a list of reactions concerning the atom 'BR'. My code currently prints all reactions for 'BR' and the data in random order. I am not sure which data corresponds to which reaction.
I've had a go at trying to use the repr function to output the data as follows but I'm not having much luck: reactionName : exponentn comment I found another question which I tried to replicate but was not able to do so; printing values and keys from a dictionary in a specific format (python).
class SourceNotDefinedException(Exception):
def __init__(self, message):
super(SourceNotDefinedException, self).__init__(message)
class tvorechoObject(object):
"""The class stores a pair of objects, "tv" objects, and "echo" objects. They are accessed simply by doing .tv, or .echo. If it does not exist, it will fall back to the other variable. If neither are present, it returns None."""
def __init__(self, echo=None, tv=None):
self.tv = tv
self.echo = echo
def __repr__(self):
return str({"echo": self.echo, "tv": self.tv}) # Returns the respective strings
def __getattribute__(self, item):
"""Altered __getattribute__() function to return the alternative of .echo / .tv if the requested attribute is None."""
if item in ["echo", "tv"]:
if object.__getattribute__(self,"echo") is None: # Echo data not present
return object.__getattribute__(self,"tv") # Select TV data
elif object.__getattribute__(self,"tv") is None: # TV data not present
return object.__getattribute__(self,"echo") # Select Echo data
else:
return object.__getattribute__(self,item) # Return all data
else:
return object.__getattribute__(self,item) # Return all data
class Reaction(object):
def __init__(self, inputLine, sourceType=None):
#self.reactionName = QVTorQPObject()
self.exponentn = QVTorQPObject()
self.comment = QVTorQPObject()
self.readIn(inputLine, sourceType=sourceType)
products, reactants = self.reactionName.split(">")
self.products = [product.strip() for product in products.split("+")]
self.reactants = [reactant.strip() for reactant in reactants.split("+")]
def readIn(self, inputLine, sourceType=None):
if sourceType == "echo": # Parsed reaction line for combined format
echoPart = inputLine.split("|")[0]
reactionName = inputLine.split(":")[0].strip()
exponentn = echoPart.split("[")[1].split("]")[0].strip() # inputLine.split("[")[1].split("]")[0].strip()
comment = "%".join(echoPart.split("%")[1:]).strip() # "%".join(inputLine.split("%")[1:]).strip()
# Store the objects
self.reactionName = reactionName
self.exponentn.echo = exponentn
self.comment.echo = comment
elif sourceType == "tv": # Parsed reaction line for combined format
tvPart = inputLine.split("|")[1]
reactionName = inputLine.split(":")[0].strip()
comment = "%".join(tvPart.split("!")[1:]).strip() # "%".join(inputLine.split("!")[1:]).strip()
# Store the objects
self.reactionName = reactionName
self.comment.tv = comment
elif sourceType.lower() == "unified":
reaction = inputLine.split(":")[0]
echoInput, tvInput = ":".join(inputLine.split(":")[1:]).split("|")
echoInput = reaction + ":" + echoInput
tvInput = reaction + ":" + tvInput
if "Not present in TV" not in tvInput:
self.readIn(inputLine, sourceType="tv")
if "Not present in Echo" not in echoInput:
self.readIn(inputLine, sourceType="echo")
else:
raise SourceNotDefinedException("'%s' is not a valid 'sourceType'" % sourceType) # Otherwise print
def __repr__(self):
return str({"reactionName": self.reactionName, "exponentn": self.exponentn, "comment": self.comment, })
return str(self.reactionName) # Returns all relevant reactions
keykeyDict = {}
for key in reactionDict.keys():
keykeyDict[key] = key
formatString = "{reactionName:<40s} {comment:<10s}" # TV format
formatString = "{reactionName:<40s} {exponentn:<10s} {comment:<10s}" # Echo format
return formatString.format(**keykeyDict)
return formatString.format(**reactionDict)
def toDict(self, priority="tv"):
"""Returns a dictionary of all the variables, in the form {"comment":<>, "exponentn":<>, ...}. Design used is to be passed into the echo and tv style line format statements."""
if priority in ["echo", "tv" # Creating the dictionary by a large, horrible, list comprehension, to avoid even more repeated text
return dict([("reactionName", self.reactionName)] + [(attributeName, self.__getattribute__(attributeName).__getattribute__(priority))
for attributeName in ["exponentn", "comment"]])
else:
raise SourceNotDefinedException("{0} source type not recognised.".format(priority)) # Otherwise print
def find_allReactions(allReactions, reactant_set):
"""
reactant_set is the set of reactants that you want to grab all reactions which are relevant allReactions is just the set of reactions you're considering. Need to repeatedly loop through all reactions. If the current reaction only contains reactants in the reactant_set, then add all its products to the reactant set. Repeat this until reactant_set does not get larger.
"""
reactant_set = set(reactant_set) # this means that we can pass a list, but it will always be treated as a set.
#Initialise the list of reactions that we'll eventually return
relevant_reactions = []
previous_reactant_count = None
while len(reactant_set) != previous_reactant_count:
previous_reactant_count = len(reactant_set)
for reaction in allReactions:
if set(reaction.reactants).issubset(reactant_set):
relevant_reactions.append(reaction)
reactant_set = reactant_set.union(set(reaction.products))
return relevant_reactions
print find_allReactions(allReactions, ["BR"])
Current output:
'{'exponentn': {'tv': '0', 'echo': '0'}, 'comment': {'tv': 'BR-NOT USED', 'echo': 'BR-NOT USED'},'reactionName': 'E + BR > BR* + E', {'exponentn': {'qvt': '0', 'qp': '0'}, 'comment': {'qvt': 'BR+ -RECOMBINATION', 'qp': 'BR+ -RECOMBINATION'},'reactionName': 'E + BR* > BR* + E'
Desired output: reactionName exponentn comment
E + BR > BR* + E 0 BR+ -RECOMBINATION
E + BR* > BR* + E 0 BR-NOT USED

If your data is added into the dict in a certain order, and you want to preserve that order, collections.OrderedDict is what you're looking for.

Related

decoding uniswap event data in python with ABI?

I started two days ago with ethereum blockchain, so my knowledge is still a little bit all over the place. Nevertheless, i managed to connect to a node, pull some general block data and so on. As a next level of difficulty, I tried to start building event filters, in order to look at more specific types of historical data (to be clear, I don't want to fetch live data, I would rather like to query through the entire chain, and get historical sample extracts for various types of data).
See here my first attempt to build an event filter for the USDC Uniswap V2 contract, in order to collect Swap events (its not about speed or efficiency right now, just to make it work):
w3 = Web3(Web3.HTTPProvider(NODE_ADDRESS))
# uniswap v2 USDC
address = w3.toChecksumAddress('0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc')
# get the ABI for uniswap v2 pair events
resp = requests.get("https://unpkg.com/#uniswap/v2-core#1.0.0/build/IUniswapV2Pair.json")
if resp.status_code==200:
abi = json.loads(resp.content)['abi']
# create contract object
contract = w3.eth.contract(address=address, abi=abi)
# get topics by hashing abi event signatures
res = contract.events.Swap.build_filter()
# put this into a filter input dictionary
filter_params = {'fromBlock':int_to_hex(12000000),'toBlock':int_to_hex(12010000),**res.filter_params}
# res.filter_params contains: 'topics' and 'address'
# create a filter id (i.e. a hashed version of the filter data, representing the filter)
method = 'eth_newFilter'
params = [filter_params]
resp = self.block_manager.general_sample_request(method,params)
if 'error' in resp:
print(resp)
else:
filter_id = resp['result']
# pass on the filter id, in order to query the respective logs
params = [filter_id]
method = 'eth_getFilterLogs'
resp = self.block_manager.general_sample_request(method,params)
# takes about 10-12s for about 12000 events
the resulting array contains event logs of this structure:
resp['result'][0]
>>>
{'address': '0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc',
'topics': ['0xd78ad95fa46c994b6551d0da85fc275fe613ce37657fb8d5e3d130840159d822',
'0x0000000000000000000000007a250d5630b4cf539739df2c5dacb4c659f2488d',
'0x0000000000000000000000000ffd670749d4179558b6b367e30e72ce2efea28f'],
'data': '0x0000000000000000000000000000000000000000000000000000000000000000000000000000000000000\
00000000000000000000000000034f0f8a0c7663264000000000000000000000000000000000000000000000\
000000000019002d5b60000000000000000000000000000000000000000000000000000000000000000',
'blockNumber': '0xb71b01',
'transactionHash': '0x76403053ee0300411b68fc223b327b51fb4f1a26e1f6cb8667e05ec370e8176e',
'transactionIndex': '0x22',
'blockHash': '0x4bd35cb48395e77fd317a0309342c95d6687dbc4fcb85ada2d635fe266d1e769',
'logIndex': '0x16',
'removed': False}
As far as I understand now, I can somehow apply the ABI to decode the 'data' field.
I tried with this function:
contract.decode_function_input(resp['result'][0]['data'])
but it gives me this error:
>>> ValueError: Could not find any function with matching selector
Seems like there is some problem with decoding the data. However, I am so close now to getting the real data, I dont wanna give up xD. Any help will be appreciated!
Thanks!
import json
import traceback
from pprint import pprint
from eth_utils import event_abi_to_log_topic, to_hex
from hexbytes import HexBytes
from web3._utils.events import get_event_data
from web3.auto import w3
def decode_tuple(t, target_field):
output = dict()
for i in range(len(t)):
if isinstance(t[i], (bytes, bytearray)):
output[target_field[i]['name']] = to_hex(t[i])
elif isinstance(t[i], (tuple)):
output[target_field[i]['name']] = decode_tuple(t[i], target_field[i]['components'])
else:
output[target_field[i]['name']] = t[i]
return output
def decode_list_tuple(l, target_field):
output = l
for i in range(len(l)):
output[i] = decode_tuple(l[i], target_field)
return output
def decode_list(l):
output = l
for i in range(len(l)):
if isinstance(l[i], (bytes, bytearray)):
output[i] = to_hex(l[i])
else:
output[i] = l[i]
return output
def convert_to_hex(arg, target_schema):
"""
utility function to convert byte codes into human readable and json serializable data structures
"""
output = dict()
for k in arg:
if isinstance(arg[k], (bytes, bytearray)):
output[k] = to_hex(arg[k])
elif isinstance(arg[k], (list)) and len(arg[k]) > 0:
target = [a for a in target_schema if 'name' in a and a['name'] == k][0]
if target['type'] == 'tuple[]':
target_field = target['components']
output[k] = decode_list_tuple(arg[k], target_field)
else:
output[k] = decode_list(arg[k])
elif isinstance(arg[k], (tuple)):
target_field = [a['components'] for a in target_schema if 'name' in a and a['name'] == k][0]
output[k] = decode_tuple(arg[k], target_field)
else:
output[k] = arg[k]
return output
def _get_topic2abi(abi):
if isinstance(abi, (str)):
abi = json.loads(abi)
event_abi = [a for a in abi if a['type'] == 'event']
topic2abi = {event_abi_to_log_topic(_): _ for _ in event_abi}
return topic2abi
def _sanitize_log(log):
for i, topic in enumerate(log['topics']):
if not isinstance(topic, HexBytes):
log['topics'][i] = HexBytes(topic)
if 'address' not in log:
log['address'] = None
if 'blockHash' not in log:
log['blockHash'] = None
if 'blockNumber' not in log:
log['blockNumber'] = None
if 'logIndex' not in log:
log['logIndex'] = None
if 'transactionHash' not in log:
log['transactionHash'] = None
if 'transactionIndex' not in log:
log['transactionIndex'] = None
def decode_log(log, abi):
if abi is not None:
try:
# get a dict with all available events from the ABI
topic2abi = _get_topic2abi(abi)
# ensure the log contains all necessary keys
_sanitize_log(log)
# get the ABI of the event in question (stored as the first topic)
event_abi = topic2abi[log['topics'][0]]
# get the event name
evt_name = event_abi['name']
# get the event data
data = get_event_data(w3.codec, event_abi, log)['args']
target_schema = event_abi['inputs']
decoded_data = convert_to_hex(data, target_schema)
return (evt_name, decoded_data, target_schema)
except Exception:
return ('decode error', traceback.format_exc(), None)
else:
return ('no matching abi', None, None)
Example usage:
output = decode_log(
{'data': '0x000000000000000000000000000000000000000000000000000000009502f90000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000093f8f932b016b1c',
'topics': [
'0xd78ad95fa46c994b6551d0da85fc275fe613ce37657fb8d5e3d130840159d822',
'0x0000000000000000000000007a250d5630b4cf539739df2c5dacb4c659f2488d',
'0x000000000000000000000000242301fa62f0de9e3842a5fb4c0cdca67e3a2fab'],
},
pair_abi
)
print(output[0])
pprint(output[1])
# Swap
# {'amount0In': 2500000000,
# 'amount0Out': 0,
# 'amount1In': 0,
# 'amount1Out': 666409132118600476,
# 'sender': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
# 'to': '0x242301FA62f0De9e3842A5Fb4c0CdCa67e3A2Fab'}
Or in your case:
output = decode_log(resp['result'][0], pair_abi)
print(output[0])
pprint(output[1])
# Swap
# {'amount0In': 0,
# 'amount0Out': 6711072182,
# 'amount1In': 3814822253806629476,
# 'amount1Out': 0,
# 'sender': '0x7a250d5630B4cF539739dF2C5dAcb4c659F2488D',
# 'to': '0x0Ffd670749D4179558b6B367E30e72ce2efea28F'}
Now, note that you need to provide the pair_abi variable. It depends on the type of smart contract that you're using. I've found that when on Uniswap V3, the UniswapV2Pair ABI worked for some events, while UniswapV3Pool ABI worked for others, in particular for the Swap event that I've found the most useful.
After a few hours of digging I managed to find this solution, which is a slightly modified version of the one proposed in: https://towardsdatascience.com/decoding-ethereum-smart-contract-data-eed513a65f76 Big thumbs up to its author 👍 You can read more there on parsing the transaction input too.

Data output is not the same as inside function

I am currently having an issue where I am trying to store data in a list (using dataclasses). When I print the data inside the list in the function (PullIncursionData()) it responded with a certain amount of numbers (never the same, not possible due to it's nature). When printing it after it being called to store it's return in a Var it somehow prints only the same number.
I cannot share the numbers, as they update with EVE Online's API, so the only way is to run it locally and read the first list yourself.
The repository is Here: https://github.com/AtherActive/EVEAPI-Demo
Heads up! Inside the main.py (the file with issues) (a snippet of code is down below) are more functions. All functions from line 90 and forward are important, the rest can be ignored for this question, as they do not interact with the other functions.
def PullIncursionData():
#Pulls data from URL and converts it into JSON
url = 'https://esi.evetech.net/latest/incursions/?datasource=tranquility'
data = rq.get(url)
jsData = data.json()
#Init var to store incursions
incursions = []
#Set lenght for loop. yay
length = len(jsData)
# Every loop incursion data will be read by __parseIncursionData(). It then gets added to var Incursions.
for i in range(length):
# Add data to var Incursion.
incursions.append(__parseIncursionData(jsData, i))
# If Dev mode, print some debug. Can be toggled in settings.py
if settings.developerMode == 1:
print(incursions[i].constellation_id)
return incursions
# Basically parses the input data in a decent manner. No comments needed really.
def __parseIncursionData(jsData, i):
icstruct = stru.Incursion
icstruct.constellation_id = jsData[i]['constellation_id']
icstruct.constellation_name = 'none'
icstruct.staging = jsData[i]['staging_solar_system_id']
icstruct.region_name = ResolveSystemNames(icstruct.constellation_id, 'con-reg')
icstruct.status = jsData[i]['state']
icstruct.systems_id = jsData[i]['infested_solar_systems']
icstruct.systems_names = ResolveSystemNames(jsData[i]['infested_solar_systems'], 'system')
return icstruct
# Resolves names for systems, regions and constellations. Still WIP.
def ResolveSystemNames(id, mode='constellation'):
#init value
output_name = 'none'
# If constellation, pull data and find region name.
if mode == 'con-reg':
url = 'https://www.fuzzwork.co.uk/api/mapdata.php?constellationid={}&format=json'.format(id)
data = rq.get(url)
jsData = data.json()
output_name = jsData[0]['regionname']
# Pulls system name form Fuzzwork.co.uk.
elif mode == 'system':
#Convert output to a list.
output_name = []
lenght = len(id)
# Pulls system name from Fuzzwork. Not that hard.
for i in range(lenght):
url = 'https://www.fuzzwork.co.uk/api/mapdata.php?solarsystemid={}&format=json'.format(id[i])
data = rq.get(url)
jsData = data.json()
output_name.append(jsData[i]['solarsystemname'])
return output_name
icdata = PullIncursionData()
print('external data check:')
length = len(icdata)
for i in range(length):
print(icdata[i].constellation_id)
structures.py (custom file)
#dataclass
class Incursion:
constellation_id = int
constellation_name = str
staging = int
staging_name = str
systems_id = list
systems_names = list
region_name = str
status = str
def ___init___(self):
self.constellation_id = -1
self.constellation_name = 'undefined'
self.staging = -1
self.staging_name = 'undefined'
self.systems_id = []
self.systems_names = []
self.region_name = 'undefined'
self.status = 'unknown'

using another WS as validation Flask/Rest/Mysql

I am trying to build a simple web application with 3 web services. Two of my web services are supposed to validate if a student exist in a course or not. This is done by a simple SELECT-query. My third web service should add a student into a database, but only if the student do exist in the specific course.
This is my validation WS which should return a true/false.
#app.route('/checkStudOnCourse/<string:AppCode>/<string:ideal>', methods= ["GET"])
def checkStudOnCourseWS(AppCode, ideal):
myCursor3 = mydb.cursor()
query3 = ("SELECT studentID FROM Ideal.course WHERE applicationCode = " + "'" + AppCode + "' AND Ideal = " + "'" + ideal + "'")
myCursor3.execute(query3)
myresult3 = myCursor3.fetchall()
if len(myresult3) == 0:
return render_template('Invalid.html')
else:
return jsonify({'Student in course ': True})
Below is regResult which should do a SQL insert into a database. I only want the submit to work if the above result is "True", how can I do that? I know I have not done the INSERT query, but that is not a problem.
What I am unsure about is: How can I only let the submit be be INSERTED if the validation WS is "True".
#app.route('/register', methods=["POST", "GET"])
def regResultat():
if request.method == "POST":
Period = request.form['period']
#ProvNr = request.form['provNr']
Grade = request.form['grade']
Applicationcode = request.form['applicationcode']
#Datum = request.form['datum']
Ideal = request.form['ideal']
CheckStudOnCourse = 'http://127.0.0.1:5000/checkAppCodeWS/'+Applicationcode+'/'+Ideal
CheckStudOnResp = requests.get(CheckStudOnCourse)
At first, such syntax:
if len(myresult3) == 0, can be simplified by if myresult3, because Python evaluates that implicitly to bool.
Secondly, if you once returned from function, there is no need to write an else statement:
if len(myresult3) == 0:
return render_template('Invalid.html') # < -- in case 'True',
# it returns here, otherwise
# function keeps going"""
return jsonify({'Student in course ': True}) # < -- in case 'False', it is returned here
Focusing on your issue, you could do that:
Get your value from ws
CheckStudOnCourse = 'http://127.0.0.1:5000/checkAppCodeWS/'+Applicationcode+'/'+Ideal
CheckStudOnResp = requests.get(CheckStudOnCourse)
Extract json from it:
if result_as_json.status == 200:
result_as_json = CheckStudOnResp.json() # < -- it is now a dict
Do some checks:
if result_as_json.get('Student in course', False): # I highly suggest to use other
# convention to name json keys
# e.g. Student in course ->
# student_exists_in_course
# do your code here

Building Abreviations Dictionary from Text file

I would like to build a dictionary of abreviations.
I have a text file with a lot of abreviations. The text file looks like this(after import)
with open('abreviations.txt') as ab:
ab_words = ab.read().splitlines()
An extract:
'ACE',
'Access Control Entry',
'ACK',
'Acknowledgement',
'ACORN',
'A Completely Obsessive Really Nutty person',
Now I want to build the dictionnary, where I have every uneven line as a dictionary key and every even line as the dictionary value.
Hence I should be able to write at the end:
ab_dict['ACE']
and get the result:
'Access Control Entry'
Also, How can I make it case-insensitive ?
ab_dict['ace']
should yield the same result
'Access Control Entry'
In fact, it would be perfect, if the output would also be lower case:
'access control entry'
Here is a link to the text file: https://www.dropbox.com/s/91afgnupk686p9y/abreviations.txt?dl=0
Complete solution with custom ABDict class and Python's generator functionality:
class ABDict(dict):
''' Class representing a dictionary of abbreviations'''
def __getitem__(self, key):
v = dict.__getitem__(self, key.upper())
return v.lower() if key.islower() else v
with open('abbreviations.txt') as ab:
ab_dict = ABDict()
while True:
try:
k = next(ab).strip() # `key` line
v = next(ab).strip() # `value` line
ab_dict[k] = v
except StopIteration:
break
Now, testing (with case-relative access):
print(ab_dict['ACE'])
print(ab_dict['ace'])
print('*' * 10)
print(ab_dict['WYTB'])
print(ab_dict['wytb'])
The output(consecutively):
Access Control Entry
access control entry
**********
Wish You The Best
wish you the best
Here's another solution based on the pairwise function from this solution:
from requests.structures import CaseInsensitiveDict
def pairwise(iterable):
"s -> (s0, s1), (s2, s3), (s4, s5), ..."
a = iter(iterable)
return zip(a, a)
with open('abreviations.txt') as reader:
abr_dict = CaseInsensitiveDict()
for abr, full in pairwise(reader):
abr_dict[abr.strip()] = full.strip()
Here is an answer that also allows sentences to be replaced with words from the dictionary:
import re
from requests.structures import CaseInsensitiveDict
def read_file_dict(filename):
"""
Reads file data into CaseInsensitiveDict
"""
# lists for keys and values
keys = []
values = []
# case sensitive dict
data = CaseInsensitiveDict()
# count used for deciding which line we're on
count = 1
with open(filename) as file:
temp = file.read().splitlines()
for line in temp:
# if the line count is even, a value is being read
if count % 2 == 0:
values.append(line)
# otherwise, a key is being read
else:
keys.append(line)
count += 1
# Add to dictionary
# perhaps some error checking here would be good
for key, value in zip(keys, values):
data[key] = value
return data
def replace_word(ab_dict, sentence):
"""
Replaces sentence with words found in dictionary
"""
# not necessarily words, but you get the idea
words = re.findall(r"[\w']+|[.,!?; ]", sentence)
new_words = []
for word in words:
# if word is in dictionary, replace it and add it to resulting list
if word in ab_dict:
new_words.append(ab_dict[word])
# otherwise add it as normally
else:
new_words.append(word)
# return sentence with replaced words
return "".join(x for x in new_words)
def main():
ab_dict = read_file_dict("abreviations.txt")
print(ab_dict)
print(ab_dict['ACE'])
print(ab_dict['Ace'])
print(ab_dict['ace'])
print(replace_word(ab_dict, "The ACE is not easy to understand"))
if __name__ == '__main__':
main()
Which outputs:
{'ACE': 'Access Control Entry', 'ACK': 'Acknowledgement', 'ACORN': 'A Completely Obsessive Really Nutty person'}
Access Control Entry
Access Control Entry
Access Control Entry
The Access Control Entry is not easy to understand

How to scrape entire integer in python with Beautiful Soup?

Working on getting some wave heights from websites and my code fails when the wave heights get into the double digit range.
Ex: Currently the code would scrape a 12 from the site as '1' and '2' separately, not '12'.
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
NUM_SITES = 2
reportsFinal = []
###################### SURFLINE URL STRINGS AND TAG ###########################
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
slTagText = 'observed-wave-range'
slTag = 'id'
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
msTagText = 'rating-text'
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
'''
This class represents a surf break. It contains all wave, wind, & tide data
associated with that break relevant to the website
'''
class surfBreak:
def __init__(self, name,low, high, wind, tide):
self.name = name
self.low = low
self.high = high
self.wind = wind
self.tide = tide
#toString method
def __str__(self):
return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(self.name,
self.low, self.high, self.wind, self.tide)
#END CLASS
'''
This returns the proper attribute from the surf report sites
'''
def reportTagFilter(tag):
return (tag.has_attr('class') and 'rating-text' in tag['class']) \
or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
#END METHOD
'''
This method checks if the parameter is of type int
'''
def representsInt(s):
try:
int(s)
return True
except ValueError:
return False
#END METHOD
'''
This method extracts all ints from a list of reports
reports: The list of surf reports from a single website
returns: reportNums - A list of ints of the wave heights
'''
def extractInts(reports):
print reports
reportNums = []
afterDash = False
num = 0
tens = 0
ones = 0
#extract all ints from the reports and ditch the rest
for report in reports:
for char in report:
if representsInt(char) == True:
num = int(char)
reportNums.append(num)
else:
afterDash = True
return reportNums
#END METHOD
'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location
rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each
break
tag: the html tag where the actual report lives on the page
returns: a list of strings of each breaks surf report
'''
def extractReports(rootUrl, urlList, tag, tagText):
#empty list to hold reports
reports = []
reportNums = []
index = 0
#loop thru URLs
for url in urlList:
try:
index += 1
#request page
request = requests.get(rootUrl + url)
#turn into soup
soup = BeautifulSoup(request.content, 'lxml')
#get the tag where surflines report lives
reportTag = soup.findAll(reportTagFilter)[0]
reports.append(reportTag.text.strip())
#notify if fail
except:
print 'scrape failure at URL ', index
pass
reportNums = extractInts(reports)
return reportNums
#END METHOD
'''
This method calculates the average of the wave heights
'''
def calcAverages(reportList):
#empty list to hold averages
finalAverages = []
listIndex = 0
waveIndex = 0
#loop thru list of reports to calc each breaks ave low and high
for x in range(0, 6):
#get low ave
average = (reportList[listIndex][waveIndex]
+ reportList[listIndex+1][waveIndex]) / NUM_SITES
finalAverages.append(average)
waveIndex += 1
return finalAverages
#END METHOD
slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)
reportsFinal.append(slReports)
reportsFinal.append(msReports)
print 'Surfline: ', slReports
print 'Magicseaweed: ', msReports
You are not actually extracting integers, but floats, it seems, since the values in reports are something like ['0.3-0.6 m']. Right now you are just going through every single character and converting them to int one by one or discarding. So no wonder that you will get only single-digit numbers.
One (arguably) simple way to extract those numbers from that string is with regexp:
import re
FLOATEXPR = re.compile("(\d+\.\d)-(\d+\.\d) {0,1}m")
def extractFloats(reports):
reportNums = []
for report in reports:
groups = re.match(FLOATEXPR, report).groups()
for group in groups:
reportNums.append(float(group))
return reportNums
This expression would match your floats and return them as a list.
In detail, the expression will match anything that has at least one digit before a '.', and one digit after it, a '-' between, another float sequence and ending with 'm' or ' m'. Then it groups the parts representing floats to a tuple. For example that ['12.0m-3.0m'] would return [12.0, 3.0]. If you expect it to have more digits after the floating point, you can add an extra '+' after the second 'd':s in the expression.

Categories