I want to build a URL-shortener which should work as follows:
shortener saves the original URL typed in by user, the automatically generated numeric ID of the original URL and the base 62-encoded version of the numeric ID.
with each new entered original URL, those 3 types of information are saved into a pandas data frame as columns;
the data frame is empty in the beginning; when the first row is inserted, a random numeric ID is generated; the IDs of following rows are incremented by 1
the process of insertion should have following logic:
User gets asked for input (the original URL)
a check is conducted, whether this URL is already contained in the database; if it is, the user gets asked to enter a new URL
if the URL is not contained in the database yet, the script checks if there are other entries in the database yet
if there are no entries, the entered URL gets inserted into the data frame with a randomly generated ID (which is used as the index of the data frame) and the encoded ID
if there are other entries available, the entered Url gets inserted into the data frame with an ID that is the last available ID in the data frame + 1
What I want to achieve is to operate on the same data frame without creating copies of the data frame with each new entry. So the main function should each time receive the same data frame and update it with new rows. However, when I execute the code below, and the script gets to the point of executing the line
database = database.append(row)
in the update_database-function, I get the following error:
UnboundLocalError: local variable 'database' referenced before assignment
This seems strange, because the variable database is defined on the global scope right on top of the main function, so each function within the main function should have access to it. Can anyone tell me where I'm going wrong?
import numpy as np
import pandas as pd
import string
import random
# Alphabet for base-62 encoding: '0'-'9', then 'A'-'Z', then 'a'-'z'.
digits = list(string.digits)
letters_upper = list(string.ascii_uppercase)
letters_lower = list(string.ascii_lowercase)
base_62 = digits + letters_upper + letters_lower

# Empty URL store; the numeric ID serves as the index.
database = pd.DataFrame(columns=['ID', 'URL_long', 'URL_short']).set_index("ID")
#create the 62-base encoded version of the ID
def encode_id(num, base, base_size):
    """Encode a non-negative integer *num* as a string over *base*.

    Args:
        num: non-negative integer to encode.
        base: sequence of single-character symbols; index 0 encodes
            remainder 0.
        base_size: number of symbols in *base* (62 for base-62).

    Returns:
        Big-endian string representation of *num* in the given base.
    """
    # BUG FIX: the original returned "" for num == 0; zero encodes to base[0].
    if num == 0:
        return base[0]
    result = []
    while num:
        # One divmod per step (the original computed the same divmod
        # three times per iteration).
        num, rem = divmod(num, base_size)
        result.append(base[rem])
    result.reverse()
    return "".join(result)
def main(df):
    """Interactively shorten URLs, growing *df* in place.

    Repeatedly asks for a URL; duplicates re-display the stored short code,
    new URLs are appended as a new row (random 7-digit ID for the first row,
    last ID + 1 afterwards).

    Args:
        df: DataFrame indexed by numeric ID with columns
            'URL_long' and 'URL_short'.

    Returns:
        The same DataFrame object, updated in place.
    """
    database = df

    # Asks user for input.
    def user_input():
        print("exec user_input")
        return input("Enter your URL: ")

    # Inserts a new URL with its ID and base-62 encoded ID.
    def update_database(passed_input_value):
        print("exec_update_database")
        if len(database) == 0:
            # First row: random 7-digit starting ID.
            new_id = int("".join(random.sample(string.digits, 7)))
        else:
            # Subsequent rows: last available ID + 1.
            new_id = int(database.index[-1]) + 1
        row = pd.Series({"URL_long": passed_input_value,
                         "URL_short": encode_id(new_id, base_62, 62)})
        # BUG FIX: the original did `database = database.append(row)`, and
        # that assignment made `database` local to this function, raising
        # UnboundLocalError.  Growing the frame via .loc mutates it in
        # place -- no rebinding (so no nonlocal needed) and no copy, which
        # is exactly what the poster asked for.  (DataFrame.append was also
        # removed in pandas 2.0.)
        database.loc[new_id] = row

    # Checks whether the entered URL is already stored; if so, shows the
    # existing short code and asks again, otherwise inserts it.
    def check_duplicates():
        print("exec_check_duplicates")
        input_value = user_input()
        if input_value in database["URL_long"].unique():
            url_available = database[database["URL_long"] == input_value].iloc[0]['URL_short']
            print(f"URL already shortened: {url_available}.")
            check_duplicates()
        else:
            update_database(input_value)

    check_duplicates()
    return database


main(database)
Related
I want to store the JSON I get from an API, but only get the JSON of the last loop. How to get the lists dynamic? Also I need to use the last query (Pandas) but it's not working.
Last how to make an API to :
List latest forecast for each location for every day.
List average the_temp of last 3 forecasts for each location for every day.
Get the top n locations based on each available metric where n is a parameter given in the API call.
import requests
import json
import sqlite3
import pandas as pd  # library for data frame

print(sqlite3.sqlite_version)

# BUG FIX: the original assigned each day's JSON to a plain variable inside
# the loop, so every iteration overwrote the previous day and only the last
# payload survived.  Accumulate one payload per day in a list per city.
LONDjson = []
SANjson = []
BERLjson = []
for x in range(20, 28):
    r = requests.get('https://www.metaweather.com/api/location/44418/2021/4/' + str(x) + '/')
    LONDjson.append(r.json())
    r2 = requests.get('https://www.metaweather.com//api/location/2487956/2021/4/' + str(x) + '/')
    SANjson.append(r2.json())
    r3 = requests.get('https://www.metaweather.com//api/location/638242/2021/4/' + str(x) + '/')
    BERLjson.append(r3.json())

# BUG FIX: connect and (re)create the tables ONCE, after the download loop;
# the original dropped and recreated them on every iteration, discarding
# previously inserted rows.
conn = sqlite3.connect(r'D:\weatherdb.db')  # create db in path
cursor = conn.cursor()
cursor.executescript('''
DROP TABLE IF EXISTS LONDjson;
DROP TABLE IF EXISTS SANjson;
DROP TABLE IF EXISTS BERLjson;
CREATE TABLE LONDjson (id int, data json);
''')

# Each list element is itself a list of forecast dicts for one day,
# so flatten one level before inserting.
for day in LONDjson:
    for LOND in day:
        cursor.execute("insert into LONDjson values (?, ?)",
                       [LOND['id'], json.dumps(LOND)])
conn.commit()

z = cursor.execute('''select json_extract(data, '$.id', '$.the_temp', '$.weather_state_name', '$.applicable_date' ) from LONDjson;
''').fetchall()  # query the data
Hint: in your initial for loop you are not storing the results of the API call — you are storing them in a variable that just gets re-written on each loop iteration.
A common solution for this is to start with an empty list that you append to; if you are storing multiple variables, store a dictionary as each element of the list.
example
# Accumulate one dict per value of x instead of overwriting a variable
# on every loop iteration.
results = []
for x in range(10):
    results.append(
        {
            'x': x,
            # BUG FIX: key was misspelled as 'x_sqaured'.
            'x_squared': x * x,
            'abs_x': abs(x)
        }
    )
print(results)
It looks like there's at least two things that can be improved in the data manipulation part of your code.
Using an array to store the retrieved data
# One list per city; each day's payload is appended rather than
# overwriting a single variable.
LONDjson = []
SANjson = []
BERLjson = []
for day in range(20, 28):
    response = requests.get('https://www.metaweather.com/api/location/44418/2021/4/' + str(day) + '/')
    LONDjson.append(response.json())
    response = requests.get('https://www.metaweather.com//api/location/2487956/2021/4/' + str(day) + '/')
    SANjson.append(response.json())
    response = requests.get('https://www.metaweather.com//api/location/638242/2021/4/' + str(day) + '/')
    BERLjson.append(response.json())
Retrieving the data from the array
# Each list element is a one-entry list wrapping the day's dictionary,
# so index [0] before reading fields.
for day_payload in LONDjson:
    print(day_payload[0]['id'])
Hope this helps you out.
I'm using python unittests and sqlalchemy to test datamodels to store an WTFoms in mariaDB.
The test should create a dataset, write this dataset to the db, read this set back and compare whether the original dataset is the same as the stored data.
So the partial test looks like that:
# Build an in-memory form instance (not yet persisted).
myForm = NiceForm()
myForm.name = "Ben"
# Persist it: register with the session, then commit to the database.
db.session.add(myForm)
db.session.commit()
# Read every stored NiceForm row back.
loadedForms = NiceForm.query.all()
# Exactly one row should exist in the db.
self.assertEqual(len(loadedForms), 1)
# NOTE(review): assertIn checks membership (object identity/equality),
# not field values -- it cannot detect a changed attribute.
self.assertIn(myForm, loadedForms)
The test seems to work fine. Now I tried to find out whether the test fails if dataset != stored data. So I changed the dataset before comparing it, like this:
# Build and persist a form with name "Ben".
myForm = NiceForm()
myForm.name = "Ben"
db.session.add(myForm)
db.session.commit()
# Query all stored forms; SQLAlchemy's identity map returns the very
# same Python object that was just committed.
loadedForms = NiceForm.query.all()
# Mutate the object AFTER the read; note there is no commit here.
myForm.name = "Foo"
# Both print "Foo" -- myForm and loadedForms[0] are the same object.
print(myForm.name)
print(loadedForms[0].name)
# Row count is still one.
self.assertEqual(len(loadedForms), 1)
# Passes because membership is identity-based, not value-based.
self.assertIn(myForm, loadedForms)
This test still passed. Why? I output the content of myForm.name and loadedForms[0].name, and both were set to Foo. This is the reason why the self.assertIn(myForm, loadedForms) passed the test, but I don't understand:
Why is the content of loadedForms changed, when Foo was only applied to myForm?
The row identity for MyForm does not change by changing one of the values.
Row numbers have no meaning in a table, but to make the issue clear I will still use them.
Row 153 has 2 fields. Field name = "Ben" and field homeruns = 3.
Now we change the home runs (Ben has hit a home run);
Row 153 has 2 fields. Field name = "Ben" and field homeruns = 4.
It is still row 153, so your assertIn will still return True, though one of the values in the row has changed. You only test identity.
If it wouldn't, changing a field in a table row would need to be saved by an insert into the table and not an update to the row. That is not correct of course; how many Bens do we have? One. And he has 4 home runs, not 3 or 4, depending on which record you look at.
In this use case, I am trying to add documents to a MongoDB collection using pymongo that are retrieved from various RSS news feeds based on the date (not datetime), title, and article summary in dataframe format (the date being the index to the dataframe).
When I store the dataframe to the database, they are stored with the schema of _id, date, title, summary which is fine.
So what I'm trying to do is only upload those rows in the dataframe which haven't been stored as documents in the collection. There are a few ways I've tried:
Get the last document in the database, compare to the dataframe. Create a new DF which excludes all previous rows + the row its being compared to. This should work, however, it is still uploading roughly 20% of the rows which have been previously stored and I have no idea why.
Store the entire dataframe, then aggregate the collection and remove the duplicates: Sounds good in theory however all of the examples of doing this are in JS and not python, so I haven't been able to get this to work.
Create a unique index of the title: Again, this should work in theory, but I haven't gotten it to work.
One thing that I don't want to do is to query the entire collection and store as a DF, concatenate them, drop the duplicates, delete the collection, and re-create it from the new DF. It wouldn't be an issue now since I'm working with 30 or so documents, but when I'll be working with multiple collections and millions of documents, well.. not very efficient at all.
Anyone have any suggestions I can look into / research / code examples?
Here is the code I'm working with now:
Download RSS Feed
def getSymbolNews(self, symbol: str):
    """Fetch news for *symbol* from several feeds, merge, store, and return.

    Pulls SeekingAlpha, Yahoo and Webull items, concatenates them into a
    single frame, de-duplicates, sorts by index (the date, per the post),
    hands the frame to uploadRecords, and returns it.

    Args:
        symbol: ticker symbol -- used to derive the database name
            ("db_<symbol>") and the feed topics ("$<symbol>").

    Returns:
        The combined, de-duplicated, index-sorted dataframe.
    """
    self.symbol = symbol
    self.dbName = 'db_' + self.symbol
    self.columnName = 'col_News'
    self.topics = ['$' + self.symbol]
    # Parse each source into a dataframe (assumed date-indexed with
    # title/summary columns -- TODO confirm against parseNews).
    self.sa = getNews().parseNews(fn.SeekingAlpha(topics = self.topics))
    self.yfin = getNews().parseNews(fn.Yahoo(topics = self.topics))
    self.wb_news = getNews().getWebullNews(self.symbol)
    # Stack the three frames row-wise, keeping the original index values.
    self.df = pd.concat([self.sa, self.yfin, self.wb_news], axis = 0, ignore_index = False)
    self.df.drop_duplicates(inplace = True)
    self.df.sort_index(ascending = True, inplace = True)
    # Drop the intermediates; dbName, columnName and df remain on self.
    del self.symbol, self.topics, self.sa, self.yfin, self.wb_news
    getNews().uploadRecords(self.dbName, self.columnName, self.df)
    return self.df
Upload to Collection:
def uploadRecords(self, dbName, columnName, data):
    """Prepare dataframe *data* and hand it to mongoFunctions.insertRecords.

    Args:
        dbName: target database name.
        columnName: target collection name.
        data: dataframe whose index is moved into a 'Date' column.

    NOTE(review): reset_index(inplace=True) mutates the CALLER's dataframe
    too, since self.data is the same object as data -- confirm this side
    effect is intended.
    """
    self.data = data
    self.dbName = dbName
    self.columnName = columnName
    # Turn the index into a regular column named 'Date' before storage.
    self.data.reset_index(inplace=True)
    self.data.rename(columns={'index': 'Date'}, inplace = True)
    mongoFunctions.insertRecords(self.dbName, self.columnName, self.data)
    # Drop the attribute reference and force a collection pass.
    del self.data
    gc.collect()
    return
PyMongo function to upload:
def insertRecords(dbName: str, collectionName: str, data: object):
    """Insert only the rows of *data* not already stored in the collection.

    Args:
        dbName (str): Database name
        collectionName (str): Collection name
        data (object): Pandas dataframe object
    """
    collection = getCollection(dbName, collectionName)
    query = queryAllRecords(dbName, collectionName)
    if query.empty:  # more robust than shape == (0, 0), which fails if columns exist
        # Nothing stored yet -- upload everything.
        # (collection.insert was deprecated/removed in PyMongo; use insert_many.)
        collection.insert_many(data.to_dict(orient="records"))
    else:
        query.drop(["_id"], axis=1, inplace=True)
        if query.equals(data):
            return
        # BUG FIX: the original concat + drop_duplicates(keep=False) kept
        # *every* non-duplicated row from BOTH frames, so stored rows that
        # were absent from *data* got re-inserted (the ~20% re-uploads).
        # Keep only rows present in *data* but not in the collection.
        # Assumes query and data share the same columns after dropping _id.
        merged = data.merge(query.drop_duplicates(), how="left", indicator=True)
        new_rows = merged[merged["_merge"] == "left_only"].drop(columns="_merge")
        if not new_rows.empty:
            collection.insert_many(new_rows.to_dict(orient="records"))
    return
I'd be minded to take an md5 hash of the document and store that as the _id; then you can just use insert_many() with ordered=False to insert any items that aren't duplicates; you can run this as often as you like and only new items will be added; bear in mind that if any field is even slightly changed a new item is added; if this isn't the behaviour you want then tweak what you pass to md5().
The code ends up being fairly straightforward:
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
import feedparser
from hashlib import md5
from json import dumps
# Give every item a deterministic _id (md5 of its JSON serialization),
# so re-inserting an unchanged item raises a duplicate-key error instead
# of creating a second copy.
db = MongoClient()['mydatabase']
feed = feedparser.parse("http://feeds.bbci.co.uk/news/world/rss.xml")
entries = feed['entries']
for entry in entries:
    entry['_id'] = md5(dumps(entry).encode("utf-8")).hexdigest()
try:
    # ordered=False keeps inserting past duplicate-key failures.
    db.news.insert_many(entries, ordered=False)
except BulkWriteError:
    pass
I am attempting to get the results of a stored procedure and populate a model dynamically, or at a minimum, generate a model based off of the result.
My intent is to create a reusable function where it should be ambiguous to the data. I will not know the fields being returned, and wish to take what's returned from the stored procedure, get the field names and put the data in an object with said field names.
How can I dynamically discover the columns in a result set returned from a stored procedure and then create an object to match?
I was able to figure this out. I got a list of the column names from the returned data, created an object by name and set properties/attributes of the object by string.
def callProc(sqlString, clsName, conn=None):
    """Execute *sqlString* and map each result row onto a fresh object.

    Args:
        sqlString: SQL text to execute (e.g. a stored-procedure call).
        clsName: name of a class defined in this module, or the class
            object itself.
        conn: optional DB-API connection; defaults to the module-level
            ``connection`` for backward compatibility.

    Returns:
        List with one new object per row; each result column becomes an
        attribute of that object.  Empty list (after printing the error)
        if anything raises.
    """
    cursor = (conn if conn is not None else connection).cursor()
    dataResults = []
    try:
        cursor.execute(sqlString)
        results = cursor.fetchall()
        # Column names come from the cursor metadata.
        columns = [column[0] for column in cursor.description]
        # Resolve the target class once, outside the row loop.
        cls = clsName if isinstance(clsName, type) else getattr(sys.modules[__name__], clsName)
        for row in results:
            # BUG FIX: the original set attributes on the CLASS object and
            # appended that same class for every row, so all "rows" shared
            # the last row's values.  Instantiate a new object per row
            # (assumes the class has a no-argument constructor).
            p = cls()
            for name, value in zip(columns, row):
                setattr(p, name, value)
            dataResults.append(p)
    except Exception as ex:
        print(ex)
    finally:
        cursor.close()
    return dataResults
I have some dbf files that I want to add new fields to. To do so, I'm using dbfpy to open the original dbf, copy all fields (or the ones I want to keep) and records and then create a new file with those fields plus the new ones that I want. All is working great, except for one minor detail: I can't manage to keep the original fields' types, since I don't know how to obtain them. What I'm doing is to create all the fields in the new file as "C" (character), which so far works for what I need right now but might be an issue eventually.
The real problem is that there is no documentation available. I searched through the package files to look for the examples there, but couldn't find an answer to this question (might be that I couldn't find just by the "greenish" I still am with python... I'm definitely not an expert).
An example of the code:
from dbfpy import dbf
import sys

# Source table (read) and destination table (created from scratch).
org_db_file = str(sys.argv[1])
org_db = dbf.Dbf(org_db_file, new = False)
new_db_file = str(sys.argv[2])
new_db = dbf.Dbf(new_db_file, new = True)

# Obtain original field names and track the widest value per field, since
# every field is re-created as character type ("C").
fldnames = []
fldsize = {}
for name in org_db.fieldNames:
    fldnames.append(name)
    # BUG FIX: the original looped over ``names`` but initialised
    # ``fldsize[name]`` -- an undefined variable -> NameError.
    fldsize[name] = 0

# Cycle thru table entries to find each column's maximum width:
for rec in org_db:
    for name in fldnames:
        value = str(rec[name])
        if len(value) > fldsize[name]:
            fldsize[name] = len(value)

# Copy original fields to new table:
# BUG FIX: the original indexed fldsize with a stale variable from the
# previous loop instead of the current loop variable.
for name in fldnames:
    new_db.addField((name, "C", fldsize[name]))

# Add new fields:
new_fieldname = "some_name"
new_db.addField((new_fieldname, "C", 2))

# Copy original entries and store new values:
for rec in org_db:
    # Create new record instance for new table:
    new_rec = new_db.newRecord()
    # Populate fields copied from the original record:
    for field in fldnames:
        new_rec[field] = rec[field]
    # Store value of new field for this record:
    new_rec[new_fieldname] = "some_value"
    new_rec.store()
new_db.close()
Thanks in advance for your time.
Cheers.
I don't have any experience with dbfpy other than when I first went looking several years ago it (and several others) did not meet my needs. So I wrote my own.
Here is how you would accomplish your task using it:
import dbf
import sys

org_db_file = sys.argv[1]
org_db = dbf.Table(org_db_file)
new_db_file = sys.argv[2]
# postpone creating the new table until we have the field definitions

# Obtain original field list:
fields = org_db.field_names
for field in fields[:]:  # cycle through a separate copy so removal is safe
    if field == "something we don't like":
        fields.remove(field)

# now get definitions for fields we keep
# BUG FIX: the snippet mixed ``ord_db`` and ``org_db``; use one name.
field_defs = org_db.structure(fields)

# Add new fields (name must also be bound for the copy loop below --
# the original snippet never defined ``new_fieldname``):
new_fieldname = "some_name"
field_defs.append("some_name C(2)")

# now create new table
new_db = org_db.new(new_db_file, field_specs=field_defs)

# open both tables
with dbf.Tables(org_db, new_db):
    # Copy original entries and store new values:
    for rec in org_db:
        # Create new record instance for new table:
        new_db.append()
        # Populate fields:
        with new_db.last_record as new_rec:
            for field in new_db.field_names:
                # BUG FIX: the new field does not exist in the source
                # record; copying it would raise.  Skip and set it below.
                if field == new_fieldname:
                    continue
                new_rec[field] = rec[field]
            # Store value of new field for record i:
            new_rec[new_fieldname] = "some_value"