Get all rows in JSON format with a Cassandra REST API - Python

I have the following code that allows me to retrieve the first keyspace:
def Query(str):
    auth_provider = PlainTextAuthProvider(username='admin', password='root')
    cluster = Cluster(['hostname'], auth_provider=auth_provider)
    session = cluster.connect('system')
    rows = session.execute(str)
    keyspaces = []
    row_list = list(rows)
    for x in range(len(row_list)):
        return row_list[0]

@app.route('/keyspaces')
def all():
    return Query('select json * from schema_keyspaces')
I would like to get not only all the keyspaces but also their attributes, as a JSON document. How can I proceed?
Thanks,

Instead of a loop that only runs once, you need to collect all the elements:
rows = session.execute(str)
return jsonify(list(rows))
Note that you should ideally not be creating a new Cassandra connection for each query you need to make, but that's unrelated to the current problem.
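For illustration, a minimal sketch of a Flask route that reuses a single session; Flask, jsonify and the connection details are assumptions based on the question:

import json
from flask import Flask, jsonify
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

app = Flask(__name__)

# Create the cluster and session once at startup and reuse them for every request.
auth_provider = PlainTextAuthProvider(username='admin', password='root')
cluster = Cluster(['hostname'], auth_provider=auth_provider)
session = cluster.connect('system')

@app.route('/keyspaces')
def all_keyspaces():
    rows = session.execute('select json * from schema_keyspaces')
    # With SELECT JSON, each row holds a single column containing a JSON string,
    # so parse each one before handing the list to jsonify.
    return jsonify([json.loads(row[0]) for row in rows])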

Related

I made two functions in pymongo, but the output I want is different from what I get from the function. Any ideas how I can fix it?

Function that saves Close, Symbol, Timeframe
def Save_(self,collection,symbol,price,TF):
    db = self.get_db('MTF')[collection]
    B = {'ts':time.time(), "Symbol":symbol,
         "Price":price, 'TimeFrame':TF}
    data = db.insert_one(B)
    return data
Function to get data from MongoDB
def find_all(self,collection):
    db = self.get_db('MTF')[collection]
    Symbols = {}
    data = db.find({})
    for i in data:
        Symbols[i['Symbol']] = [i['Price'], i['TimeFrame']]
    return Symbols
Image of the data in MongoDB: https://i.stack.imgur.com/RLtnz.png
Image of the output from the B function: https://i.stack.imgur.com/AtwSy.png
If you look at the images, the B function only gave me one timeframe, but the Save function stored four timeframes.
Looking at this loop:
for i in data:
    Symbols[i['Symbol']] = [i['Price'], i['TimeFrame']]
If you have the same Symbol coming from MongoDB, it will overwrite any previous value, so you will only get the final value for each Symbol, which is what you are seeing.
To fix it you have a few options: you could check the key and either create or append the values to Symbols; or you could use $push in an aggregate query.
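For example, a minimal sketch of the first option (assuming the same get_db helper from your question), which keeps every price/timeframe pair per symbol:

def find_all(self, collection):
    db = self.get_db('MTF')[collection]
    Symbols = {}
    for i in db.find({}):
        # setdefault creates an empty list the first time a Symbol is seen;
        # every later document for that Symbol is appended instead of overwriting.
        Symbols.setdefault(i['Symbol'], []).append([i['Price'], i['TimeFrame']])
    return Symbols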

Making lists store all data from the loop and not only the last item

I want to store the JSON I get from an API, but I only get the JSON of the last loop iteration. How do I make the lists dynamic? Also, I need to use the last query (Pandas), but it's not working.
Lastly, how do I make an API to:
List the latest forecast for each location for every day.
List the average the_temp of the last 3 forecasts for each location for every day.
Get the top n locations based on each available metric, where n is a parameter given in the API call.
import requests
import json
import sqlite3
import pandas as pd  # library for data frame

print(sqlite3.sqlite_version)

for x in range(20, 28):  # I need the LONDjson/BERLjson/SANjson lists to be dynamic so they store the JSON from every iteration, not just the last one
    r = requests.get('https://www.metaweather.com/api/location/44418/2021/4/'+str(x)+'/')  # GET request from the source url
    LONDjson = r.json()  # JSON object of the result
    r2 = requests.get('https://www.metaweather.com//api/location/2487956/2021/4/'+str(x)+'/')
    SANjson = r2.json()
    r3 = requests.get('https://www.metaweather.com//api/location/638242/2021/4/'+str(x)+'/')
    BERLjson = r3.json()

conn = sqlite3.connect('D:\weatherdb.db')  # create db in path
cursor = conn.cursor()

#import pprint
#pprint.pprint(LONDjson)

cursor.executescript('''
DROP TABLE IF EXISTS LONDjson;
DROP TABLE IF EXISTS SANjson;
DROP TABLE IF EXISTS BERLjson;
CREATE TABLE LONDjson (id int, data json);
''')

for LOND in LONDjson:
    cursor.execute("insert into LONDjson values (?, ?)",
                   [LOND['id'], json.dumps(LOND)])
conn.commit()

z = cursor.execute('''select json_extract(data, '$.id', '$.the_temp', '$.weather_state_name', '$.applicable_date') from LONDjson;
''').fetchall()  # query the data
Hint: in your initial for loop you are not storing the results of the API call; you store them in a variable, but that variable just gets re-written on each iteration of the loop.
A common solution is to start with an empty list that you append to; if you are storing multiple variables, you can store a dictionary as each element of the list.
Example:
results = []
for x in range(10):
    results.append(
        {
            'x': x,
            'x_squared': x*x,
            'abs_x': abs(x)
        }
    )
print(results)
It looks like there are at least two things that can be improved in the data manipulation part of your code.
Using an array to store the retrieved data
LONDjson = []
SANjson = []
BERLjson = []

for x in range(20, 28):
    r = requests.get('https://www.metaweather.com/api/location/44418/2021/4/'+str(x)+'/')
    LONDjson.append(r.json())
    r2 = requests.get('https://www.metaweather.com//api/location/2487956/2021/4/'+str(x)+'/')
    SANjson.append(r2.json())
    r3 = requests.get('https://www.metaweather.com//api/location/638242/2021/4/'+str(x)+'/')
    BERLjson.append(r3.json())
Retrieving the data from the array
# The retrieved data is a dictionary inside a list with only one entry
for LOND in LONDjson:
    print(LOND[0]['id'])
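With the entries collected this way, the insert step from your question can simply loop over the list; a sketch, reusing the cursor and LONDjson table from your code:

# Each element of LONDjson is one day's API response (itself a list of forecast dicts),
# so loop over both levels when writing to the table.
for day in LONDjson:
    for forecast in day:
        cursor.execute("insert into LONDjson values (?, ?)",
                       [forecast['id'], json.dumps(forecast)])
conn.commit()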
Hope this helps you out.

How to get column names in a SQLAlchemy query?

I have a function (in a remote database, without models in my app) that I am querying. Is it possible to get the column names using query rather than execute?
session = Session(bind=engine)
data = session.query(func.schema.func_name())
I am getting an array of strings with values; how do I get the keys? I want to generate a dict.
When I make the request with execute, the dictionary is generated fine:
data = session.execute("select * from schema.func_name()")
result = [dict(row) for row in data]
You can do something like:
keys = session.execute("select * from schema.func_name()").keys()
Or try accessing it after the query:
data = session.query(func.schema.func_name()).all()
data[0].keys()
You can also use: data.column_descriptions
Documentation:
https://docs.sqlalchemy.org/en/14/orm/query.html
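For instance, a minimal sketch of the keys() approach (the connection URL and function name are placeholders, assuming SQLAlchemy 1.4):

from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("postgresql://user:password@host/dbname")  # placeholder URL
session = Session(bind=engine)

# execute() returns a result whose keys() are the column names reported by the database,
# so each row can be zipped into a dict even without any model defined.
result = session.execute(text("select * from schema.func_name()"))
keys = result.keys()
rows = [dict(zip(keys, row)) for row in result]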

What's the most efficient way to add (new) documents from a Dataframe to MongoDB?

In this use case, I am trying to add documents to a MongoDB collection using pymongo. The documents are retrieved from various RSS news feeds and consist of the date (not datetime), title, and article summary in dataframe format (the date being the index of the dataframe).
When I store the dataframe in the database, the documents are stored with the schema _id, date, title, summary, which is fine.
So what I'm trying to do is only upload those rows in the dataframe which haven't already been stored as documents in the collection. There are a few ways I've tried:
Get the last document in the database and compare it to the dataframe. Create a new DF which excludes all previous rows plus the row it's being compared to. This should work; however, it is still uploading roughly 20% of the rows which have been previously stored, and I have no idea why.
Store the entire dataframe, then aggregate the collection and remove the duplicates: sounds good in theory, however all of the examples of doing this are in JS and not Python, so I haven't been able to get this to work.
Create a unique index on the title: again, this should work in theory, but I haven't gotten it to work.
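For reference, a rough sketch of what that unique-index approach could look like (the database/collection names here are illustrative):

import pymongo
from pymongo import MongoClient
from pymongo.errors import BulkWriteError

collection = MongoClient()['db_TEST']['col_News']  # illustrative database/collection names

# A unique index makes MongoDB reject any later document whose title has already been stored.
collection.create_index([('title', pymongo.ASCENDING)], unique=True)

records = df.to_dict(orient='records')  # df: the dataframe of feed rows described above
try:
    # ordered=False keeps inserting the remaining rows even after duplicate-key errors
    collection.insert_many(records, ordered=False)
except BulkWriteError:
    pass  # duplicates were skipped; everything else was inserted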
One thing that I don't want to do is to query the entire collection and store it as a DF, concatenate them, drop the duplicates, delete the collection, and re-create it from the new DF. It wouldn't be an issue now since I'm working with 30 or so documents, but when I'm working with multiple collections and millions of documents it would not be very efficient at all.
Anyone have any suggestions I can look into / research / code examples?
Here is the code I'm working with now:
Download RSS Feed
def getSymbolNews(self, symbol):
    self.symbol = symbol
    self.dbName = 'db_' + self.symbol
    self.columnName = 'col_News'
    self.topics = ['$' + self.symbol]

    self.sa = getNews().parseNews(fn.SeekingAlpha(topics = self.topics))
    self.yfin = getNews().parseNews(fn.Yahoo(topics = self.topics))
    self.wb_news = getNews().getWebullNews(self.symbol)

    self.df = pd.concat([self.sa, self.yfin, self.wb_news], axis = 0, ignore_index = False)
    self.df.drop_duplicates(inplace = True)
    self.df.sort_index(ascending = True, inplace = True)
    del self.symbol, self.topics, self.sa, self.yfin, self.wb_news

    getNews().uploadRecords(self.dbName, self.columnName, self.df)
    return self.df
Upload to Collection:
def uploadRecords(self, dbName, columnName, data):
    self.data = data
    self.dbName = dbName
    self.columnName = columnName

    self.data.reset_index(inplace=True)
    self.data.rename(columns={'index': 'Date'}, inplace = True)

    mongoFunctions.insertRecords(self.dbName, self.columnName, self.data)

    del self.data
    gc.collect()
    return
PyMongo function to upload:
def insertRecords(dbName: str, collectionName: str, data: object):
    """Inserts a pandas dataframe object into a MongoDB collection (table)

    Args:
        dbName (str): Database name
        collectionName (str): Collection name
        data (object): Pandas dataframe object
    """
    collection = getCollection(dbName, collectionName)
    query = queryAllRecords(dbName, collectionName)

    if query.shape == (0, 0):
        record = data.to_dict(orient="records")
        collection.insert(record)
    else:
        query.drop(["_id"], axis=1, inplace=True)
        if query.equals(data):
            return
        else:
            df_temp = pd.concat([query, data]).drop_duplicates(keep=False)
            records = df_temp.to_dict(orient="records")
            collection.insert_many(records)
    return
I'd be minded to take an md5 hash of the document and store that as the _id; then you can just use insert_many() with ordered=False to insert any items that aren't duplicates. You can run this as often as you like and only new items will be added. Bear in mind that if any field is even slightly changed, a new item is added; if this isn't the behaviour you want, then tweak what you pass to md5().
The code ends up being fairly straightforward:
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
import feedparser
from hashlib import md5
from json import dumps

db = MongoClient()['mydatabase']

entries = feedparser.parse("http://feeds.bbci.co.uk/news/world/rss.xml")['entries']

for item in entries:
    item['_id'] = md5(dumps(item).encode("utf-8")).hexdigest()

try:
    db.news.insert_many(entries, ordered=False)
except BulkWriteError:
    pass
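The same idea carries over to the dataframe in your question: hash each row dict before inserting, roughly like this (a sketch that reuses the getCollection helper from your code; the default=str handling of dates is an assumption):

from hashlib import md5
from json import dumps
from pymongo.errors import BulkWriteError

def insertNewRecords(dbName: str, collectionName: str, data):
    """Insert only previously unseen dataframe rows, keyed by an md5 hash of each row."""
    collection = getCollection(dbName, collectionName)  # helper from your code
    records = data.to_dict(orient="records")
    for record in records:
        # default=str makes dates/timestamps JSON-serializable before hashing;
        # sort_keys keeps the hash stable regardless of column order
        record["_id"] = md5(dumps(record, default=str, sort_keys=True).encode("utf-8")).hexdigest()
    try:
        collection.insert_many(records, ordered=False)
    except BulkWriteError:
        pass  # duplicate _ids are skipped; all new rows are inserted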

SQLAlchemy MySQL - Optimal method to update a table that needs to be entirely updated frequently. Questions on implementation

My use case:
My code runs multiple Scrapy spiders on different US counties to collect property data on every property. This is done by looping through a list of PINs/parcels (100k to 200k) which are appended to the same URLs over and over, collecting sales data on each parcel or property, and storing that data in its respective county table one row at a time. My use case involves updating these tables frequently (once a week or so) to collect trends in sales data. Out of 100k properties, it may be that only a few acquired new sales records, but I would not know unless I went through all of them.
I began implementing this via the pipeline below, which essentially accomplishes getting the data into the table on the first run, when the table is a clean slate. However, when re-running to refresh the data, I'm obviously unable to insert rows that contain the same unique ID and would need to update the row instead. My unique ID for each data point is its parcel number.
My questions:
1. What is the optimal method to update a database table that requires a full refresh (all rows) frequently?
My guess so far, based on research I've done, is replacing the old table with a new temporary table. This is because it would be quicker (I think) to insert all data into a new table than to query each item in the old table, see if it has changed, and if changed, modify that row. This can be accomplished by inserting all data into the temporary table first, then replacing the old table with the new one.
If my method of implementation is optimal, how would I go about implementing this?
Should I use some kind of data migration module (pandas?)? What would happen if I dropped the old table and the program was interrupted at that point, prior to the new table replacing it?
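For what it's worth, a minimal sketch of the staging-table swap described above might look like this (the table names and the db_connect() helper are assumptions based on the pipeline below; MySQL's multi-table RENAME is a single atomic statement, which is relevant to the interruption concern):

from sqlalchemy import text

engine = db_connect()  # assumed to be the same helper the pipeline uses

# 1. Build an empty staging table with the same structure as the live table.
with engine.begin() as conn:
    conn.execute(text("DROP TABLE IF EXISTS pierce_property_data_staging"))
    conn.execute(text("CREATE TABLE pierce_property_data_staging LIKE pierce_property_data"))

# 2. ... bulk-insert all freshly scraped rows into pierce_property_data_staging here ...

# 3. Swap the tables in one atomic statement, then drop the old copy.
with engine.begin() as conn:
    conn.execute(text(
        "RENAME TABLE pierce_property_data TO pierce_property_data_old, "
        "pierce_property_data_staging TO pierce_property_data"
    ))
    conn.execute(text("DROP TABLE pierce_property_data_old"))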
class PierceDataPipeline(object):

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates tables.
        """
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """
        This method is called for every item pipeline component
        """
        session = self.Session()
        propertyDataTable = PierceCountyPropertyData()

        propertyDataTable.parcel = item["parcel"]
        propertyDataTable.mailing_address = item["mailing_address"]
        propertyDataTable.owner_name = item["owner_name"]
        propertyDataTable.county = item["county"]
        propertyDataTable.site_address = item["site_address"]
        propertyDataTable.property_type = item["property_type"]
        propertyDataTable.occupancy = item["occupancy"]
        propertyDataTable.year_built = item["year_built"]
        propertyDataTable.adj_year_built = item["adj_year_built"]
        propertyDataTable.units = item["units"]
        propertyDataTable.bedrooms = item["bedrooms"]
        propertyDataTable.baths = item["baths"]
        propertyDataTable.siding_type = item["siding_type"]
        propertyDataTable.stories = item["stories"]
        propertyDataTable.lot_square_footage = item["lot_square_footage"]
        propertyDataTable.lot_acres = item["lot_acres"]
        propertyDataTable.current_balance_due = item["current_balance_due"]
        propertyDataTable.tax_year_1 = item["tax_year_1"]
        propertyDataTable.tax_year_2 = item["tax_year_2"]
        propertyDataTable.tax_year_3 = item["tax_year_3"]
        propertyDataTable.tax_year_1_assessed = item["tax_year_1_assessed"]
        propertyDataTable.tax_year_2_assessed = item["tax_year_2_assessed"]
        propertyDataTable.tax_year_3_assessed = item["tax_year_3_assessed"]
        propertyDataTable.sale1_price = item["sale1_price"]
        propertyDataTable.sale1_date = item["sale1_date"]
        propertyDataTable.sale2_date = item["sale2_date"]
        propertyDataTable.sale2_price = item["sale2_price"]

        try:
            session.add(propertyDataTable)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

        return item
