I'm pretty much new to python and SQL and I am trying some coding tasks. I have a SQL query in the format of the below, where I return a set of values using python and SQL. What I would like to do using python is to define the variable "X as User_Name" and parse this to a text file within my local linux directory (for example in a file called Usernames.txt).
query = """\
Select
X as User_Name,
Y,
Z
FROM
tbl1
WHERE ...
AND ... """
In the below snippets I attempt to write this to the text file, but does not seem to work for me
cursor = connection.cursor()
....
fo = open ('/localDrive/Usernames.txt', 'a')
for row in cur:
rows = list(row)
fo.write(rows[0])
....
fo.close()
The issue is sometimes there are more than 1 row returned so I'd need to store all usernames in that text file. I'd like then to be able to check against this text file and not return SQL Output if the "X as User_Name" already exists within the text file (Usernames.txt) This is something I'm not sure how to do
Just use Pickle to save your data, with dictionaries and sets to compare them. Pickle can save and load Python objects with no parsing required.
If you want a human readable output as well, just print the objects to the screen or file.
e.g. (untested)
import pickle
from pathlib import Path
pickle_path = Path("data.pickle")
fields = ('field_1', 'field_2', 'field_3')
def add_fields(data_list):
# Return a list of dictionaries
return [dict(zip(fields, row)) for row in data_list]
def get_unique_values(dict_list, key):
# Return a set of key field values
return set(dl[key] for dl in dict_list)
def get_data_subset(dict_list, key_field, keys):
# Return records where key_field contains values in keys
return [dl for dl in dict_list if dl[key_field] in keys]
# ...
# Create DB connection etc.
# ...
# ...
cursor = connection.cursor()
cursor.execute(query)
results = cursor.fetchall()
# De-serialise the local data if it exists
if pickle_path.exists():
with pickle_path.open("rb") as pp:
prev_results = pickle.load(pp)
else:
prev_results = []
results = add_fields(results)
keys = get_unique_values(results, 'field_1')
prev_keys = get_unique_values(prev_results, 'field_1')
# All keys
all_keys = keys | prev_keys
# in both sets
existing_keys = keys ^ prev_keys
# just in prev
deleted_keys = prev_keys - keys
# just the new values in keys
new_keys = keys - prev_keys
# example: handle deleted data
temp_dl = []
for row in prev_results:
if row['field_1'] not in deleted_keys:
temp_dl.append(row)
prev_results = temp_dl
# example: handle new keys
new_data = get_data_subset(results, 'field_1', new_keys)
prev_results.extend(new_data)
# Serialise the local data
if pickle_path.exists():
pickle_path.unlink()
with pickle_path.open("wb") as pp:
pickle.dump(prev_results, pp)
if len(new_data):
print("New records added")
for row in new_data:
print(row)
What i get from api:
"name":"reports"
"col_type":"array<struct<imageUrl:string,reportedBy:string>>"
So in hive schema I got:
reports array<struct<imageUrl:string,reportedBy:string>>
Note: I got hive array schema as string from api
My target:
bigquery.SchemaField("reports", "RECORD", mode="NULLABLE",
fields=(
bigquery.SchemaField('imageUrl', 'STRING'),
bigquery.SchemaField('reportedBy', 'STRING')
)
)
Note: I would like to create universal code that can handle when i receive any number of struct inside of the array.
Any tips are welcome.
I tried creating a script that parses your input which is reports array<struct<imageUrl:string,reportedBy:string>>. This converts your input to a dictionary that could be used as schema when creating a table. The main idea of the apporach is instead of using SchemaField(), you can create a dictionary which is much easier than creating SchemaField() objects with parameters using your example input.
NOTE: The script is only tested based on your input and it can parse more fields if added in struct<.
import re
from google.cloud import bigquery
def is_even(number):
if (number % 2) == 0:
return True
else:
return False
def clean_string(str_value):
return re.sub(r'[\W_]+', '', str_value)
def convert_to_bqdict(api_string):
"""
This only works for a struct with multiple fields
This could give you an idea on constructing a schema dict for BigQuery
"""
num_even = True
main_dict = {}
struct_dict = {}
field_arr = []
schema_arr = []
# Hard coded this since not sure what the string will look like if there are more inputs
init_struct = sample.split(' ')
main_dict["name"] = init_struct[0]
main_dict["type"] = "RECORD"
main_dict["mode"] = "NULLABLE"
cont_struct = init_struct[1].split('<')
num_elem = len(cont_struct)
# parse fields inside of struct<
for i in range(0,num_elem):
num_even = is_even(i)
# fields are seen on even indices
if num_even and i != 0:
temp = list(filter(None,cont_struct[i].split(','))) # remove blank elements
for elem in temp:
fields = list(filter(None,elem.split(':')))
struct_dict["name"] = clean_string(fields[0])
# "type" works for STRING as of the moment refer to
# https://cloud.google.com/bigquery/docs/schemas#standard_sql_data_types
# for the accepted data types
struct_dict["type"] = clean_string(fields[1]).upper()
struct_dict["mode"] = "NULLABLE"
field_arr.append(struct_dict)
struct_dict = {}
main_dict["fields"] = field_arr # assign dict to array of fields
schema_arr.append(main_dict)
return schema_arr
sample = "reports array<struct<imageUrl:string,reportedBy:string,newfield:bool>>"
bq_dict = convert_to_bqdict(sample)
client = bigquery.Client()
project = client.project
dataset_ref = bigquery.DatasetReference(project, '20211228')
table_ref = dataset_ref.table("20220203")
table = bigquery.Table(table_ref, schema=bq_dict)
table = client.create_table(table)
Output:
I am currently trying to append to the output list in my code the id of the query result. I can get it to do one of the ids but it will override the first one how can I change my code to allow any amount of looping to the output.append(q.id)
Here is the code:
#app.route('/new-mealplan', methods=['POST'])
def create_mealplan():
data = request.get_json()
recipes = data['recipes']
output = []
for recipe in recipes:
try:
query = Recipes.query.filter(func.lower(Recipes.recipe_name) == func.lower(recipe)).all()
# print(recipe)
if query:
query = Recipes.query.filter(func.lower(Recipes.recipe_name) == func.lower(recipe)).all()
for q in query:
output.append(q.id)
finally:
return jsonify({"data" : output})
To fix this I removed the
Try and Finally blocks.
Then returned after the for-loop was completed.
I have the following code below which takes a specific type of .xml file and converts to a more reader friendly .csv file.
You can see what the reader friendly .csv looks like in the two linked .csv file output links below.
A few things I would like to add/change, but need help with altering my code to do so.
1.) I'd like to add one decimal position to column H so instead of reading say 20, I want it to read as 2.0. I'd also like to add one decimal position to any row that has CF in it, i.e. cell AE1, AG1, AI1, etc.
2.) I'd like to insert a row above row 1 where I can type in row headers. For example, H1 reads as T2. I want to insert a row above H1 that will say Accel./Decel. Time [s]. I'd like to be able to do this for every row in the spreadsheet.
3.) I'd like to rearrange the column order of the outputted .csv file. I'd like to move columns AD - BQ to where column I is.
Any help with any of these would be greatly appreciated. Thanks!
.csv file output
.csv file output1
from lxml import etree
import os
import pandas as pd
import collections
ProcessRecipe = collections.namedtuple('ProcessRecipe', ['recipe_number', 'recipe_steps'])
class File:
def __init__(self, fp):
fp_wo_quotes = fp.replace('"', '')
if os.path.isfile(fp_wo_quotes):
self.fp = fp_wo_quotes
else:
self.fp = None
#property
def parent_folder(self):
return os.path.dirname(self.fp)
def new_fp(self, file_name):
return os.path.join(self.parent_folder, file_name)
class MPCProcessRecipe:
RECIPE_TYPE = 'MPCProcessRecipe'
def __init__(self, fp):
"""
constructor
:param fp: absolute file path to a recipe file
"""
self.tree = etree.parse(fp)
#staticmethod
def get_columns(step_data):
"""
Collect attributes from a step data element for CSV column header
:param step_data: step data element
:return: column names as a list of strings
"""
_columns = ['Step']
for data_element in step_data.xpath('./Data'):
# <Data name="Repeat">0</Data> // without index
# <Data name="MR" index="1"/> // with index
_name = data_element.get('name')
_index = data_element.get('index')
column_name = _name + _index if _index else _name
_columns.append(column_name)
# print('columns', _columns)
return _columns
#staticmethod
def get_data(step_data_list):
data = []
for step_data in step_data_list:
name = step_data.tag
step = step_data.get('step')
values = []
for _data_element in step_data.xpath('./Data'):
_text = _data_element.text
if _text:
values.append(_text)
else:
values.append('')
datum = [name + step] + values
# print('datum', datum)
data.append(datum)
return data
#property
def process_recipe_list(self):
"""
Loop through <Recipe> elements and populate DataFrames
:return: list of ProcessRecipe namedtuple
"""
pr_list = []
for recipe in self.tree.xpath('/Recipes/Recipe'):
# <Recipe number="994" type="MPCProcessRecipe">
data =[]
if recipe.get('type') == self.RECIPE_TYPE:
recipe_number = recipe.get('number')
# <PreStepData step="1">
# <StepData step="1">
pre_step_data_list = recipe.xpath('./RecipeData/PreStepData')
step_data_list = recipe.xpath('./RecipeData/StepData')
# create columns from the first entry of StepData
columns = MPCProcessRecipe.get_columns(step_data_list[0])
if len(pre_step_data_list) > 0:
data += MPCProcessRecipe.get_data(pre_step_data_list)
data += MPCProcessRecipe.get_data(step_data_list)
df = pd.DataFrame(data=data, columns=columns)
pr = ProcessRecipe(recipe_number, df)
pr_list.append(pr)
return pr_list
def convert_xml_files_to_csv_files():
fp_input = input('D&D a recipe file: ')
file_obj = File(fp_input)
print('File Path:', file_obj.fp)
rcp = MPCProcessRecipe(file_obj.fp)
pr_list = rcp.process_recipe_list
for pr in pr_list:
out_fp = file_obj.new_fp(pr.recipe_number + '.csv')
pr.recipe_steps.to_csv(out_fp)
if __name__ == '__main__':
convert_xml_files_to_csv_files()
I have to insert 8000+ records into a SQLite database using Django's ORM. This operation needs to be run as a cronjob about once per minute.
At the moment I'm using a for loop to iterate through all the items and then insert them one by one.
Example:
for item in items:
entry = Entry(a1=item.a1, a2=item.a2)
entry.save()
What is an efficient way of doing this?
Edit: A little comparison between the two insertion methods.
Without commit_manually decorator (11245 records):
nox#noxdevel marinetraffic]$ time python manage.py insrec
real 1m50.288s
user 0m6.710s
sys 0m23.445s
Using commit_manually decorator (11245 records):
[nox#noxdevel marinetraffic]$ time python manage.py insrec
real 0m18.464s
user 0m5.433s
sys 0m10.163s
Note: The test script also does some other operations besides inserting into the database (downloads a ZIP file, extracts an XML file from the ZIP archive, parses the XML file) so the time needed for execution does not necessarily represent the time needed to insert the records.
You want to check out django.db.transaction.commit_manually.
http://docs.djangoproject.com/en/dev/topics/db/transactions/#django-db-transaction-commit-manually
So it would be something like:
from django.db import transaction
#transaction.commit_manually
def viewfunc(request):
...
for item in items:
entry = Entry(a1=item.a1, a2=item.a2)
entry.save()
transaction.commit()
Which will only commit once, instead at each save().
In django 1.3 context managers were introduced.
So now you can use transaction.commit_on_success() in a similar way:
from django.db import transaction
def viewfunc(request):
...
with transaction.commit_on_success():
for item in items:
entry = Entry(a1=item.a1, a2=item.a2)
entry.save()
In django 1.4, bulk_create was added, allowing you to create lists of your model objects and then commit them all at once.
NOTE the save method will not be called when using bulk create.
>>> Entry.objects.bulk_create([
... Entry(headline="Django 1.0 Released"),
... Entry(headline="Django 1.1 Announced"),
... Entry(headline="Breaking: Django is awesome")
... ])
In django 1.6, transaction.atomic was introduced, intended to replace now legacy functions commit_on_success and commit_manually.
from the django documentation on atomic:
atomic is usable both as a decorator:
from django.db import transaction
#transaction.atomic
def viewfunc(request):
# This code executes inside a transaction.
do_stuff()
and as a context manager:
from django.db import transaction
def viewfunc(request):
# This code executes in autocommit mode (Django's default).
do_stuff()
with transaction.atomic():
# This code executes inside a transaction.
do_more_stuff()
Bulk creation is available in Django 1.4:
https://django.readthedocs.io/en/1.4/ref/models/querysets.html#bulk-create
Have a look at this. It's meant for use out-of-the-box with MySQL only, but there are pointers on what to do for other databases.
You might be better off bulk-loading the items - prepare a file and use a bulk load tool. This will be vastly more efficient than 8000 individual inserts.
To answer the question particularly with regard to SQLite, as asked, while I have just now confirmed that bulk_create does provide a tremendous speedup there is a limitation with SQLite: "The default is to create all objects in one batch, except for SQLite where the default is such that at maximum 999 variables per query is used."
The quoted stuff is from the docs--- A-IV provided a link.
What I have to add is that this djangosnippets entry by alpar also seems to be working for me. It's a little wrapper that breaks the big batch that you want to process into smaller batches, managing the 999 variables limit.
You should check out DSE. I wrote DSE to solve these kinds of problems ( massive insert or updates ). Using the django orm is a dead-end, you got to do it in plain SQL and DSE takes care of much of that for you.
Thomas
def order(request):
if request.method=="GET":
cust_name = request.GET.get('cust_name', '')
cust_cont = request.GET.get('cust_cont', '')
pincode = request.GET.get('pincode', '')
city_name = request.GET.get('city_name', '')
state = request.GET.get('state', '')
contry = request.GET.get('contry', '')
gender = request.GET.get('gender', '')
paid_amt = request.GET.get('paid_amt', '')
due_amt = request.GET.get('due_amt', '')
order_date = request.GET.get('order_date', '')
print(order_date)
prod_name = request.GET.getlist('prod_name[]', '')
prod_qty = request.GET.getlist('prod_qty[]', '')
prod_price = request.GET.getlist('prod_price[]', '')
print(prod_name)
print(prod_qty)
print(prod_price)
# insert customer information into customer table
try:
# Insert Data into customer table
cust_tab = Customer(customer_name=cust_name, customer_contact=cust_cont, gender=gender, city_name=city_name, pincode=pincode, state_name=state, contry_name=contry)
cust_tab.save()
# Retrive Id from customer table
custo_id = Customer.objects.values_list('customer_id').last() #It is return
Tuple as result from Queryset
custo_id = int(custo_id[0]) #It is convert the Tuple in INT
# Insert Data into Order table
order_tab = Orders(order_date=order_date, paid_amt=paid_amt, due_amt=due_amt, customer_id=custo_id)
order_tab.save()
# Insert Data into Products table
# insert multiple data at a one time from djanog using while loop
i=0
while(i<len(prod_name)):
p_n = prod_name[i]
p_q = prod_qty[i]
p_p = prod_price[i]
# this is checking the variable, if variable is null so fill the varable value in database
if p_n != "" and p_q != "" and p_p != "":
prod_tab = Products(product_name=p_n, product_qty=p_q, product_price=p_p, customer_id=custo_id)
prod_tab.save()
i=i+1
I recommend using plain SQL (not ORM) you can insert multiple rows with a single insert:
insert into A select from B;
The select from B portion of your sql could be as complicated as you want it to get as long as the results match the columns in table A and there are no constraint conflicts.
def order(request):
if request.method=="GET":
# get the value from html page
cust_name = request.GET.get('cust_name', '')
cust_cont = request.GET.get('cust_cont', '')
pincode = request.GET.get('pincode', '')
city_name = request.GET.get('city_name', '')
state = request.GET.get('state', '')
contry = request.GET.get('contry', '')
gender = request.GET.get('gender', '')
paid_amt = request.GET.get('paid_amt', '')
due_amt = request.GET.get('due_amt', '')
order_date = request.GET.get('order_date', '')
prod_name = request.GET.getlist('prod_name[]', '')
prod_qty = request.GET.getlist('prod_qty[]', '')
prod_price = request.GET.getlist('prod_price[]', '')
# insert customer information into customer table
try:
# Insert Data into customer table
cust_tab = Customer(customer_name=cust_name, customer_contact=cust_cont, gender=gender, city_name=city_name, pincode=pincode, state_name=state, contry_name=contry)
cust_tab.save()
# Retrive Id from customer table
custo_id = Customer.objects.values_list('customer_id').last() #It is return Tuple as result from Queryset
custo_id = int(custo_id[0]) #It is convert the Tuple in INT
# Insert Data into Order table
order_tab = Orders(order_date=order_date, paid_amt=paid_amt, due_amt=due_amt, customer_id=custo_id)
order_tab.save()
# Insert Data into Products table
# insert multiple data at a one time from djanog using while loop
i=0
while(i<len(prod_name)):
p_n = prod_name[i]
p_q = prod_qty[i]
p_p = prod_price[i]
# this is checking the variable, if variable is null so fill the varable value in database
if p_n != "" and p_q != "" and p_p != "":
prod_tab = Products(product_name=p_n, product_qty=p_q, product_price=p_p, customer_id=custo_id)
prod_tab.save()
i=i+1
return HttpResponse('Your Record Has been Saved')
except Exception as e:
return HttpResponse(e)
return render(request, 'invoice_system/order.html')