Extracting SQL Server table data to parquet file - python

I'm trying to extract one of the SQL Server table data to parquet file format using sqlalchemy, pandas and fastparquet modules, but end up with an exception. Appreciate some help on this, I'm trying this one on a simple table with one column of non null integer type.
Code:
import sqlalchemy as sa
import pandas as pd
import urllib as ul
import fastparquet as fp
def main():
sqlInstance = 'sqlInstance'
database = 'database'
tableName = 'Numbers'
props = ul.parse.quote_plus("DRIVER={SQL Server Native Client 11.0};"
"SERVER=" + sqlInstance + ";"
"DATABASE=" + database + ";"
"Trusted_Connection=yes;")
con = sa.create_engine("mssql+pyodbc:///?odbc_connect={}".format(props))
fetch_batch_size = 1000
metadata = sa.schema.MetaData(bind=con)
table = sa.Table(tableName, metadata, autoload=True)
# Generate pandas/python compatible datatype mapping
map = {}
data_type_map_lookup = {
'int64': ['smallint', 'tinyint', 'integer'],
'float': ['bigint', 'float', 'real'],
'str': ['char', 'nchar', 'nvarchar', 'nvarchar(max)', 'uniqueidentifier', 'varchar(n)', 'varchar(max)'],
'datetime64[ns]': ['date', 'datetime', 'smalldatetime'],
'bytes': ['binary', 'varbinary', 'varbinary(max)'],
'bool': ['bit']
}
for col in table.columns:
for key, value in data_type_map_lookup.items():
types = data_type_map_lookup[key]
if list(filter(str(col.type).lower().startswith, types)):
if col.nullable and key == 'int64':
map[col.name] = 'float'
else:
map[col.name] = key
#Fetch data
output = table.select().execution_options(stream_results=True).execute()
append = False
while True:
batch = output.fetchmany(fetch_batch_size)
if not len(batch) > 0:
break
else:
df = (pd.DataFrame(data=batch, columns=map)).astype(dtype=map)
print(df.to_string()) # Prints good
fp.write("C:\\temp\\test.parquet", df, write_index=False, compression=False, append=append)
append = True
if __name__ == "__main__":
main()
Exception:
TypeError: Expected unicode, got quoted_name
Exception ignored in: 'fastparquet.cencoding.write_list'
Traceback (most recent call last):
File "C:\...lib\site-packages\fastparquet\writer.py", line 1488, in write_thrift
return f.write(obj.to_bytes())
TypeError: Expected unicode, got quoted_name
TypeError: Expected unicode, got quoted_name
Exception ignored in: 'fastparquet.cencoding.write_list'
Traceback (most recent call last):
File "C:\...lib\site-packages\fastparquet\writer.py", line 1488, in write_thrift
return f.write(obj.to_bytes())
TypeError: Expected unicode, got quoted_name

Related

Convert pandas dataframe to .hyper extract

I have an SQL output in a pandas dataframe, that I would like to first convert to a .hyper Tableau extract, and then publish to Tableau server via the Extract API. When I run my code(below), I get the error: 'module' object is not callable for tdefile = tableausdk.HyperExtract(outfilename). I believe my code is correct, but maybe modules were installed incorrectly? Has anyone seen this error?
print("Importing modules...")
import pandas as pd
import pyodbc
import re
import numpy as np
import cx_Oracle
import smtplib
import schedule
import time
import win32com.client as win32
import tableauserverclient as TSC
import os
import tableausdk
from pandleau import *
from tableausdk import *
from tableausdk.HyperExtract import *
print("Done importing modules.")
server = x
db = y
conn_sql = pyodbc.connect(#fill in your connection data)
### sql query - change from getdate() - 4 to TD# ##
sql_1 = """
select
* from test
"""
df = pd.read_sql_query(sql_1, conn_sql)
df.head()
def job(df, outfilename):
if os.path.isfile(outfilename):
os.remove(outfilename)
os.remove('DataExtract.log')
try:
tdefile = tableausdk.HyperExtract(outfilename)
except:
#os.remove(outfilename)
os.system('del ' + outfilename)
os.system('del DataExtract.log')
tdefile = tableausdk.HyperExtract(outfilename)
# define the table definition
tableDef = tableausdk.TableDefinition()
# create a list of column names
colnames = df.columns
# create a list of column types
coltypes = df.dtypes
# create a dict for the field maps
# Define type maps
# Caveat: I am not including all of the possibilities here
fieldMap = {
'float64' : tde.Types.Type.DOUBLE,
'float32' : tde.Types.Type.DOUBLE,
'int64' : tde.Types.Type.DOUBLE,
'int32' : tde.Types.Type.DOUBLE,
'object': tde.Types.Type.UNICODE_STRING,
'bool' : tde.Types.Type.BOOLEAN,
'datetime64[ns]': tde.Types.Type.DATE,
}
# for each column, add the appropriate info the Table Definition
for i in range(0, len(colnames)):
cname = colnames[i] #header of column
coltype = coltypes[i] #pandas data type of column
ctype = fieldMap.get(str(coltype)) #get integer field type in Tableau Speak
tableDef.addColumn(cname, ctype)
# add the data to the table
with tdefile as extract:
table = extract.addTable("Extract", tableDef)
for r in range(0, df.shape[0]):
row = tde.Row(tableDef)
for c in range(0, len(coltypes)):
if df.iloc[r,c] is None:
row.setNull(c)
elif str(coltypes[c]) in ('float64', 'float32', 'int64', 'int32'):
try:
row.setDouble(c, df.iloc[r,c])
except:
row.setNull(c)
elif str(coltypes[c]) == 'object':
try:
row.setString(c, df.iloc[r,c])
except:
row.setNull(c)
elif str(coltypes[c]) == 'bool':
row.setBoolean(c, df.iloc[r,c])
elif str(coltypes[c]) == 'datetime64[ns]':
try:
row.setDate(c, df.iloc[r,c].year, df.iloc[r,c].month, df.iloc[r,c].day )
except:
row.setNull
else:
row.setNull(c)
# insert the row
table.insert(row)
tdefile.close()
#df_tableau = pandleau(df_1)
#df_tableau.set_spatial('SpatialDest', indicator=True)
#df_tableau.to_tableau('test.hyper', add_index=False)
job(df, 'test_1.hyper')

Get replicationLag in mongo with pyMongo

I am trying, to get replication-delay using db.rs.printSlaveReplicationInfo from python with pymongo, but not getting any proper way to do so.
I tried the following, but no help.
>>>from pymongo import MongoClient
>>>client = MongoClient()
>>>db = client.test_database
>>>db.rs.printSlaveReplicationInfo
Collection(Database(MongoClient([u'10.0.0.19:10006', u'10.0.0.68:10002']), u'xyz'), u'rs.printSlaveReplicationInfo')
db.rs.printSlaveReplicationInfo()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib64/python2.7/site-packages/pymongo/collection.py", line 2413, in __call__
self.__name.split(".")[-1])
TypeError: 'Collection' object is not callable. If you meant to call the 'printSlaveReplicationInfo' method on a 'Collection' object it is failing because no such method exists.
>>> db.rs
Collection(Database(MongoClient([u'10.0.0.19:10006', u'10.0.0.68:10002']), u'xyz'), u'rs')
Can anyone help with this? or how to do it?
Thanks in advance.
I found out the answer.Here is the complete code :
(Note: You need to have admin privileges to run this command.)
uri = "mongodb://usernamen:password#host:port/admin"
conn = pymongo.MongoClient(uri)
db = conn['admin']
db_stats = db.command({'replSetGetStatus' :1})
primary_optime = 0
secondary_optime = 0
for key in db_stats['members'] :
if key['stateStr'] == 'SECONDARY' :
secondary_optime = key['optimeDate']
if key['stateStr'] == 'PRIMARY' :
primary_optime =key['optimeDate']
print 'primary_optime : ' + str(primary_optime)
print 'secondary_optime : ' + str(secondary_optime)
seconds_lag = (primary_optime - secondary_optime ).total_seconds()
#total_seconds() userd to get the lag in seconds rather than datetime object
print 'secondary_lag : ' + str(seconds_lag)
optime reperesents the date,till which that mongo-node has data.
You can read more about it here :
https://docs.mongodb.com/manual/reference/command/replSetGetStatus/

TypeError: 'int' object is not subscriptable Where is int?

I got an error,TypeError: 'int' object is not subscriptable .
I wanna connect 2 excel data
to User model.
So my ideal output is
1|1|Blear|40|false|l|America|A|1
2|5|Tom|23|true|o|UK|A|3
3|9|Rose|52|false|m
4|10|Karen||||Singapore|C|2
For example,Rose data of user_id=3 is not in second excel, in that case being 2nd data empty is ok.I am thinking putting 2nd excel in dictionary type to User model.
I searched the errors I thought this part for data in data_dict was wrong, I changed it into for data in range(len(data_dict)) but same error happens.I really cannot understand where is wrong.How should I fix this?
Now views.py is
#coding:utf-8
from django.shortcuts import render
import xlrd
from .models import User
book = xlrd.open_workbook('../data/excel1.xlsx')
sheet = book.sheet_by_index(1)
def build_employee(employee):
if employee == 'leader':
return 'l'
if employee == 'manager':
return 'm'
if employee == 'others':
return 'o'
for row_index in range(sheet.nrows):
rows = sheet.row_values(row_index)
is_man = rows[4] != ""
emp = build_employee(rows[5])
user = User(user_id=rows[1], name_id=rows[2], name=rows[3],
age=rows[4],man=is_man,employee=emp)
user.save()
book2 = xlrd.open_workbook('../data/excel2.xlsx')
sheet2 = book2.sheet_by_index(0)
headers = sheet2.row_values(0)
large_item = None
data_dict = {}
for row_index in range(sheet2.nrows):
rows2 = sheet2.row_values(row_index)
large_item = rows2[1] or large_item
# Create dict with headers and row values
row_data = {}
for idx_col, value in enumerate(rows2):
header_value = headers[idx_col]
# Avoid to add empty column. A column in your example
if header_value:
row_data[headers[idx_col]] = value
# Add row_data to your data_dict with
data_dict[row_index] = row_data
for row_number, row_data in data_dict.items():
user1 = User.objects.filter(user_id = data['user_id']).exists()
if user1:
user1.__dict__.update(**data_dict)
user1.save()
Now Traceback is
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/Users/XXX/testapp/app/views.py", line 123, in <module>
user1 = User.objects.filter(user_id = row_data['user_id']).exists()
KeyError: 'user_id'
data is an integer. So calling data like a dict raises that expection.
>>> a=1
>>> a['a']
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'int' object is not subscriptable
Why is it an int ? Because you're iterating over data's keys:
>>> a={1: 'x', 2: 'c'}
>>> for i in a: print(i)
...
1
2
Try using items() as such:
>>> for key, value in a.items(): print(key, value)
...
1 x
2 c
Or, in your specific case:
for row_number, row_data in data_dict.items():
print(row_number, row_data)
See looping techniques for dict documentation for details.

python : error handling Ordered dict with unicode data

My script migrates data from MySQL to mongodb. It runs perfectly well when there are no unicode columns included. But throws me below error when OrgLanguages column is added.
mongoImp = dbo.insert_many(odbcArray)
File "/home/lrsa/.local/lib/python2.7/site-packages/pymongo/collection.py", line 711, in insert_many
blk.execute(self.write_concern.document)
File "/home/lrsa/.local/lib/python2.7/site-packages/pymongo/bulk.py", line 493, in execute
return self.execute_command(sock_info, generator, write_concern)
File "/home/lrsa/.local/lib/python2.7/site-packages/pymongo/bulk.py", line 319, in execute_command
run.ops, True, self.collection.codec_options, bwc)
bson.errors.InvalidStringData: strings in documents must be valid UTF-8: 'Portugu\xeas do Brasil, ?????, English, Deutsch, Espa\xf1ol latinoamericano, Polish'
My code:
import MySQLdb, MySQLdb.cursors, sys, pymongo, collections
odbcArray=[]
mongoConStr = '192.168.10.107:36006'
sqlConnect = MySQLdb.connect(host = "54.175.170.187", user = "testuser", passwd = "testuser", db = "testdb", cursorclass=MySQLdb.cursors.DictCursor)
mongoConnect = pymongo.MongoClient(mongoConStr)
sqlCur = sqlConnect.cursor()
sqlCur.execute("SELECT ID,OrgID,OrgLanguages,APILoginID,TransactionKey,SMTPSpeed,TimeZoneName,IsVideoWatched FROM organizations")
dbo = mongoConnect.eaedw.mysqlData
tuples = sqlCur.fetchall()
for tuple in tuples:
odbcArray.append(collections.OrderedDict(tuple))
mongoImp = dbo.insert_many(odbcArray)
sqlCur.close()
mongoConnect.close()
sqlConnect.close()
sys.exit()
Above script migraates data perfectly when tried without OrgLanguages column in the SELECT query.
To overcome this, I have tried to use the OrderedDict() in another way but gives me a different type of error
Changed Code:
for tuple in tuples:
doc = collections.OrderedDict()
doc['oid'] = tuple.OrgID
doc['APILoginID'] = tuple.APILoginID
doc['lang'] = unicode(tuple.OrgLanguages)
odbcArray.append(doc)
mongoImp = dbo.insert_many(odbcArray)
Error Received:
Traceback (most recent call last):
File "pymsql.py", line 19, in <module>
doc['oid'] = tuple.OrgID
AttributeError: 'dict' object has no attribute 'OrgID'
Your MySQL connection is returning characters in a different encoding than UTF-8, which is the encoding that all BSON strings must be in. Try your original code but pass charset='utf8' to MySQLdb.connect.

JSON sub for loop produces KeyError, but key exists

I'm trying to add the JSON output below into a dictionary, to be saved into a SQL database.
{'Parkirisca': [
{
'ID_Parkirisca': 2,
'zasedenost': {
'Cas': '2016-10-08 13:17:00',
'Cas_timestamp': 1475925420,
'ID_ParkiriscaNC': 9,
'P_kratkotrajniki': 350
}
}
]}
I am currently using the following code to add the value to a dictionary:
import scraperwiki
import json
import requests
import datetime
import time
from pprint import pprint
html = requests.get("http://opendata.si/promet/parkirisca/lpt/")
data = json.loads(html.text)
for carpark in data['Parkirisca']:
zas = carpark['zasedenost']
free_spaces = zas.get('P_kratkotrajniki')
last_updated = zas.get('Cas_timestamp')
parking_type = carpark.get('ID_Parkirisca')
if parking_type == "Avtomatizirano":
is_automatic = "Yes"
else:
is_automatic = "No"
scraped = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
savetodb = {
'scraped': scraped,
'id': carpark.get("ID_Parkirisca"),
'total_spaces': carpark.get("St_mest"),
'free_spaces': free_spaces,
'last_updated': last_updated,
'is_automatic': is_automatic,
'lon': carpark.get("KoordinataX_wgs"),
'lat': carpark.get("KoordinataY_wgs")
}
unique_keys = ['id']
pprint savetodb
However when I run this, it gets stuck at for zas in carpark["zasedenost"] and outputs the following error:
Traceback (most recent call last):
File "./code/scraper", line 17, in <module>
for zas in carpark["zasedenost"]:
KeyError: 'zasedenost'
I've been led to believe that zas is in fact now a string, rather than a dictionary, but I'm new to Python and JSON, so don't know what to search for to get a solution. I've also searched here on Stack Overflow for KeyErrror when key exist questions, but they didn't help, and I believe that this might be due to the fact that's a sub for loop.
Update: Now, when I swapped the double quotes for single quotes, I get the following error:
Traceback (most recent call last):
File "./code/scraper", line 17, in <module>
free_spaces = zas.get('P_kratkotrajniki')
AttributeError: 'unicode' object has no attribute 'get'
I fixed up your code:
Added required imports.
Fixed the pprint savetodb line which isn't valid Python.
Didn't try to iterate over carpark['zasedenost'].
I then added another pprint statement in the for loop to see what's in carpark when the KeyError occurs. From there, the error is clear. (Not all the elements in the array in your JSON contain the 'zasedenost' key.)
Here's the code I used:
import datetime
import json
from pprint import pprint
import time
import requests
html = requests.get("http://opendata.si/promet/parkirisca/lpt/")
data = json.loads(html.text)
for carpark in data['Parkirisca']:
pprint(carpark)
zas = carpark['zasedenost']
free_spaces = zas.get('P_kratkotrajniki')
last_updated = zas.get('Cas_timestamp')
parking_type = carpark.get('ID_Parkirisca')
if parking_type == "Avtomatizirano":
is_automatic = "Yes"
else:
is_automatic = "No"
scraped = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
savetodb = {
'scraped': scraped,
'id': carpark.get("ID_Parkirisca"),
'total_spaces': carpark.get("St_mest"),
'free_spaces': free_spaces,
'last_updated': last_updated,
'is_automatic': is_automatic,
'lon': carpark.get("KoordinataX_wgs"),
'lat': carpark.get("KoordinataY_wgs")
}
unique_keys = ['id']
pprint(savetodb)
And here's the output on the iteration where the KeyError occurs:
{u'A_St_Mest': None,
u'Cena_dan_Eur': None,
u'Cena_mesecna_Eur': None,
u'Cena_splosno': None,
u'Cena_ura_Eur': None,
u'ID_Parkirisca': 7,
u'ID_ParkiriscaNC': 72,
u'Ime': u'P+R Studenec',
u'Invalidi_St_mest': 9,
u'KoordinataX': 466947,
u'KoordinataX_wgs': 14.567929171694901,
u'KoordinataY': 101247,
u'KoordinataY_wgs': 46.05457609543313,
u'Opis': u'2,40 \u20ac /dan',
u'St_mest': 187,
u'Tip_parkirisca': None,
u'U_delovnik': u'24 ur (ponedeljek - petek)',
u'U_sobota': None,
u'U_splosno': None,
u'Upravljalec': u'JP LPT d.o.o.'}
Traceback (most recent call last):
File "test.py", line 14, in <module>
zas = carpark['zasedenost']
KeyError: 'zasedenost'
As you can see, the error is quite accurate. There's no key 'zasedenost' in the dictionary. If you look through your JSON, you'll see that's true for a number of the elements in that array.
I'd suggest a fix, but I don't know what you want to do in the case where this dictionary key is absent. Perhaps you want something like this:
zas = carpark.get('zasedenost')
if zas is not None:
free_spaces = zas.get('P_kratkotrajniki')
last_updated = zas.get('Cas_timestamp')
else:
free_spaces = None
last_updated = None

Categories