I'm trying to import sqlalchemy library
import pandas as pd
import sqlalchemy as sa
engine = sa.create_engine('postgresql://{login}:{password}#database.org:5432/database'.format(login = 'login', password='password'))
engine_ch = sa.create_engine('clickhouse://database#ip')
but this ends up with an error
ImportError: cannot import name '_literal_as_label_reference'
I've checked this post, installed the right versions of both libraries, there was no errors during installation, but the import error remains
sqlalchemy version is 1.3.24, sqlalchemy-clickhouse version is 0.1.5.post0
I have a code below which connects to a MongoDB database and selects the specified JSON file, flattens it and exports it as a CSV.
So my problem is some of the JSON files in the MongoDB databases are absolutely huge with thousands of rows, so what I am trying to do is filter table down so that I only bring in data from the last 7 days.
from pymongo import MongoClient
import pandas as pd
import os, uuid, sys
import collections
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from pandas import json_normalize
mongo_client = MongoClient("connstring")
db = mongo_client.nhdb
table = db.Report
document = table.find()
mongo_docs = list(document)
mongo_docs = json_normalize(mongo_docs)
mongo_docs.to_csv("Report.csv", sep = ",", index=False)
Any help will be much appreciated.
Note: I know a way to do it in Azure Data Factory using the expression below, however, I am not sure how to go about it in Python
{"createdDatetime":{$gt: ISODate("#{adddays(utcnow(),-7)}")}}
In python create a datetime object to use as a filter; for example this shows the last 7 days:
from datetime import datetime, timedelta
document = table.find({'createdDatetime': {'$gt': datetime.utcnow() - timedelta(days=7)}})
I am running a job on a Databricks notebook that connects to my MySQL database on AWS RDS and inserts data. When I ran the notebook manually, I was able to connect to the endpoint URL and insert my data. Now I have my notebook running on a corn job every 30 min. The first job was successful, but every job after that failed with this error:
MySQLInterfaceError: MySQL server has gone away
I then tried to run my job manually again and I get the same error on tweets_pdf.to_sql(name='tweets', con=engine, if_exists = 'replace', index=False). This is the code that is running in the Databricks notebook:
from __future__ import print_function
import sys
import pymysql
import os
import re
import mysql.connector
from sqlalchemy import create_engine
from operator import add
import pandas as pd
from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
import json
import boto
import boto3
from boto.s3.key import Key
import boto.s3.connection
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import *
# Get AWS credentials
aws_key_id = os.environ.get("accesskeyid")
aws_key = os.environ.get("secretaccesskey")
# Start spark instance
conf = SparkConf().setAppName("first")
sc = SparkContext.getOrCreate(conf=conf)
# Allow spark to access my S3 bucket
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId",aws_key_id)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey",aws_key)
config_dict = {"fs.s3n.awsAccessKeyId":aws_key_id,
"fs.s3n.awsSecretAccessKey":aws_key}
bucket = "diego-twitter-stream-sink"
prefix = "/2020/*/*/*/*"
filename = "s3n://{}/{}".format(bucket, prefix)
# Convert file from S3 bucket to an RDD
rdd = sc.hadoopFile(filename,
'org.apache.hadoop.mapred.TextInputFormat',
'org.apache.hadoop.io.Text',
'org.apache.hadoop.io.LongWritable',
conf=config_dict)
spark = SparkSession.builder.appName("PythonWordCount").config("spark.files.overwrite","true").getOrCreate()
# Map RDD to specific columns
df = spark.read.json(rdd.map(lambda x: x[1]))
features_of_interest = ["ts", "text", "sentiment"]
df_reduce = df.select(features_of_interest)
# Convert RDD to Pandas Dataframe
tweets_pdf = df_reduce.toPandas()
engine = create_engine(f'mysql+mysqlconnector://admin:{os.environ.get("databasepassword")}#{os.environ.get("databasehost")}/twitter-data')
tweets_pdf.to_sql(name='tweets', con=engine, if_exists = 'replace', index=False)
Does anyone know what could be the issue? All of the database config variables are correct, the S3 bucket that PySpark is streaming from has data, and the AWS RDS is nowhere near any capacity or compute limits.
A default max_allowed_packets (4M) can cause this issue
In Python version 2.7.6
Pandas version 0.18.1
MySQL 5.7
import MySQLdb as dbapi
import sys
import csv
import os
import sys, getopt
import pandas as pd
df = pd.read_csv('test.csv')
rows = df.apply(tuple, 1).unique().tolist()
db=dbapi.connect(host=dbServer,user=dbUser,passwd=dbPass)
cur=db.cursor()
for (CLIENT_ID,PROPERTY_ID,YEAR) in rows:
INSERT_QUERY=("INSERT INTO {DATABASE}.TEST SELECT * FROM {DATABASE}_{CLIENT_ID}.TEST WHERE PROPERTY_ID = {PROPERTY_ID} AND YEAR = {YEAR};".format(
CLIENT_ID=CLIENT_ID,
PROPERTY_ID=PROPERTY_ID,
YEAR=YEAR,
DATABASE=DATABASE
))
print INSERT_QUERY
cur.execute(INSERT_QUERY)
db.query(INSERT_QUERY)
This will print out the query I am looking for, however, without successfully returning the results of INSERT INTO when I checked the results in MySQL
INSERT INTO test.TEST SELECT * FROM test_1.TEST WHERE PROPERTY_ID = 1 AND YEAR = 2015;
However, if I just copy and paste this MySQL query into MySQL GUI, it will execute without any problem. Could any guru enlighten?
I also tried the following
cur.execute(INSERT_QUERY, multi=True)
Returns an error
TypeError: execute() got an unexpected keyword argument 'multi'
The answer here is we need to use "from mysql.connector" and a db.commit(). Here is a good example
http://www.mysqltutorial.org/python-mysql-insert/
import MySQLdb as dbapi
import mysql.connector
import sys
import csv
import os
import sys, getopt
import pandas as pd
df = pd.read_csv('test.csv')
rows = df.apply(tuple, 1).unique().tolist()
db=dbapi.connect(host=dbServer,user=dbUser,passwd=dbPass)
cur=db.cursor()
conn = mysql.connector.connect(host=dbServer,user=dbUser,port=dbPort,password=dbPass)
cursor=conn.cursor()
for (CLIENT_ID,PROPERTY_ID,YEAR) in rows:
INSERT_QUERY=("INSERT INTO {DATABASE}.TEST SELECT * FROM {DATABASE}_{CLIENT_ID}.TEST WHERE PROPERTY_ID = {PROPERTY_ID} AND YEAR = {YEAR};".format(
CLIENT_ID=CLIENT_ID,
PROPERTY_ID=PROPERTY_ID,
YEAR=YEAR,
DATABASE=DATABASE
))
print INSERT_QUERY
cursor.execute(INSERT_QUERY)
conn.commit()
Only by having the commit, the database/ table changes will be accepted
I was using mysql-connector pool, trying to insert a new row into a table, and got the same problem. The version info: mysql-8, python3.7.
The solution is to add connection.commit at last even you didn't start transaction.
The documentation for Pandas has numerous examples of best practices for working with data stored in various formats.
However, I am unable to find any good examples for working with databases like MySQL for example.
Can anyone point me to links or give some code snippets of how to convert query results using mysql-python to data frames in Pandas efficiently ?
As Wes says, io/sql's read_sql will do it, once you've gotten a database connection using a DBI compatible library. We can look at two short examples using the MySQLdb and cx_Oracle libraries to connect to Oracle and MySQL and query their data dictionaries. Here is the example for cx_Oracle:
import pandas as pd
import cx_Oracle
ora_conn = cx_Oracle.connect('your_connection_string')
df_ora = pd.read_sql('select * from user_objects', con=ora_conn)
print 'loaded dataframe from Oracle. # Records: ', len(df_ora)
ora_conn.close()
And here is the equivalent example for MySQLdb:
import MySQLdb
mysql_cn= MySQLdb.connect(host='myhost',
port=3306,user='myusername', passwd='mypassword',
db='information_schema')
df_mysql = pd.read_sql('select * from VIEWS;', con=mysql_cn)
print 'loaded dataframe from MySQL. records:', len(df_mysql)
mysql_cn.close()
For recent readers of this question: pandas have the following warning in their docs for version 14.0:
Warning: Some of the existing functions or function aliases have been
deprecated and will be removed in future versions. This includes:
tquery, uquery, read_frame, frame_query, write_frame.
And:
Warning: The support for the ‘mysql’ flavor when using DBAPI connection objects has
been deprecated. MySQL will be further supported with SQLAlchemy
engines (GH6900).
This makes many of the answers here outdated. You should use sqlalchemy:
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine('dialect://user:pass#host:port/schema', echo=False)
f = pd.read_sql_query('SELECT * FROM mytable', engine, index_col = 'ID')
For the record, here is an example using a sqlite database:
import pandas as pd
import sqlite3
with sqlite3.connect("whatever.sqlite") as con:
sql = "SELECT * FROM table_name"
df = pd.read_sql_query(sql, con)
print df.shape
I prefer to create queries with SQLAlchemy, and then make a DataFrame from it. SQLAlchemy makes it easier to combine SQL conditions Pythonically if you intend to mix and match things over and over.
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Table
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from pandas import DataFrame
import datetime
# We are connecting to an existing service
engine = create_engine('dialect://user:pwd#host:port/db', echo=False)
Session = sessionmaker(bind=engine)
session = Session()
Base = declarative_base()
# And we want to query an existing table
tablename = Table('tablename',
Base.metadata,
autoload=True,
autoload_with=engine,
schema='ownername')
# These are the "Where" parameters, but I could as easily
# create joins and limit results
us = tablename.c.country_code.in_(['US','MX'])
dc = tablename.c.locn_name.like('%DC%')
dt = tablename.c.arr_date >= datetime.date.today() # Give me convenience or...
q = session.query(tablename).\
filter(us & dc & dt) # That's where the magic happens!!!
def querydb(query):
"""
Function to execute query and return DataFrame.
"""
df = DataFrame(query.all());
df.columns = [x['name'] for x in query.column_descriptions]
return df
querydb(q)
MySQL example:
import MySQLdb as db
from pandas import DataFrame
from pandas.io.sql import frame_query
database = db.connect('localhost','username','password','database')
data = frame_query("SELECT * FROM data", database)
The same syntax works for Ms SQL server using podbc also.
import pyodbc
import pandas.io.sql as psql
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=servername;DATABASE=mydb;UID=username;PWD=password')
cursor = cnxn.cursor()
sql = ("""select * from mytable""")
df = psql.frame_query(sql, cnxn)
cnxn.close()
And this is how you connect to PostgreSQL using psycopg2 driver (install with "apt-get install python-psycopg2" if you're on Debian Linux derivative OS).
import pandas.io.sql as psql
import psycopg2
conn = psycopg2.connect("dbname='datawarehouse' user='user1' host='localhost' password='uberdba'")
q = """select month_idx, sum(payment) from bi_some_table"""
df3 = psql.frame_query(q, conn)
For Sybase the following works (with http://python-sybase.sourceforge.net)
import pandas.io.sql as psql
import Sybase
df = psql.frame_query("<Query>", con=Sybase.connect("<dsn>", "<user>", "<pwd>"))
pandas.io.sql.frame_query is deprecated. Use pandas.read_sql instead.
import the module
import pandas as pd
import oursql
connect
conn=oursql.connect(host="localhost",user="me",passwd="mypassword",db="classicmodels")
sql="Select customerName, city,country from customers order by customerName,country,city"
df_mysql = pd.read_sql(sql,conn)
print df_mysql
That works just fine and using pandas.io.sql frame_works (with the deprecation warning). Database used is the sample database from mysql tutorial.
This should work just fine.
import MySQLdb as mdb
import pandas as pd
con = mdb.connect(‘127.0.0.1’, ‘root’, ‘password’, ‘database_name’);
with con:
cur = con.cursor()
cur.execute(“select random_number_one, random_number_two, random_number_three from randomness.a_random_table”)
rows = cur.fetchall()
df = pd.DataFrame( [[ij for ij in i] for i in rows] )
df.rename(columns={0: ‘Random Number One’, 1: ‘Random Number Two’, 2: ‘Random Number Three’}, inplace=True);
print(df.head(20))
This helped for me for connecting to AWS MYSQL(RDS) from python 3.x based lambda function and loading into a pandas DataFrame
import json
import boto3
import pymysql
import pandas as pd
user = 'username'
password = 'XXXXXXX'
client = boto3.client('rds')
def lambda_handler(event, context):
conn = pymysql.connect(host='xxx.xxxxus-west-2.rds.amazonaws.com', port=3306, user=user, passwd=password, db='database name', connect_timeout=5)
df= pd.read_sql('select * from TableName limit 10',con=conn)
print(df)
# TODO implement
#return {
# 'statusCode': 200,
# 'df': df
#}
For Postgres users
import psycopg2
import pandas as pd
conn = psycopg2.connect("database='datawarehouse' user='user1' host='localhost' password='uberdba'")
customers = 'select * from customers'
customers_df = pd.read_sql(customers,conn)
customers_df