I'm trying to load data from MySQL into BigQuery. I'm using pandas, jaydebeapi, and load_table_from_dataframe.
While doing so, I get the error below:
>>> job = client.load_table_from_dataframe(chunk, table_id)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/aarpan_roy/.local/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 1993, in load_table_from_dataframe
parquet_compression=parquet_compression,
File "/home/aarpan_roy/.local/lib/python2.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 486, in dataframe_to_parquet
arrow_table = dataframe_to_arrow(dataframe, bq_schema)
File "/home/aarpan_roy/.local/lib/python2.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 450, in dataframe_to_arrow
bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
File "/home/aarpan_roy/.local/lib/python2.7/site-packages/google/cloud/bigquery/_pandas_helpers.py", line 224, in bq_to_arrow_array
return pyarrow.Array.from_pandas(series, type=arrow_type)
File "pyarrow/array.pxi", line 755, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 265, in pyarrow.lib.array
File "pyarrow/array.pxi", line 80, in pyarrow.lib._ndarray_to_array
TypeError: an integer is required
>>>
A couple of points:
My source table exists and has the schema below:
EMPID INTEGER,
EMPNAME VARCHAR,
STREETADRESS VARCHAR,
REGION VARCHAR,
STATE VARCHAR,
COUNTRY VARCHAR,
joining_date date,
last_update_date TIMESTAMP(6) -- to hold fractional seconds down to the millisecond
My target table also exists in BigQuery and below is the schema:
create table if not exists `Project.dataset.table_name`
(EMPID INT64,
EMPNAME STRING,
STREETADRESS STRING,
REGION STRING,
STATE STRING,
COUNTRY STRING,
joining_date DATE,
last_update_date TIMESTAMP
);
Below is the code I'm using:
import datetime
from google.cloud import bigquery
import pandas as pd
import jaydebeapi
import os
client = bigquery.Client()
table_id = "<project_id>.<dataset>.<target_table>"
database_host='<IP Address>'
database_user='<user id>'
database_password='<password>'
database_port='<port>'
database_db='<database_name>'
jclassname = "com.mysql.jdbc.Driver"
url = "jdbc:mysql://{host}:{port}/{database}".format(host=database_host, port=database_port, database=database_db)
driver_args = [database_user, database_password]
jars = ["/<Home_Dir>/script/jars/mysql-connector-java-5.1.45.jar"]
libs = None
cnx = jaydebeapi.connect(jclassname, url, driver_args, jars=jars, libs=libs)
query='select EMPID,EMPNAME,STREETADRESS,REGION,STATE,COUNTRY,joining_date,last_update_date from <table_name>'
cursor = cnx.cursor()
for chunk in pd.read_sql(query, cnx, coerce_float=True, params=None, parse_dates=None, columns=None,chunksize=500000):chunk.apply(lambda x: x.replace(u'\r', u' ').replace(u'\n', u' ') if isinstance(x, str) or isinstance(x, unicode) else x)
job = client.load_table_from_dataframe(chunk, table_id)
job.result()
Kindly help me get this issue resolved. I tried using LoadJobConfig as well, but the same error occurs.
Is that the same code you copied? I mean, what about the indentation of the statements?
for chunk in pd.read_sql(query, cnx, coerce_float=True, params=None, parse_dates=None, columns=None, chunksize=500000):
    chunk.apply(lambda x: x.replace(u'\r', u' ').replace(u'\n', u' ') if isinstance(x, str) or isinstance(x, unicode) else x)
    job = client.load_table_from_dataframe(chunk, table_id)
    job.result()
Fix the above line.
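For reference, a minimal sketch of that loop after the indentation fix. Two further guesses, not confirmed by the traceback: DataFrame.apply passes whole columns, so the isinstance check never matches and its result is discarded (applymap works per element, and the result is assigned back), and parse_dates coerces joining_date/last_update_date to pandas datetime64, which pyarrow handles more reliably than plain Python date objects:
# sketch only -- assumes the date/timestamp columns are what trips pyarrow
for chunk in pd.read_sql(query, cnx, coerce_float=True,
                         parse_dates=['joining_date', 'last_update_date'],
                         chunksize=500000):
    # element-wise cleanup, assigned back to chunk
    chunk = chunk.applymap(
        lambda x: x.replace(u'\r', u' ').replace(u'\n', u' ')
        if isinstance(x, (str, unicode)) else x)
    job = client.load_table_from_dataframe(chunk, table_id)
    job.result()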
I'm trying to run the Python script below on AWS Lambda. I have run it manually and got the output in my S3 bucket without any issue.
#!/usr/bin/env python3
import boto3
#Function for executing athena queries
def run_query(Event, context):
...
run_query(query, database, s3_output)
client = boto3.client('athena')
response = client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={
        'Database': 's3_accesslog'
    },
    ResultConfiguration={
        'OutputLocation': s3_output,
    }
)
#import datetime
import datetime
year = datetime.date.today()
year = year.strftime("%Y")
month = datetime.date.today()
month = month.strftime("%m")
day = datetime.date.today()
day = day.strftime("%d")
#select bucket
s3_input = "s3://smathena/cf-ant-prod/year=%s/month=%s/day=%s" % (year, month, day)
#Athena configuration
s3_ouput = 's3://smathena/athenatest/'
database = 's3_accesslog'
table = 'test_output1'
#Athena database and table definition
create_database = "CREATE DATABASE IF NOT EXISTS %s;" % (database)
delete_table = "drop table %s.%s;" % (database, table)
create_table = \
"""CREATE EXTERNAL TABLE IF NOT EXISTS %s.%s (
`Date` DATE,
Time STRING,
Location STRING,
SCBytes BIGINT,
RequestIP STRING,
Method STRING,
Host STRING,
Uri STRING,
Status INT,
Referrer STRING,
UserAgent STRING,
UriQS STRING,
Cookie STRING,
ResultType STRING,
RequestId STRING,
HostHeader STRING,
Protocol STRING,
CSBytes BIGINT,
TimeTaken FLOAT,
XForwardFor STRING,
SSLProtocol STRING,
SSLCipher STRING,
ResponseResultType STRING,
CSProtocolVersion STRING,
FleStatus STRING,
FleEncryptedFields INT,
CPort INT,
TimeToFirstByte FLOAT,
XEdgeDetailedResult STRING,
ScContent STRING,
ScContentLen BIGINT,
ScRangeStart BIGINT,
ScRangeEnd BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '%s'
TBLPROPERTIES ('skip.header.line.count' = '2');""" % (database, table, s3_input)
#Query definitions
query_1 = "SELECT * FROM %s.%s where CAST(status AS VARCHAR) = '404';" % (database, table)
#Execute all queries
queries = [ create_database, delete_table, create_table, query_1 ]
for q in queries:
    print("Executing query: %s" % (q))
    res = run_query(q, database, s3_ouput)
But now, when I invoke the script from AWS Lambda, I get the error below. Am I missing anything in the script?
{
"errorMessage": "run_query() takes 2 positional arguments but 3 were given",
"errorType": "TypeError",
"stackTrace": [
" File \"/var/lang/lib/python3.7/imp.py\", line 234, in load_module\n return load_source(name, filename, file)\n",
" File \"/var/lang/lib/python3.7/imp.py\", line 171, in load_source\n module = _load(spec)\n",
" File \"<frozen importlib._bootstrap>\", line 696, in _load\n",
" File \"<frozen importlib._bootstrap>\", line 677, in _load_unlocked\n",
" File \"<frozen importlib._bootstrap_external>\", line 728, in exec_module\n",
" File \"<frozen importlib._bootstrap>\", line 219, in _call_with_frames_removed\n",
" File \"/var/task/lambda_function.py\", line 86, in <module>\n res = run_query(q, database, s3_ouput)\n"
]
}
Your function (the Lambda handler) doesn't conform to the Python Lambda handler interface:
def handler_name(event, context):
    ...
    return some_value
Inputs to your function should be in event. Another example, from the AWS Lambda documentation:
def my_handler(event, context):
    message = 'Hello {} {}!'.format(event['first_name'],
                                    event['last_name'])
    return {
        'message': message
    }
I would expect query, database, and s3_output to be part of the event in your case. You should probably also return information about the Athena query you started.
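A rough sketch of what that could look like here (the event keys query, database, and s3_output are assumptions, not part of your current code):
import boto3

def run_query(event, context):
    # assumed payload: {"query": "...", "database": "...", "s3_output": "s3://..."}
    client = boto3.client('athena')
    response = client.start_query_execution(
        QueryString=event['query'],
        QueryExecutionContext={'Database': event['database']},
        ResultConfiguration={'OutputLocation': event['s3_output']}
    )
    # return something useful about the query that was started
    return {'QueryExecutionId': response['QueryExecutionId']}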
I don't get what the problem is here. I want to build a web scraper that scrapes Amazon and writes the price and the name into a database. But for some reason, it tells me that the columns and values don't match. I do have one additional column in my database called "timestamp" where the time is filled in automatically, but that is handled by the database. I am using MariaDB. A friend said I can use the MySQL API for MariaDB as well.
P.S. preis = price; I'm from Germany and sometimes switch between English and German, just in case anyone is wondering.
import requests, time, csv, pymysql
from bs4 import BeautifulSoup as bs
#URL = input("URL")
URL = "https://www.amazon.de/gp/product/B075FTXF15/ref=crt_ewc_img_bw_3?ie=UTF8&psc=1&smid=A24FLB4J0NZBNT"
def SOUPIT (tempURL):
    URL = tempURL
    page = requests.get(URL,headers={"User-Agent":"Defined"})
    soup = bs(page.content, "html.parser")
    raw_price = soup.find(id="priceblock_ourprice").get_text()
    price = raw_price[:-2]
    raw_name = soup.find(id="productTitle").get_text()
    name = raw_name.strip()
    for i in range(0,len(name)-1):
        if name[i] == "(":
            name = name[:i]
            break
    data = [name, price, time.strftime("%H:%M:%S"), time.strftime("%d.%m.%Y")]
    return(data)

data = SOUPIT(URL)

while True:
    data = SOUPIT(URL)
    db = pymysql.connect("localhost", "root", "root", "test")
    cursor = db.cursor()
    if (data == None):
        break
        print("break")
    else:
        name = data[0]
        preis = data[1]
        sql = """INSERT INTO amazon_preise (Name, Preis) VALUES ('{}',{})""".format(name,preis)
        cursor.execute(sql)
        db.commit()
        print("success")
    print(data)
    time.sleep(60)
error message:
Traceback (most recent call last):
File "amazonscraper_advanced.py", line 43, in <module>
cursor.execute(sql)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\cursors.py", line 170, in execute
result = self._query(query)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\cursors.py", line 328, in _query
conn.query(q)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\connections.py", line 517, in query
self._affected_rows = self._read_query_result(unbuffered=unbuffered)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\connections.py", line 732, in _read_query_result
result.read()
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\connections.py", line 1075, in read
first_packet = self.connection._read_packet()
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\connections.py", line 684, in _read_packet
packet.check_error()
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\protocol.py", line 220, in check_error
err.raise_mysql_exception(self._data)
File "C:\Users\...\AppData\Local\Programs\Python\Python36\lib\site-packages\pymysql\err.py", line 109, in raise_mysql_exception
raise errorclass(errno, errval)
pymysql.err.InternalError: (1136, "Column count doesn't match value count at row 1")
The problem is caused, at least partially, by using string formatting to insert values into an SQL statement.
Here is the scraped data:
>>> data = ['Sweatshirt Alien VS. Predator Z100088', '32,99', '14:08:43', '08.09.2019']
>>> name, preis, *_ = data
Let's create the SQL statement
>>> sql = """INSERT INTO amazon_preise (Name, Preis) VALUES ('{}',{})""".format(name,preis)
And display it:
>>> sql
"INSERT INTO amazon_preise (Name, Preis) VALUES ('Sweatshirt Alien VS. Predator Z100088',32,99)"
Observe that the VALUES clause contains three comma-separated values; this is because the web page displays currency in the German style, that is with commas separating the cents from the euros. When interpolated into the SQL statement
preis becomes two values instead of one.
The right way to fix this is to convert preis from a string to a float or decimal, and to use parameter substitution instead of string formatting to interpolate the values.
>>> fpreis = float(preis.replace(',', '.'))
>>> sql = """INSERT INTO amazon_preise (Name, Preis) VALUES (%s, %s)"""
>>> cursor.execute(sql, (name, fpreis))
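Applied to the loop in the question, that would look roughly like this (connection, cursor, and column names as in the question):
name, preis = data[0], data[1]
fpreis = float(preis.replace(',', '.'))  # "32,99" -> 32.99
sql = "INSERT INTO amazon_preise (Name, Preis) VALUES (%s, %s)"
cursor.execute(sql, (name, fpreis))
db.commit()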
I'm using the DB-API interface of the Python BigQuery library https://googleapis.dev/python/bigquery/latest/index.html. It throws errors like the one below when I pass parameters to Cursor.execute() for a WHERE IN or WHERE ANY clause.
google-cloud-bigquery version: 1.19.0
from google.cloud import bigquery
from google.cloud.bigquery import dbapi
client = bigquery.Client()
conn = dbapi.Connection(client)
curr = conn.cursor()
query = """
SELECT name, state
FROM `bigquery-public-data.usa_names.usa_1910_2013`
WHERE state = %s
LIMIT 2
"""
curr.execute(query, ('NY', ))
result = curr.fetchall()
print(result)
query = """
SELECT name, state
FROM `bigquery-public-data.usa_names.usa_1910_2013`
WHERE state IN %s
LIMIT 2
"""
curr.execute(query, (('NY', 'TX'), ))
result = curr.fetchall()
print(result)
Output
[Row(('Mildred', 'NY'), {'name': 0, 'state': 1}), Row(('Irene', 'NY'), {'name': 0, 'state': 1})]
Traceback (most recent call last):
File "hello_bq.py", line 25, in <module>
curr.execute(query, (('NY', 'TX'), ))
File "/home/haibin/.local/share/virtualenvs/python-6nCS1ipk/lib/python3.6/site-packages/google/cloud/bigquery/dbapi/cursor.py", line 159, in execute
query_parameters = _helpers.to_query_parameters(parameters)
File "/home/haibin/.local/share/virtualenvs/python-6nCS1ipk/lib/python3.6/site-packages/google/cloud/bigquery/dbapi/_helpers.py", line 117, in to_query_parameters
return to_query_parameters_list(parameters)
File "/home/haibin/.local/share/virtualenvs/python-6nCS1ipk/lib/python3.6/site-packages/google/cloud/bigquery/dbapi/_helpers.py", line 84, in to_query_parameters_list
return [scalar_to_query_parameter(value) for value in parameters]
File "/home/haibin/.local/share/virtualenvs/python-6nCS1ipk/lib/python3.6/site-packages/google/cloud/bigquery/dbapi/_helpers.py", line 84, in <listcomp>
return [scalar_to_query_parameter(value) for value in parameters]
File "/home/haibin/.local/share/virtualenvs/python-6nCS1ipk/lib/python3.6/site-packages/google/cloud/bigquery/dbapi/_helpers.py", line 69, in scalar_to_query_parameter
name, value
google.cloud.bigquery.dbapi.exceptions.ProgrammingError: encountered parameter None with value ('NY', 'TX') of unexpected type
Any help is appreciated.
I don't see support for array types in the parameter conversion code, but this alternative approach works:
from google.cloud import bigquery
from google.cloud.bigquery import dbapi
client = bigquery.Client()
conn = dbapi.Connection(client)
curr = conn.cursor()
query = """
SELECT name, state
FROM `bigquery-public-data.usa_names.usa_1910_2013`
WHERE state IN UNNEST(SPLIT(%s))
LIMIT 2
"""
curr.execute(query, ('NY,TX', ))
result = curr.fetchall()
print(result)
There's an open GitHub issue to track progress for native support:
https://github.com/googleapis/google-cloud-python/issues/9177
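If dropping down from the DB-API to the client API is an option, the library's ArrayQueryParameter already supports this natively. A sketch using a named @states parameter (same public dataset as above):
from google.cloud import bigquery

client = bigquery.Client()
query = """
    SELECT name, state
    FROM `bigquery-public-data.usa_names.usa_1910_2013`
    WHERE state IN UNNEST(@states)
    LIMIT 2
"""
job_config = bigquery.QueryJobConfig()
# bind the whole list as one ARRAY<STRING> parameter
job_config.query_parameters = [
    bigquery.ArrayQueryParameter("states", "STRING", ["NY", "TX"])
]
print(list(client.query(query, job_config=job_config)))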
Hello all, I have to bulk-insert data into my PostgreSQL table from a Python for loop. I have tried all possible means to enter the data, but I was unable to do it.
data = count,ipaddress,asn_value,asno,aso,sock_value,sock,datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
print (data)
cur.execute('insert into dataoldnew values (%s,%s,%s,%s,%s,%s,%s,%s)',data)
print("Records created successfully")
conn.close()
Debug
data= (1, '217.76.156.252', 1, 8560, '1&1 Internet SE', 0, 0, '2018-06-06 11:35')
Error
Exception in callback task2() at mining.py:43
handle: <Handle task2() at mining.py:43>
Traceback (most recent call last):
File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
self._callback(*self._args)
File "mining.py", line 31, in wrapper
ret = func(*args, **kwargs)
File "mining.py", line 149, in task2
cur.execute('insert into dataoldnew values (%s,%s,%s,%s,%s,%s,%s,%s)',data)
psycopg2.DataError: invalid input syntax for integer: "217.76.156.252"
LINE 1: insert into dataoldnew values (1,'217.76.156.252',1,8560,'1&...
^
I have a table in postgresql.
Postgres Table Schema
-- Table: public.dataoldnew
-- DROP TABLE public.dataoldnew;
CREATE TABLE public.dataoldnew
(
index bigint,
idtemp bigint,
ipaddress text,
"timestamp" text,
values_asn bigint,
values_aso text,
values_host_name text,
values_host_name_true_false bigint
)
WITH (
OIDS=FALSE
);
ALTER TABLE public.dataoldnew
OWNER TO aditya;
-- Index: public.ix_dataoldnew_index
-- DROP INDEX public.ix_dataoldnew_index;
CREATE INDEX ix_dataoldnew_index
ON public.dataoldnew
USING btree
(index);
Thanks in advance.
I'm trying to populate my database from a CSV file using Python.
Below is the code I use to populate my sales table:
import csv
import pymssql as psql
conn = psql.connect('localhost:8888', 'SA', 'superSecret','videogame')
cursor = conn.cursor()
cursor.execute("""
IF OBJECT_ID('sales', 'U') IS NOT NULL
DROP TABLE sales
CREATE TABLE sales
(
Id int,
Name varchar(250),
Platform varchar(250),
Year int,
Genre varchar(250),
Publisher varchar(250),
NA_Sales float,
EU_Sales float,
JP_Sales float,
Other_Sales float,
Global_Sales float
)
""")
conn.commit()
with open ('./sales.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        row[1] = row[1].replace("'", "")
        row[5] = row[5].replace("'", "")
        data = tuple(row)
        query = 'insert into sales values {0}'.format(data).replace("N/A","0")
        print(query)
        cursor.execute(query)
conn.commit()
conn.close()
However, some of my data contains an apostrophe (') in the name column (e.g. Assassin's Creed). This causes an error, as below:
insert into sales values ('129', "Assassin's Creed III", 'PS3', '2012', 'Action', 'Ubisoft', '2.64', '2.56', '0.16', '1.14', '6.5')
Traceback (most recent call last):
File "pymssql.pyx", line 447, in pymssql.Cursor.execute (pymssql.c:7119)
File "_mssql.pyx", line 1011, in _mssql.MSSQLConnection.execute_query (_mssql.c:11586)
File "_mssql.pyx", line 1042, in _mssql.MSSQLConnection.execute_query (_mssql.c:11466)
File "_mssql.pyx", line 1175, in _mssql.MSSQLConnection.format_and_run_query (_mssql.c:12746)
File "_mssql.pyx", line 1586, in _mssql.check_cancel_and_raise (_mssql.c:16880)
File "_mssql.pyx", line 1630, in _mssql.maybe_raise_MSSQLDatabaseException (_mssql.c:17524)
_mssql.MSSQLDatabaseException: (207, b"Invalid column name 'Assassin's Creed III'.DB-Lib error message 20018, severity 16:\nGeneral SQL Server error: Check messages from the SQL Server\n")
Is there any workaround for this other than manually updating the row (e.g. row[1] = row[1].replace("'", ""))?
Thanks!!
You could use a proper parameterized query, like this:
row = ["Assassin's", "N/A", 9] # test data as list (e.g., from CSV)
data = tuple("0" if x=="N/A" else x for x in row)
print(data) # ("Assassin's", '0', 9)
placeholders = ','.join(['%s' for i in range(len(data))])
query = 'INSERT INTO sales VALUES ({0})'.format(placeholders)
print(query) # INSERT INTO sales VALUES (%s,%s,%s)
cursor.execute(query, data)
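Plugged into the CSV loop from the question, that would look something like this (same file, table, and N/A handling as in the question):
with open('./sales.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        # replace N/A per value instead of in the final SQL string
        data = tuple("0" if x == "N/A" else x for x in row)
        placeholders = ','.join(['%s'] * len(data))
        query = 'INSERT INTO sales VALUES ({0})'.format(placeholders)
        cursor.execute(query, data)
conn.commit()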
In Python, "\'" is exactly the same string as "'", so that replacement is a no-op. SQL Server escapes an apostrophe inside a string literal by doubling it (''). Formatting the tuple directly is also a problem, because Python's repr wraps a string that contains an apostrophe in double quotes, which SQL Server reads as an identifier. Quoting and escaping each value yourself stops the crash while preserving the apostrophe in your data:
with open ('./sales.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        # double any apostrophes and quote every value explicitly
        values = ", ".join("'{}'".format(str(x).replace("'", "''")) for x in row)
        query = 'insert into sales values ({0})'.format(values).replace("N/A","0")
        print(query)
        cursor.execute(query)