How can I fix the DataFrame in the attached picture so that "order_reference" and "location" become the columns?
Here's my code:
import pyodbc
import pandas as pd
import json
import os

from queries import QUERIES


class Connection:
    def __init__(self):
        try:
            self.connection = pyodbc.connect("DSN=Databricks-Spark", autocommit=True)
        except Exception as e:
            print(e)
            raise Exception("Connection failed")

    def run_query(self, query_id):
        with self.connection as open_connection:
            cursor = open_connection.cursor()
            cursor = cursor.execute(query_id)
            columns = [column[0] for column in cursor.description]
            results = []
            for row in cursor.fetchall():
                results.append(str(dict(zip(columns, row))))
            return results


class DataManager:
    def __init__(self, QUERIES_DICT):
        self.connection = Connection()
        self.queries = QUERIES_DICT
        self.jsonfile = "mydata.json"
        self.data = {}
        self.dfs = {}
        try:
            if os.stat(self.jsonfile).st_size > 0:
                self.get_data_from_json()
            else:
                self.get_data_from_database()
                with open(self.jsonfile, "w") as f:
                    json.dump(self.data, f)
        except Exception as e:
            print(e)
            open(self.jsonfile, "w")
            self.get_data_from_database()
            with open(self.jsonfile, "w") as f:
                json.dump(self.data, f)

    def get_df(self):
        for query_id in self.queries:
            self.dfs[query_id] = pd.DataFrame(self.data[query_id])
        return self.dfs

    def get_data_from_json(self):
        with open(self.jsonfile, "r") as pre_load:
            self.data = json.load(pre_load)

    def get_data_from_database(self):
        for query_id in self.queries:
            try:
                self.data[query_id] = self.connection.run_query(self.queries[query_id])
            except Exception as e:
                print(f"Problem with query ID: {query_id}", e)
Related
I have a web parser built on the "socket" library (S03) and a module that parses the resulting dict to extract the needed information (before_json).
Code for S03:
# Imports assumed for this snippet (not shown in the original post)
import os
import re
import json
import time
import socket
from pathlib import Path
from dotenv import load_dotenv

# Init .env
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

# Init
secret_token = os.environ['secret_token']

# Time
seconds = time.time()
local_time = time.ctime(seconds)

# HDRS
HDRS = 'HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n'
HDRS_404 = 'HTTP/1.1 404 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n'

# Create webserver socket
socket_server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
socket_server.bind(('ip', 8888))
socket_server.listen(356)
socket_server.settimeout(5)

# Start processing incoming JSON
def start_my_server():
    # def of compare secret key
    def load_secret(secret_token, data):
        try:
            # slack_message_pipe(step=f'I LOAD JSON')
            print('load_secret')
            key = str(data)
            key = re.findall(f'X-Gitlab-Token:...............', key)
            print(key)
            key = str(key).replace("['X-Gitlab-Token: ", '')
            key = str(key).replace("']", '')
            print(key)
            print(secret_token, ' !!! ', key)
            if secret_token == key:
                socket_server.settimeout(None)
                try_to_verification(key)
            else:
                fail_verifivcation()
        except Exception as e:
            print(e)
            return
            # slack_message_pipe(step=f'start_my_server.load_secret {e}')

    def try_to_verification(key):
        try:
            print(key, 'key try_to_verification')
            client_socket.send(HDRS.encode('utf-8'))
            client_socket.shutdown(socket.SHUT_WR)
            # with open(f"path to file('{local_time}').json", 'w+') as output_file:
            #     json.dump(data, output_file)
            with open(f"path to file", 'w+') as file:
                json.dump(data, file)
                file.close()
            print('next step')
            json_dump_for_proj(data)
        except Exception as e:
            print(e)
            return
            # slack_message_pipe(step=f'start_my_server.try_to_verification {e}')

    def fail_verifivcation():
        try:
            print('Not find')
            client_socket.send(HDRS_404.encode('utf-8'))
            client_socket.shutdown(socket.SHUT_WR)
            addresses = open('ipPOST', 'a')
            addresses.write(str(address) + f'{local_time}\n')
            addresses.close()
        except Exception as e:
            print(e)
            return
            # slack_message_pipe(step=f'start_my_server.fail_verifivcation {e}')

    while True:
        print('start loop')
        try:
            print('try loop')
            while True:
                print('Working...')
                client_socket, address = socket_server.accept()
                print('loop', address)
                data = client_socket.recv(1048576).decode('utf-8')
                # slack_message_pipe(step=f'I GOT JSON')
                load_secret(secret_token, data)
        except Exception as e:
            # slack_message_pipe(step=f'start_my_server.socket.error {e}')
            print(f'pass try {e}')
            fail_verifivcation()
Code for before_json:
# Imports assumed for this snippet; slack_message_pipe, slack_message_pipe_good
# and git_checkout are helpers from other modules of the project.
import os
import re
import json
import time

home_path = os.environ['home']

# time
seconds = time.time()
local_time = time.ctime(seconds)
def json_dump_for_proj(data):
    os.chdir(home_path)
    try:
        data = str(data).replace('null', '0')
        # Find head json
        data = re.sub('POST / HTTP/1.1\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        # data = re.sub(',total_commits_count.*', '}', data)
        data = re.sub('POST / HTTP/1.1\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        # data = re.sub('.total_commits_count.*', '}', data)
        data = re.sub('POST.*\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        data = re.sub('"POST / HTTP/1.1\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        data = re.sub('"POST / HTTP/1.1\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        data = re.sub('"POST.*\r\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n', '', data)
        data = json.loads(data)
        # parse needed info
        # Branch
        # print(data['ref'])
        branch = data['ref']
        # print(data['commits'])
        for keys in data['commits']:
            # id
            id_hash = keys['id']
            # author
            # name
            name = keys['author']['name']
            # email
            email = keys['author']['email']
            # files
            # added
            added = keys['added']
            for _ in range(len(added) + 1):
                for j in added:
                    if 'path to file' not in j:
                        added.remove(j)
            # print('path to file' not in added[-1])
            # modif
            modified = keys['modified']
            for _ in range(len(modified) + 1):
                for k in modified:
                    if '' not in k:
                        print(k)
                        modified.remove(k)
            print(id_hash, name, email, branch, modified, sep='\n' + '*' * 100 + '\n')
            list_of = [(name, email), added, modified, id_hash]
            # write_list(list_of)
            # print(not modified and not added)
            message_dict = {"name": name, "email": email, "modified": modified, "added": added}
            if not modified and not added:
                slack_message_pipe_good(
                    step=f' \nI got commit by {message_dict.get("name")}\nEmail: {message_dict.get("email")}\n\nBut it is empty, pass')
                return
            try:
                # slack_message_pipe_good(step=f' \nI got commit by {message_dict.get("name")}\nEmail: {message_dict.get("email")}\n\nInside this commit, the following changes\nadded:{message_dict.get("added")}\nmodified:{message_dict.get("modified")}\n\n I am going to the next step')
                git_checkout(id_hash, message_dict)
            except Exception as e:
                slack_message_pipe(step=f'ERROS ON STEP before_deploy_Parse_Json.json_dump_for_proj: {e}')
                return
    except Exception as e:
        with open(f'{local_time}.json', 'w+') as data_failure:
            data_failure.write(data)
            data_failure.close()
        slack_message_pipe(step=f' before_deploy_Parse_Json.json_dump_for_proj {e}')


def write_list(list_of):
    try:
        with open(f'path to file', 'w+') as output_file:
            output_file.write(str(list_of) + '\n')
            output_file.close()
        print('I all write')
    except Exception as e:
        slack_message_pipe(step=f' before_deploy_Parse_Json.write_list {e}')


# with open('data.json', 'r') as json_file:
#     data = json.load(json_file)
#     json_dump_for_proj(data)

if __name__ == '__main__':
    print('Parse_Json')
problem:
S03 receives the JSON and before_json starts parsing it, but the JSON is not always complete: sometimes a block of it is missing, and the size of the missing block varies.
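One likely cause (my assumption; the post does not confirm it) is that a single client_socket.recv(1048576) call is not guaranteed to return the whole request, so large webhook payloads can arrive truncated. A minimal sketch of reading until the body length announced in Content-Length has arrived:

import re
import socket

def recv_whole_request(client_socket, chunk_size=65536):
    # Sketch only: read until the headers are complete, then keep reading
    # until the body matches Content-Length (no chunked-encoding support).
    data = b''
    while b'\r\n\r\n' not in data:
        chunk = client_socket.recv(chunk_size)
        if not chunk:
            return data.decode('utf-8')
        data += chunk
    headers, _, body = data.partition(b'\r\n\r\n')
    match = re.search(rb'Content-Length:\s*(\d+)', headers, re.IGNORECASE)
    expected = int(match.group(1)) if match else 0
    while len(body) < expected:
        chunk = client_socket.recv(chunk_size)
        if not chunk:
            break
        body += chunk
    return (headers + b'\r\n\r\n' + body).decode('utf-8')

In start_my_server, data = recv_whole_request(client_socket) could then replace the single recv call before load_secret is invoked.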
I have written a little Python script to get the files in a directory, calculate a hash for each one, and then write them to a table.
The first part, getting the files and calculating the hash, was easy. But now I have added a function (write_record) to store the filename, log date and hash in a database, and I am struggling with how to call it from the get_files function and write a record for each file in the directory.
from datetime import datetime
from os import scandir
import os
import hashlib
import psycopg2

BLOCKSIZE = 65536
hasher = hashlib.sha256()
basepath = '.'


def convert_date(timestamp):
    d = datetime.utcfromtimestamp(timestamp)
    formated_date = d.strftime('%d%m%Y%H%M%S')
    return formated_date


def get_hash(entry):
    with open(entry, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    # print(hasher.hexdigest())


def get_files():
    dir_entries = scandir('.')
    for entry in dir_entries:
        if entry.is_file():
            info = entry.stat()
            print(' %s %s %s' % (entry.name, convert_date(info.st_mtime), hasher.hexdigest()))
            log_filename = entry.name
            log_hashvalue = hasher.hexdigest()
            log_date = convert_date(info.st_mtime)
            return log_filename, log_hashvalue, log_date
            # write_record()


def write_record():
    log_filename, log_hashvalue, log_date = get_files()
    try:
        print(log_filename, log_hashvalue, log_date)
        connection = psycopg2.connect(user="postgres", password="xxxxxxxx", host="xxx.xxx.xxx.xxx", port="5432", database="evidence_logging")
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO logfiles (log_name,log_date,log_hashvalue) VALUES (%s,%s,%s)"""
        record_to_insert = (log_filename, log_date, log_hashvalue)
        print(postgres_insert_query, record_to_insert)
        cursor.execute(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into logfiles table")
    except (Exception, psycopg2.Error) as error:
        if connection:
            print("Failed to insert record into logfiles table", error)
    finally:
        # closing database connection.
        if connection:
            cursor.close()
            connection.close()
            print("PostgreSQL connection is closed")


write_record()
Thanks in advance
Regards
Georg
In your code you are calling the write_record() method, which will insert only one file, because get_files() returns after the first file rather than processing all the files.
Instead, call get_files(), and inside it, instead of returning, call write_record() with the values you were previously returning.
Also, do not close the connection after inserting every record; close it once after all the records have been inserted.
Try this:
from datetime import datetime
from os import scandir
import os
import hashlib
import psycopg2

BLOCKSIZE = 65536
hasher = hashlib.sha256()
basepath = '.'
connection = None


def convert_date(timestamp):
    d = datetime.utcfromtimestamp(timestamp)
    formated_date = d.strftime('%d%m%Y%H%M%S')
    return formated_date


def get_hash(entry):
    with open(entry, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    # print(hasher.hexdigest())


def get_files():
    dir_entries = scandir('.')
    for entry in dir_entries:
        if entry.is_file():
            info = entry.stat()
            print(' %s %s %s' % (entry.name, convert_date(info.st_mtime), hasher.hexdigest()))
            log_filename = entry.name
            log_hashvalue = hasher.hexdigest()
            log_date = convert_date(info.st_mtime)
            write_record(log_filename, log_hashvalue, log_date)
    # close the connection after writing all records
    close_connection()


def write_record(log_filename, log_hashvalue, log_date):
    global connection
    try:
        print(log_filename, log_hashvalue, log_date)
        # Reuse a single connection for all records; open it on the first call only.
        if connection is None:
            connection = psycopg2.connect(user="postgres", password="xxxxxxxx", host="xxx.xxx.xxx.xxx", port="5432", database="evidence_logging")
        cursor = connection.cursor()
        postgres_insert_query = """ INSERT INTO logfiles (log_name,log_date,log_hashvalue) VALUES (%s,%s,%s)"""
        record_to_insert = (log_filename, log_date, log_hashvalue)
        print(postgres_insert_query, record_to_insert)
        cursor.execute(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into logfiles table")
    except (Exception, psycopg2.Error) as error:
        if connection:
            print("Failed to insert record into logfiles table", error)
    finally:
        cursor.close()


def close_connection():
    if connection:
        connection.close()
        print("PostgreSQL connection is closed")


get_files()
import psycopg2
import time


def read_database():
    conn = None
    try:
        conn = psycopg2.connect(database="capitadb", user="capita_user", password="capita_user",
                                host="127.0.0.1", port="5432")
        cur = conn.cursor()
        start_time = time.time()
        cur.execute("COPY stagging(Activity_ID,F_Qtr,Fiscal_Week_Num,Manager,MBadge) FROM '/home/vivek/Downloads/dell_data.csv' DELIMITER ',' CSV;")
        print("--- %s seconds ---" % (time.time() - start_time))
        print("Operation done successfully")
        conn.commit()
    except Exception as e:
        print("Error: %s" % e)
    finally:
        conn.close()


if __name__ == '__main__':
    read_database()
Here we have 15 columns in the CSV file, but we want to copy only a few of them. How can we achieve that without extracting the data into another file?
You will need to use the COPY FROM STDIN functionality - http://initd.org/psycopg/docs/cursor.html#cursor.copy_from. You will be able to provide a file-like object to that function, and you can use the itertools module to build one:
from itertools import chain, islice


class some_magic_adaptor(object):
    def __init__(self, src):
        self.src = chain.from_iterable(src)

    def read(self, n):
        return "".join(islice(self.src, None, n))


def read_csv():
    for line in open(csv_filename):
        yield transform_line(line)


file_like_object_for_postgresql = some_magic_adaptor(read_csv())
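For illustration, a rough sketch (my assumption, not part of the original answer) of how that file-like object could be handed to psycopg2 so that only the wanted columns reach the table. transform_line here simply picks the needed fields out of each 15-column line; the field indexes and table/column names are taken from the question above and would need adjusting to the real CSV layout:

import psycopg2

csv_filename = '/home/vivek/Downloads/dell_data.csv'  # path from the question

def transform_line(line):
    # Keep only the wanted fields (indexes are illustrative).
    fields = line.rstrip('\n').split(',')
    return ','.join(fields[0:5]) + '\n'

conn = psycopg2.connect(database="capitadb", user="capita_user",
                        password="capita_user", host="127.0.0.1", port="5432")
cur = conn.cursor()
# COPY ... FROM STDIN pulls the data through the adaptor's read() method.
cur.copy_expert(
    "COPY stagging(Activity_ID,F_Qtr,Fiscal_Week_Num,Manager,MBadge) FROM STDIN WITH CSV",
    some_magic_adaptor(read_csv())
)
conn.commit()
conn.close()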
I'm using the multiprocessing.dummy module to do some concurrent processing. I'm making HTTP requests, and there is a possibility that the object will not have any data returned. In this case I need to capture the AttributeError and move on.
I tried capturing it in the object itself and still received the error; the only thing that worked was a try/except around the pool.map call itself. I'm wondering why that is, and whether this is the best way to do error handling for multiprocessing and map functions.
Here is some of my code for reference:
all_commits = []
projects = [Project(value['id']) for value in project_data.values()]


def process_projects(project):
    if project.name in bad_names.keys():
        project.name = bad_names[project.name]
    project.return_results(rest, all_commits)


pool = ThreadPool(8)
pool.map(process_projects, projects)
pool.close()
pool.join()

print 'All data gathered.'
print 'Number of commits: {}'.format(len(all_commits))

fieldnames = get_fieldnames(
    'ods_gerrit.staging_gerrit_commits',
    settings.REDSHIFT_POSTGRES_INFO)
s3_file = ('staging_gerrit_commits_{}.csv.gz'.format(
    date.today())
)
with gzip.open(s3_file, 'wb') as outf:
    writer = DictWriter(
        outf,
        fieldnames=fieldnames,
        extrasaction='ignore',
        delimiter='|'
    )
    cnt = 0
    pool = ThreadPool(8)
    try:
        pool.map(process_commits, all_commits)
    except AttributeError:
        pass
    pool.close()
    pool.join()
Then here is my Commit object code and the function that is being called by the map function:
class Commit(object):
    def __init__(self, rev_id, change_id, full_id):
        self.rev_id = rev_id
        self.change_id = change_id
        self.full_id = full_id

    def clean_data(self, _dict):
        for key, value in _dict.items():
            if isinstance(value, dict):
                self.clean_data(_dict[key])
            else:
                try:
                    _dict[key] = _dict[key].encode(
                        'utf_8',
                        'replace'
                    ).encode('string_escape').replace('|', '[pipe]')
                except AttributeError:
                    continue

    def get_data(self, ger_obj):
        print 'Getting data for a commit for {f_id}'.format(
            f_id=self.full_id
        )
        endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
            c_id=self.change_id,
            r_id=self.rev_id
        ))
        try:
            self.data = ger_obj.get(endpoint)
        except HTTPError:
            try:
                endpoint = (r'/changes/{f_id}/revisions/{r_id}/commit'.format(
                    f_id=self.full_id,
                    r_id=self.rev_id
                ))
                self.data = ger_obj.get(endpoint)
            except HTTPError:
                logging.warning('Neither endpoint returned data: {ep}'.format(
                    ep=endpoint
                ))
                raise HTTPError()
        except ReadTimeout:
            logging.warning('Read Timeout occurred for a commit. Endpoint: '
                            '{ep}'.format(ep=endpoint))
            return
        self.data['change_id'] = self.change_id
        self.data['proj_branch_id'] = self.full_id
        self.data['revision_id'] = self.rev_id
        self.data['commitid'] = self.data.get('commit')
        self.data['name'] = self.data.get('committer')['name']
        self.data['email'] = self.data.get('committer')['email']
        self.data['date'] = self.data.get('committer')['date']
        hash = md5()
        hash.update(json.dumps(self.data).encode('utf-8'))
        self.data['etl_checksum_md5'] = hash.hexdigest()
        self.data['etl_process_status'] = settings.ETL_PROCESS_STATUS
        self.data['etl_datetime_local'] = settings.ETL_DATETIME_LOCAL
        self.data['etl_pdi_version'] = settings.ETL_PDI_VERSION
        self.data['etl_pdi_build_version'] = settings.ETL_PDI_BUILD_VERSION
        self.data['etl_pdi_hostname'] = settings.ETL_PDI_HOSTNAME
        self.data['etl_pdi_ipaddress'] = settings.ETL_PDI_IPADDRESS
        self.clean_data(self.data)

    def write_data(self, writer):
        print 'Writing a commit for {f_id}'.format(f_id=self.full_id)
        writer.writerow(self.data)
And the controller function:
def process_commits(commit):
    print 'On commit #{}'.format(cnt)
    unique_id = commit.change_id + commit.rev_id
    if not id_search(unique_ids, unique_id):
        try:
            commit.get_data(rest)
        except HTTPError:
            pass
        try:
            commit.write_data(writer=writer)
        except UnicodeEncodeError:
            logging.warning(
                '{data} caused a Unicode Encode Error.'.format(
                    data=commit.data
                ))
            pass
    global cnt
    cnt += 1
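As an aside on the error-handling question above: with multiprocessing.dummy, an exception raised in a worker is re-raised in the caller when pool.map() collects the results, which is why only the try/except around pool.map() appeared to work. A minimal sketch (names reused from the code above purely for illustration) of handling the error per item inside the worker instead, so one failing commit does not abort the whole map:

def safe_process(commit):
    # Catch the per-item failure here so the rest of the pool keeps running;
    # process_commits, all_commits and ThreadPool come from the code above.
    try:
        process_commits(commit)
    except AttributeError as e:
        print 'Skipping a commit with no data: {}'.format(e)

pool = ThreadPool(8)
pool.map(safe_process, all_commits)
pool.close()
pool.join()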
Since the query returns more than one row, how can the Get_results class return data_out as an array so that I can iterate over the results of the query?
import psycopg2
import sys


class Get_results():
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        try:
            con = None
            con = psycopg2.connect(host=dbHost, database=dbName,
                                   user=dbUser, password=dbPass)
            cur = con.cursor()
            cur.execute(query)
            data = cur.fetchall()
            for data_out in data:
                return data_out
        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            sys.exit(1)
        finally:
            if con:
                con.close()


sql = " some sql "
w = Get_results()
for i in w.db_call(sql, dbHost, dbName, dbUser, dbPass):
    print "The result is : " + i
For additional info: if I add print data right after data = cur.fetchall(), I get this result:
[('The_Galaxy', 'The_Galaxy:star'),
('The_Galaxy', 'The_Galaxy:planet')]
The immediate answer is to change:
    for data_out in data:
        return data_out
to:
    for data_out in data:
        yield data_out
But you should also look at using a with statement (if the DB API supports it) and at simplifying the code; this could just be done with a generator function (a class is OTT for this).
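For illustration, a minimal sketch of that generator-function approach (connection parameters as in the question; this is a sketch, not the answer's exact code):

import psycopg2

def db_call(query, dbHost, dbName, dbUser, dbPass):
    con = None
    try:
        con = psycopg2.connect(host=dbHost, database=dbName,
                               user=dbUser, password=dbPass)
        cur = con.cursor()
        cur.execute(query)
        for data_out in cur.fetchall():
            yield data_out
    finally:
        if con:
            con.close()

for row in db_call(sql, dbHost, dbName, dbUser, dbPass):
    print "The result is : " + str(row)   # each row is a tuple

The version below, by contrast, collects the rows into a list and returns it: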
import psycopg2
import sys


class Get_results():
    def db_call(self, query, dbHost, dbName, dbUser, dbPass):
        try:
            con = None
            con = psycopg2.connect(host=dbHost, database=dbName,
                                   user=dbUser, password=dbPass)
            cur = con.cursor()
            cur.execute(query)
            data = cur.fetchall()
            resultList = []
            for data_out in data:
                resultList.append(data_out[1])
            return resultList
        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            sys.exit(1)
        finally:
            if con:
                con.close()


sql = " some sql "
w = Get_results()
for i in w.db_call(sql, dbHost, dbName, dbUser, dbPass):
    print "The result is : " + i