I have been using the script below to upload data for load testing of my module:
import json
import ast
import pandas as pd
import sys
import cloudant_connection as cloud
df = pd.read_csv("./deviceId_data/device_starts_"+ sys.argv[1] + ".csv")
print(" checkpoint 1 cleared ")
def push_data_to_cloudant(ID, Key, Value, Database):
    Value = ast.literal_eval(Value)
    temp_doc = {}
    temp_doc["_id"] = ID
    temp_doc["value"] = Value["value"]
    temp_doc["devId"] = Value["devId"]
    temp_doc["eDateTime"] = Key[0]
    temp_doc["eDate"] = Value["eDate"]
    temp_doc["cDateTime"] = Key[0]
    temp_doc["cDate"] = Value["cDate"]
    new_doc = Database.create_document(temp_doc)
    if new_doc.exists():
        # print("doc created")
        return "Success"
    else:
        print("Failed in pushing document")
        return "Failure"
with open("./connection_config_source.json") as f:
    connect_conf = json.load(f)
print(" checkpoint 2 cleared ")

API_KEY = connect_conf['cloudant_api_key']
ACC_NAME = connect_conf['cloudant_account_name']
print(" checkpoint 3 cleared ")

try:
    client = cloud.connecting_to_cloudant_via_api(ACC_NAME, API_KEY)
    database_name = 'DB_NAME'
    Database = client[database_name]
    print(" checkpoint 4 cleared ")
    if Database.exists():
        print("Connected")
        status = [push_data_to_cloudant(ID, Key, Value, Database)
                  for (ID, Key, Value) in zip(df['id'], df['key'], df['value'])]
    print(" last checkpoint cleared ")
except Exception as e:
    print("Failed: " + str(e))
I know there are faster approaches than a list comprehension, but I don't know how to use them in this scenario.
I know df.apply() is faster than this, but I wanted to know whether Pandas or NumPy vectorization could be used for this use case.
python-cloudant documentation:
bulk_docs(docs)
Performs multiple document inserts and/or updates through a single request. Each document must either be or extend a dict as is the case with Document and DesignDocument objects. A document must contain the _id and _rev fields if the document is meant to be updated.
Parameters: docs (list) – List of Documents to be created/updated.
Returns: Bulk document creation/update status in JSON format
Just use:
Database = client['DB_name']
Database.bulk_docs(docs)
Here, docs is a list of dictionaries (or JSON-style document objects), one entry per document to be created or updated.
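For example, here is a minimal sketch of how the existing per-row logic could feed bulk_docs, reusing df and Database from the code above. build_doc is a hypothetical helper mirroring push_data_to_cloudant, and the batch size of 500 is an arbitrary choice, not a library requirement:

import ast

def build_doc(ID, Key, Value):
    # same field mapping as push_data_to_cloudant, but only builds the dict
    Value = ast.literal_eval(Value)
    return {
        "_id": ID,
        "value": Value["value"],
        "devId": Value["devId"],
        "eDateTime": Key[0],
        "eDate": Value["eDate"],
        "cDateTime": Key[0],
        "cDate": Value["cDate"],
    }

docs = [build_doc(ID, Key, Value)
        for ID, Key, Value in zip(df['id'], df['key'], df['value'])]

# one request per batch instead of one request per document
batch_size = 500  # arbitrary; tune to your document size
for start in range(0, len(docs), batch_size):
    results = Database.bulk_docs(docs[start:start + batch_size])
    # each entry in results reports the per-document create/update status

Each call returns one status entry per document, so failures can still be detected without a per-document round trip.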
My code hits the variable endpoint and then creates a log file (UUID.log); these log files are unique for every hit. Inside every log file there is a JSON object (process_name, process_id), where the endpoint name gets logged as the process_name.
The if condition checks for a duplicate process_name across the existing log files before creating a new one, to ensure that a log file with a duplicate process_name does not get created.
from flask import Flask, jsonify
import json
import uuid
import os
import test1

app = Flask(__name__)

@app.route('/<string:name>')
def get_stats(name):
    proceuudi = uuid.uuid4()
    stat = [
        {
            'process_id': str(proceuudi),
            'process_name': name
        }
    ]
    os.chdir("file_path")
    files = os.listdir('file_path')
    l = []
    for i in files:
        with open(i) as f:
            data = json.load(f)
            for j in data:
                l.append(j)
    for j in l:
        print(j)
        if j['process_name'] != name:
            with open(str(proceuudi) + '.log', 'w+') as f:  # writing JSON object
                json.dump(stat, f)
            return jsonify({'stats': stat})
        else:
            return 'Process already running'

app.run(port = 6011)
Whenever I try to check the list (l = []) containing the process_name and process_id, I am not able to check the entire list; it only looks at the first index. If j['process_name'] != name at the first index, the function returns immediately. Is there a way to check the entire list, and only create a log file with that process_name if the name does not exist in any log file?
Use a set to hold the process names; this avoids scanning the whole list.
Don't scan all the files on every call; use a global variable to hold the names in memory.
from flask import Flask, jsonify
import json
import uuid
import os

app = Flask(__name__)

# use a set, as membership checks (the "in" operator) are O(1)
l = set()
running = False

@app.route('/<string:name>')
def get_stats(name):
    global l, running
    proceuudi = uuid.uuid4()
    # why a list? from the code it is clear that one file will have only one entry
    stat = [
        {
            'process_id': str(proceuudi),
            'process_name': name
        }
    ]
    # read all the names once, at server start
    if not running:
        # better to write a separate function for this
        files = os.listdir('./file_path')
        print(files)
        for i in files:
            with open("./file_path/" + i) as f:
                data = json.load(f)
                for j in data:
                    l.add(j["process_name"])
        running = True
    if name in l:
        # use jsonify here too
        return jsonify("process running")
    else:
        # add the new process_name to the in-memory set
        l.add(stat[0]["process_name"])
        with open("./file_path/" + str(proceuudi) + '.log', 'w+') as f:  # writing JSON object
            json.dump(stat, f)
        return jsonify({'stats': stat})

app.run(port = 6011)
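A quick way to exercise the duplicate check, assuming the server above is running on port 6011 (the endpoint name backup_job is just an illustrative value):

import requests

first = requests.get("http://localhost:6011/backup_job")
print(first.json())   # {'stats': [{'process_id': '...', 'process_name': 'backup_job'}]}

second = requests.get("http://localhost:6011/backup_job")
print(second.json())  # "process running" - the duplicate name is rejected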
NOTE: Code Review is the better place for this type of question.
Below is my most recent attempt; but alas, when I print current_file it is always the same (first) .zip file in my directory.
Why/how can I iterate this to get to the next file in my zip directory?
My DIRECTORY_LOCATION has 4 zip files in it.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                return archive.extract(file.filename, config.UNZIP_LOCATION)
    return FileNotFoundError
find_file usage:
excel_data = pandas.read_excel(self.find_file())
Update:
I just tried changing return to yield at:
yield archive.extract(file.filename, config.UNZIP_LOCATION)
and now I am getting the below error at my find_file line.
ValueError: Invalid file path or buffer object type: <class 'generator'>
Then I altered it to use the generator object as suggested in the comments, i.e.:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
and now I am getting this error:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
TypeError: 'generator' object is not callable
Here is my main.py, in case it is helpful:
"""Start Point"""
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
import sys
import os
import config
import datetime
# from csv import DictWriter
if __name__ == "__main__":
try:
for file in os.listdir(config.DIRECTORY_LOCATION):
if 'VCCS' in file:
PENDING_RECORDS = FindPendingRecords().get_excel_data()
# Do operations on PENDING_RECORDS
# Reads excel to map data from excel to vital
MAP_DATA = FindPendingRecords().get_mapping_data()
# Configures Driver
VITAL_ENTRY = VitalEntry()
# Start chrome and navigate to vital website
VITAL_ENTRY.instantiate_chrome()
# Begin processing Records
VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
except:
print("exception occured")
raise
It is not tested.
def find_file(cls):
    listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
    total_files = 0
    for entry in listOfFiles:
        total_files += 1
        # if fnmatch.fnmatch(entry, pattern):
        current_file = entry
        print(current_file)
        """Finds the excel file to process"""
        archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
        for file in archive.filelist:
            if file.filename.__contains__('Contact Frog'):
                yield archive.extract(file.filename, config.UNZIP_LOCATION)
This is just your function rewritten with yield instead of return.
I think it should be used in the following way:
for extracted_archive in self.find_file():
    excel_data = pandas.read_excel(extracted_archive)
    # do whatever you want to do with excel_data here
self.find_file() is a generator and should be used like an iterator (read this answer for more details).
Try to integrate the previous loop into your main script. On each iteration of the loop it will read a different file into excel_data, so the body of the loop is also where you should do whatever you need to do with the data.
Not sure what you mean by:
just one each time the script is executed
Even with yield, if you execute the script multiple times, you will always start from the beginning (and always get the first file). You should read all of the files in the same execution.
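For illustration only, here is a rough sketch of how that loop might sit in main.py, assuming find_file() is (or is exposed through) a method of FindPendingRecords and that the rest of the pipeline can handle one extracted workbook per iteration; it is untested and swaps get_excel_data() for a direct pandas.read_excel call:

import pandas

if __name__ == "__main__":
    finder = FindPendingRecords()
    vital_entry = VitalEntry()
    vital_entry.instantiate_chrome()
    for extracted_archive in finder.find_file():
        # one extracted 'Contact Frog' workbook per iteration
        pending_records = pandas.read_excel(extracted_archive)
        map_data = finder.get_mapping_data()
        vital_entry.process_records(pending_records, map_data)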
I am trying to run a Python script that takes data from an input .csv file and writes it to an output .csv file, using an ID and column headers to cross-reference.
They are very large files, so I've included multiprocessing; the first step is to split the output file into parts so they can be processed in parallel.
I tried executing this on Computer A: the disk speed while splitting up the output file is 0 MB/s (or a very slow trickle), and the program never gets anywhere. Meanwhile, on Computer B the program runs correctly and splits the output file into parts at a rate of ~40 MB/s.
These are the exact same files and the exact same code, so why is Computer A not behaving properly? Is it an operating system issue? Am I missing a driver on Computer A? Something else?
The input and output files are located on the WD 4TB external HDD.
Computer A specs: Windows 10 Pro 64-bit, Intel i9 7920X 12-core, 8 x 8GB Geil Evo Potenza RAM, Samsung 850 Evo 500GB local SSD, WD 4TB external HDD, ASRock X299 Killer motherboard
Computer B specs: Windows 10 Pro 64-bit, Intel i7 6700K 4-core, 2 x 16GB Geil Evo Forza RAM, PNY CS1311 240GB SSD, WD 4TB external HDD, MSI B250M Gaming Pro motherboard
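Before the full script below: as a side check, timing a raw sequential write to the external drive on both machines can separate a drive/USB/driver problem from anything in the Python code. This is just a hedged sketch; the path and sizes are illustrative:

import os
import timeit

TEST_PATH = "E:\\temp_files\\io_benchmark.bin"  # put this on the external drive
CHUNK = b"\0" * (8 * 1024 * 1024)               # 8 MB per write
N_CHUNKS = 64                                   # ~512 MB total

start = timeit.default_timer()
with open(TEST_PATH, "wb", buffering=0) as fp:
    for _ in range(N_CHUNKS):
        fp.write(CHUNK)
    os.fsync(fp.fileno())                       # force the data out to the device
elapsed = timeit.default_timer() - start
print("%.1f MB/s" % (N_CHUNKS * 8 / elapsed))
os.remove(TEST_PATH)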
# std lib imports
import csv
import multiprocessing
import os
import os.path
import shutil
import sqlite3
import sys
import tempfile
import timeit
# third party imports
# our imports
buffer_size = 8192000  # 8.192 MB, 10x the default (io.DEFAULT_BUFFER_SIZE)
# working_dir = tempfile.gettempdir()
working_dir = "E:\\temp_files"
def return_csv_header(filename):
    """
    Returns the first row (the header) of the csv file
    filename as a list.
    """
    with open(filename, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        return next(reader)
def return_input_mapping(input_file, identifier):
    """
    Returns a {identifier:record} dictionary where
    identifier is the value of the identifier column
    for each row in the input file.
    record is a dictionary of {column:value}
    representing a row in a csv file.
    """
    to_ret = dict()
    with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            key = row[identifier]
            to_ret[key] = row
    return to_ret
def return_csv_contents(input_file):
    """
    Returns a list of lists representing the rows
    in a csv file.
    """
    to_ret = list()
    with open(input_file, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            to_ret.append(row)
    return to_ret
def create_db_and_table(csv_contents, identifier):
    """
    Creates a sqlite3 database and table.
    Creates the identifier table column along with more
    table columns named from col0 to colN. We do this because
    the csv column headers can be all sorts of weird stuff. And
    we only really care about the order of the columns, and the
    identifier so we can set it as the primary key.
    No rows are added.
    Returns the database path and table name as a tuple.
    """
    # assert that the identifier is in the csv_contents header
    header = csv_contents[0]
    assert(identifier in header)
    db_path = os.path.join(working_dir, "csv_input.sqlite")
    tablename = "data"
    # delete the database if it already exists
    if os.path.exists(db_path):
        os.remove(db_path)
    # create the database, table, and columns
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        # prepare the table's columns - it will look like this
        # (col1 text, col2 text, col3 text primary key, ...)
        name = "col"
        counter = 0
        column_names = "("
        for column in header:
            if column == identifier:
                column_names += "%s text primary key," % identifier
            else:
                column_names += "%s text," % (name + str(counter))
            counter += 1
        # remove the last comma and space
        if column_names.endswith(","):
            column_names = column_names[0:-1]
        column_names += ")"
        # prepare the sql statement
        sql = "CREATE TABLE %s %s" % (tablename, column_names)
        # some performance tweaks for the database
        cursor.execute("PRAGMA synchronous=OFF")
        cursor.execute("PRAGMA cache_size=20000")   # 20,000*1024 = 20.48MB
        cursor.execute("PRAGMA journal_mode=off")   # dont keep journal of operations
        cursor.execute("PRAGMA temp_store=memory")  # store temp files in memory
        # execute the statement
        cursor.execute(sql)
        conn.commit()
    return db_path, tablename
def delete_db(db_path):
    """
    Deletes the sqlite3 database file at the given db_path.
    """
    assert(os.path.exists(db_path) is True)
    os.remove(db_path)
    assert(os.path.exists(db_path) is False)
def load_db_content(db_path, table, contents):
    """
    Loads the database table with the given contents.
    Skips the first element in contents as that is the
    header aka the database column names.
    """
    header = contents[0]
    num_of_cols = len(header)
    assert(num_of_cols != 0)
    contents = contents[1:]  # remove the header from the contents
    # connect to the database
    with sqlite3.connect(db_path) as conn:
        # only commit once versus after every statement
        cursor = conn.cursor()
        cursor.execute("BEGIN IMMEDIATE")
        # insert into the database in chunks if needed
        limit = 999
        remaining = num_of_cols
        beginning = 0
        while remaining > limit:  # sqlite column limit
            # prepare the sql statement
            # this makes the string (?, ?, ?, ...)
            columns = "(" + "?," * limit
            columns = columns[:-1]  # remove last comma
            columns += ")"
            # prepare the columns to insert
            to_insert = []
            for i in range(len(contents)):
                to_insert.append(contents[i][beginning:beginning+limit])
            sql = "INSERT INTO %s VALUES %s" % (table, columns)
            cursor.executemany(sql, to_insert)
            remaining -= 999
            beginning += 999
        columns = "(" + "?," * remaining
        columns = columns[:-1]  # remove last comma
        columns += ")"
        to_insert = []
        for i in range(len(contents)):
            to_insert.append(contents[i][beginning:beginning+remaining])
        sql = "INSERT INTO %s VALUES %s" % (table, columns)
        cursor.executemany(sql, to_insert)
        # commit the changes
        conn.commit()
def get_na_dict(columns, identifier):
    """
    Returns a dict with the given columns as keys, and
    "n/a" as the values.
    Skip over the identifier because we want to keep
    that piece of data as it is.
    """
    to_ret = dict()
    for column in columns:
        if column == identifier:
            continue
        else:
            to_ret[column] = "n/a"
    return to_ret
def run_vlookup(input_file, output_file, identifier, db_path, table):
    """
    Completes the output file with data from the input file
    that matches the record identifier and the header columns.
    See the description at the top of this file for an example.
    Returns the path of the new output file.
    """
    # header of input file
    input_header = return_csv_header(input_file)
    # fill in the output file with data from the input file
    output_file_name = os.path.basename(output_file)
    temp_file = os.path.join(working_dir, output_file_name + ".tmp")
    with open(output_file, "r", newline="", buffering=buffer_size, encoding="utf-8") as inputcsv:
        with open(temp_file, "w", newline="", buffering=buffer_size, encoding="utf-8") as tempcsv:
            reader = csv.DictReader(inputcsv)
            # set restval to "" which will write empty values for columns
            # in the output file that are not in the input file
            # set extrasaction to "ignore" which will skip over columns
            # from the input file that are not in the output file
            writer = csv.DictWriter(tempcsv,
                                    fieldnames=reader.fieldnames,
                                    restval="",
                                    extrasaction="ignore")
            writer.writeheader()
            # open database connection
            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()
                for row in reader:
                    key = row[identifier]  # get the value for the lookup
                    # fetch the key from the database
                    sql = "SELECT * FROM %s where %s = '%s'" % (table, identifier, key)
                    cursor.execute(sql)
                    result = cursor.fetchone()
                    # update the output file with "n/a" if the item
                    # does not exist in the database
                    if result is None:
                        lookup_values = get_na_dict(input_header, identifier)
                    # otherwise update it with the values from the database
                    else:
                        lookup_values = dict(zip(input_header, result))
                    row.update(lookup_values)  # merge the two dictionaries
                    writer.writerow(row)
    return temp_file
def split_file(output_file, cpu_count):
    """
    Splits the output_file into cpu_count number of
    temporary files.
    Returns the names of the temporary files as a list.
    """
    # generate the temp file names
    output_file_name = os.path.basename(output_file)
    tempdir = working_dir
    temp_files = []
    for i in range(cpu_count):
        temp_file = os.path.join(tempdir, "%s_%s" % (output_file_name, i))
        temp_files.append(temp_file)
    # create the files
    with open(output_file, "rb", buffering=0) as outfile:
        # calculate size of the file
        size = outfile.seek(0, 2)
        outfile.seek(0, 0)
        # read the header in (at the same time moving the file pointer forward)
        header = return_header_bytes(outfile)
        # calculate the size of the smaller files (excluding the header)
        chunk_size = (size - len(header)) / cpu_count
        for file in temp_files:
            create_temp_file(file, chunk_size, outfile, header)
    return temp_files
def create_temp_file(file, chunk_size, outfile, header):
    """
    Creates file with the given header plus chunk_size data
    from the given outfile.
    Header is a byte string.
    If chunk_size is bigger than 100MB we read/write it in
    chunks of 100MB.
    After reading chunk_size amount of data, if the last byte
    does not end with a newline, we keep reading until
    it does. That way we dont write a file with truncated data.
    If we reach the end of outfile then we stop reading and we
    finish our last write.
    """
    # start the file with the header
    data = header
    memory_safe_chunk = 100000000  # bytes, so 100MB
    # read in chunk_size data from outfile
    if isinstance(chunk_size, float):
        chunk_size = int(chunk_size)
    # write the data to the temp file
    with open(file, "wb", buffering=0) as fp:
        # safe to read/write chunk in one go
        if chunk_size < memory_safe_chunk:
            data += outfile.read(chunk_size)
        # need to read/write chunk in chunks, go figure
        else:
            remaining = chunk_size
            while remaining > memory_safe_chunk:
                data += outfile.read(memory_safe_chunk)
                fp.write(data)
                data = b""
                remaining -= memory_safe_chunk
            data += outfile.read(remaining)
        # keep reading 1 byte until we reach a newline
        # or the end of the file
        while not data.endswith(b"\n"):
            char = outfile.read(1)
            # reached EOF
            if char == b"":
                break
            else:
                data += char
        fp.write(data)
    del data  # free up memory
def return_header_bytes(file_pointer):
    """
    Returns a byte string starting from the file_pointer until
    the first newline character.
    """
    to_ret = file_pointer.read(1)
    while not to_ret.endswith(b"\n"):
        to_ret += file_pointer.read(1)
    return to_ret
def merge_files(files):
    """
    Returns a file that has the contents of files merged
    together in one.
    Keeps only the header from the first file, and discards
    the rest as they are duplicates.
    """
    chunk_size = 100000000  # bytes, so 100MB
    master_file = os.path.join(working_dir, "temp.csv")
    with open(files[0], "rb") as fpointer:
        header = return_header_bytes(fpointer)
    # open master file for writing
    with open(master_file, "wb", buffering=0) as master_fp:
        master_fp.write(header)
        # loop through each file copying over the contents minus
        # the header
        for file in files:
            # read the temp file in chunks
            # and write it to the master file
            with open(file, "rb", buffering=0) as temp_fp:
                temp_fp.seek(len(header))
                data = temp_fp.read(chunk_size)
                while data != b"":
                    master_fp.write(data)
                    data = temp_fp.read(chunk_size)
            del data  # free up memory
    return master_file
def launch_processes(input_file, output_file, identifier):
    """
    Splits the output file into N temporary files.
    Launches a process to run the vlookup on each temp file.
    Merges the temp files back into one.
    Moves the final temp file to the output_file location.
    Deletes the N temp files.
    """
    # create temporary files equal to the amount of cpu cores
    cpu_count = multiprocessing.cpu_count()
    files = split_file(output_file, cpu_count)
    temp_files = []
    # load the input file into memory; this is a memory-hungry operation,
    # see note at top of file
    contents = return_csv_contents(input_file)
    # create sqlite3 database to store input mapping
    db_path, table = create_db_and_table(contents, identifier)
    load_db_content(db_path, table, contents)
    del contents  # free up memory
    # run vlookup with N processes equal to CPU count
    with multiprocessing.Pool(processes=cpu_count) as pool:
        results = []
        # launch asynchronous processing of each file
        for file in files:
            res = pool.apply_async(run_vlookup, (input_file, file, identifier, db_path, table))
            results.append(res)
        # wait for the processes to finish
        for result in results:
            res = result.get()
            temp_files.append(res)
        # collect the processes
        pool.close()
        pool.join()
    # delete input mapping db
    delete_db(db_path)
    # delete the small files
    for i in range(len(files)):
        os.remove(files[i])
    # merge temp files
    temp_file = merge_files(temp_files)
    # delete temp files
    for i in range(len(temp_files)):
        os.remove(temp_files[i])
    # replace original output file with merged temp file
    shutil.move(temp_file, output_file)
if __name__ == "__main__":
print(timeit.default_timer())
input_file = sys.argv[1]
output_file = sys.argv[2]
identifier = sys.argv[3]
launch_processes(input_file, output_file, identifier)
print(timeit.default_timer())
I have a function that reads from a JSON file and displays the content in a QTextEdit using PyQt5.
The problem is that when I try to show the content in the QTextEdit, only the last "File Name" record is displayed,
while if I print to the console, all of the "File Name" records are printed as they should be.
In the end I need to display the result the way the commented-out print does.
def displayReport(self, item):
    searchRes = os.path.join(os.getcwd(), "search_result")
    path = os.listdir(searchRes)
    data = []
    try:
        for file in path:
            rpjson = json.load(open(os.path.join(searchRes, item)))
            for js in rpjson:
                fileName = js["File Name"]
                srchwRD = js["Searched Word"]
                nbrOfOccur = str(js["Number Of Occurence"])
                result = [fileName + srchwRD + nbrOfOccur]
                print("this is file name {}".format(fileName))
                data.append(result)
                # print("****************" + "\n" + "File Name: " +
                #       js["File Name"] + "\n" + "Searched Word: " +
                #       js["Searched Word"] + "\n" + "Number Of Occurence: " +
                #       str(js["Number Of Occurence"]))
    except Exception as e:
        print("can't read JSON because {}".format(e))
    self.textEdit.setHtml("\n".join(data))
You are not "adding" to the textedit, instead you are replacing its content on each iteration of the loop - only the last content sticks.
Change
self.textEdit.setHtml(str(fileName)) # replaces the whole content of the qtestedit
to addding to its current content instead of replacing it.
Possbible solutions
See this SO post: How to append text to QPlainTextEdit without adding newline, and keep scroll at the bottom? for how to achieve this. (maybe you should add a newline between file names - you can adapt the solutions).
Maybe better way to do it:
Have a look at the API, append might also be a valid choice:QTextEdit.append(...)
collect all the text you need into a normal list of strings and set the QTextEdit only once after you visited all files. This should be faster performane wise as well as Qt does not update its element multiple times and has to process triggers and signals on it:
def displayReport(self, item):  # code untested, no mvce provided, may need small fixup(s)
    foundText = []
    searchRes = os.path.join(os.getcwd(), "search_result")
    path = os.listdir(searchRes)
    try:
        for file in path:
            rpjson = json.load(open(os.path.join(searchRes, item)))
            for js in rpjson:
                fileName = js["File Name"]
                print("this is file name {}".format(fileName))
                foundText.append(fileName)  # add to the list and join later; faster than
                                            # adding to an immutable string multiple times
    except Exception as e:
        print("can't read JSON because {}".format(e))
    self.textEdit.setHtml('\n'.join(foundText))  # set the text only once, after processing
I would probably go for the last option (collecting into list + set once) as it minimizes Signals and Triggers.
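For completeness, the append-based variant (option 2) would look roughly like this; QTextEdit.append() adds each string as a new paragraph, so no manual newlines are needed:

# inside the loop over rpjson, instead of collecting into a list:
for js in rpjson:
    self.textEdit.append(js["File Name"])  # appends one paragraph per file name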