How do I display a CSV file in line protocol format, like InfluxDB uses?
My CSV data is as below:
time, avg_FreshOrders, p95_FreshOrders, FreshOrders
1593648000000,1479.08407079646,2589,226
1593475200000,2242.8617021276596,5622,188
1593734400000,1682.3375,2738,160
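For reference, the code below is meant to turn each CSV row into one line of InfluxDB line protocol, something like:

FO,type=FreshOrders avg_FreshOrders=1479.08407079646,p95_FreshOrders=2589,FreshOrders=226 1593648000000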
I am using Python to convert it to line protocol as below:
import pandas as pd

# convert CSVs to line protocol
df_full = pd.read_csv("data/FreshOrders.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = [str(df_full["measurement"][d])
         + ",type=FreshOrders"
         + " "
         + "avg_FreshOrders=" + str(df_full["avg_FreshOrders"][d]) + ","
         + "p95_FreshOrders=" + str(df_full["p95_FreshOrders"][d]) + ","
         + "FreshOrders=" + str(df_full["FreshOrders"][d])
         + " " + str(df_full["time"][d]) for d in range(len(df_full))]

# append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
    thefile.write("%s\n" % item)
While running this Python code I am getting the errors below.
Traceback (most recent call last):
  File "csv_to_line.py", line 6, in <module>
    lines = [str(df_full["measurement"][d])
  File "csv_to_line.py", line 6, in <listcomp>
    lines = [str(df_full["measurement"][d])
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\series.py", line 871, in __getitem__
    result = self.index.get_value(self, key)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\site-packages\pandas\core\indexes\base.py", line 4405, in get_value
    return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
  File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 90, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 997, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1004, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
Can someone please help me to resolve this issue?
If you have a problem with the key-based lookup, you can use the positional index instead.
import pandas as pd

# convert CSVs to line protocol
df_full = pd.read_csv("haha.csv")
df_full["measurement"] = ['FO' for t in range(len(df_full))]
lines = []
for idx in range(len(df_full)):
    temp = str(df_full.iloc[idx, -1]) + ",type=FreshOrders" + " " + "avg_FreshOrders=" + str(df_full.iloc[idx, 1]) + "," \
           + "p95_FreshOrders=" + str(df_full.iloc[idx, 2]) + "," + "FreshOrders=" + str(df_full.iloc[idx, 3]) + " " + \
           str(df_full.iloc[idx, 0])
    lines.append(temp)

# append lines to a text file with DDL & DML:
thefile = open('data/import.txt', 'a+')
for item in lines:
    thefile.write("%s\n" % item)
Related
I'm using Thonny to work with a Raspberry Pi Pico running MicroPython. The script is:
def writejson(gps_altitudes, gps_time, baro_altitudes, baro_time):
    f = open("samples.json", "w")
    f.write('{\n"gps_altitude": [')
    result = ""
    c = 0
    for element in gps_altitudes:
        flag = True
        result = result + str(element)
        c += 1
        if c != len(gps_altitudes):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"gps_time": ['
    c = 0
    for element in gps_time:
        result = result + "[" + str(element[0]) + ", " + str(element[1]) + ", " + str(element[2]) + "]"
        c += 1
        if c != len(gps_time):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"baro_altitude": ['
    c = 0
    for element in baro_altitudes:
        flag = True
        result = result + str(element)
        c += 1
        if c != len(baro_altitudes):
            result += ", "
    f.write(result)
    result = ""
    result = result + '],\n"baro_time": ['
    c = 0
    for element in baro_time:
        result = result + "[" + str(element[0]) + ", " + str(element[1]) + ", " + str(element[2]) + "]"
        c += 1
        if c != len(baro_time):
            result += ", "
    f.write(result)
    result = ""
    f.write(']\n}')
    f.close()
To sum everything up, I just created a JSON file.
The problem is that if the file exceeds a certain length (which varies), Thonny gives me this error when I try to open it:
PROBLEM IN THONNY'S BACK-END: Exception while handling 'read_file' (thonny.plugins.micropython.mp_back.ManagementError: Script produced errors).
The log is:
Traceback (most recent call last):
  File "B:\Thonny\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "B:\Thonny\lib\site-packages\thonny\base_file_browser.py", line 586, in on_double_click
    self.open_file(path)
  File "B:\Thonny\lib\site-packages\thonny\base_file_browser.py", line 1201, in open_file
    get_workbench().get_editor_notebook().show_remote_file(path)
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1069, in show_remote_file
    return self.show_file(make_remote_path(target_filename))
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1043, in show_file
    editor = self.get_editor(filename, True)
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1134, in get_editor
    return self._open_file(filename_or_id)
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 1116, in _open_file
    if editor._load_file(filename):
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 206, in _load_file
    result = self._load_remote_file(filename)
  File "B:\Thonny\lib\site-packages\thonny\editors.py", line 265, in _load_remote_file
    raise RuntimeError(response["error"])
RuntimeError: Backend terminated
I don't understand why this happens. If someone can help, thanks in advance. (:
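As an aside (not a fix for the Thonny back-end error), the same file could be produced without hand-assembling the brackets and commas, assuming the json module is available on the Pico's MicroPython build; a minimal sketch:

import json

def writejson(gps_altitudes, gps_time, baro_altitudes, baro_time):
    # Build a plain dict and let json.dump emit all the punctuation.
    data = {
        "gps_altitude": list(gps_altitudes),
        "gps_time": [list(t) for t in gps_time],      # each entry is a 3-element time tuple
        "baro_altitude": list(baro_altitudes),
        "baro_time": [list(t) for t in baro_time],
    }
    with open("samples.json", "w") as f:
        json.dump(data, f)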
I'm trying to record some values with the TensorBoard writer using the following lines. If I leave the commented section commented, the code runs fine. But when I uncomment it, I get an error. Any advice would be appreciated.
if (tb_writer != None):
    for key, value in qps.items():
        tb_writer.add_scalar("QPv2_" + mode + "_Threshold_" + str(key), value, global_step)
    # for annotation_idx in range(1, args.num_classes-1):
    #     annotation_class = annotation_classes[annotation_idx]
    #     for threshold_idx in range(len(annotation_thresholds)):
    #         threshold = annotation_thresholds[threshold_idx]
    #         tb_writer.add_scalar(annotation_class + "_Threshold_" + str(threshold) + "_Precision", str(precision[threshold_idx][annotation_idx].item()), global_step)
    #         tb_writer.add_scalar(annotation_class + "_Threshold_" + str(threshold) + "_Recall", str(recall[threshold_idx][annotation_idx].item()), global_step)
Traceback (most recent call last):
  File "train_transformer_v5.py", line 782, in <module>
    main()
  File "train_transformer_v5.py", line 763, in main
    global_step, tr_loss, metrics_result = trainer.train(args)
  File "train_transformer_v5.py", line 451, in train
    tb_writer=self.tb_writer, global_step=self.global_step)
  File "/home/xianx/model/metrics_v5.py", line 606, in eval_metrics
    tb_writer.add_scalar(annotation_class + "_Threshold_" + str(threshold) + "_Precision", str(precision[threshold_idx][annotation_idx].item()), global_step)
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/tensorboard/writer.py", line 348, in add_scalar
    scalar_value = workspace.FetchBlob(scalar_value)
  File "/opt/conda/lib/python3.6/site-packages/caffe2/python/workspace.py", line 379, in FetchBlob
    result = C.fetch_blob(StringifyBlobName(name))
RuntimeError: [enforce fail at pybind_state.cc:221] ws->HasBlob(name). Can't find blob: 0.7158585786819458
My issue was that I was using .add_scalar(...) while converting my value to a string. Removing that conversion fixed the problem.
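In other words, add_scalar expects the value as a number; the corrected calls from the commented-out block would look like:

tb_writer.add_scalar(annotation_class + "_Threshold_" + str(threshold) + "_Precision",
                     precision[threshold_idx][annotation_idx].item(), global_step)
tb_writer.add_scalar(annotation_class + "_Threshold_" + str(threshold) + "_Recall",
                     recall[threshold_idx][annotation_idx].item(), global_step)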
I'd like to include the month and year in my file name based on the month submitted.
I defined variables that would be used to build my file name.
MTH = '20' + df['Month'].astype(str) + '/01'
MONTH = pd.to_datetime(MTH).dt.month
YEAR = pd.to_datetime(MTH).dt.year
Save file directory:
df.to_excel(r'C:\OUTPUT\SALES_'+ MONTH + '_' + YEAR + '.xlsx', index=False )
Error:
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\generic.py", line 2284, in to_excel
formatter.write(
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\io\formats\excel.py", line 834, in write
writer = ExcelWriter( # type: ignore[abstract]
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\io\excel\_xlsxwriter.py", line 191, in __init__
super().__init__(
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\io\excel\_base.py", line 925, in __init__
self.handles = get_handle(
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\io\common.py",
line 608, in get_handle
ioargs = _get_filepath_or_buffer(
File "C:\SR\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\io\common.py",
line 395, in _get_filepath_or_buffer
raise ValueError(msg)
ValueError: Invalid file path or buffer object type: <class 'pandas.core.series.Series'>
You probably mean:
df.to_excel(r'C:\OUTPUT\SALES_' + str(MONTH.item()) + '_' + str(YEAR.item()) + '.xlsx', index=False)
Or:
df.to_excel(r'C:\OUTPUT\SALES_' + str(MONTH[0]) + '_' + str(YEAR[0]) + '.xlsx', index=False)
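Either way the scalar has to be wrapped in str() before concatenation; an f-string handles the conversion implicitly (assuming every row of df shares the same month, so the first value is representative):

month = MONTH.iat[0]
year = YEAR.iat[0]
df.to_excel(rf'C:\OUTPUT\SALES_{month}_{year}.xlsx', index=False)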
I'm trying to use multiprocessing on a fuzzy matching script I've written. I need to run 1.4 billion comparisons, which takes 30+ hours without multiprocessing, so I'm attempting to integrate it here.
def fuzzyCompare(data1, data2):
    print("Performing Fuzzy Matches...\n")
    similarityDf = pd.DataFrame(columns=["Similarity Ratio", "Id1", "Id2"])
    count = 0
    for i in range(len(data1)):
        str1 = data1["CompanyName"][i] + "," + data1["Address1"][i] + "," + data1["City"][i] + "," + data1["PostalZip"][i]
        str1 = str1.lower().replace(" ", "")
        for j in range(len(data2)):
            str2 = data2["Company"][j] + "," + data2["Physical Street 1"][j] + "," + data2["Physical City"][j] + "," + data2["Physical Postal Code/ZIP"][j]
            str2 = str2.lower().replace(" ", "")
            ratio = fuzz.ratio(str1, str2)
            if ratio > 0:
                similarityDf.at[count, "Similarity Ratio"] = str(ratio) + "%"
                similarityDf.at[count, "Id1"] = data1["Id1"][i]
                similarityDf.at[count, "Id2"] = data2["Id2"][j]
                count = count + 1
    print("Performed " + str(len(data1)*len(data2)) + " Fuzzy Comparisons.\n")
    return similarityDf

def main():
    data1 = readData(excelFile1)  # read Excel file into DataFrame
    data2 = readData(excelFile2)  # read Excel file into DataFrame
    df_split = np.array_split(data2, 4)  # split data2 into 4
    args = [(data1, df_split[0]),
            (data1, df_split[1]),
            (data1, df_split[2]),
            (data1, df_split[3])]
    with mp.Pool(processes=4) as p:
        outputData = pd.concat(p.starmap(fuzzyCompare, args))

if __name__ == "__main__":
    mp.freeze_support()
    main()
I have a print statement at the end of fuzzyCompare() and it prints the result for only one worker; then I receive the following error:
multiprocessing.pool.RemoteTraceback
Traceback (most recent call last):
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "C:\Users\...\Documents\Code\Python\fuzzyCompare\multiFuzzyCLI.py", line 47, in fuzzyCompare
    str2 = data2["Company"][j] + "," + data2["Physical Street 1"][j] + "," + data2["Physical City"][j] + "," + data2["Physical Postal Code/ZIP"][j]
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\series.py", line 1068, in __getitem__
    result = self.index.get_value(self, key)
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\site-packages\pandas\core\indexes\base.py", line 4730, in get_value
    return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
  File "pandas\_libs\index.pyx", line 80, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 88, in pandas._libs.index.IndexEngine.get_value
  File "pandas\_libs\index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 992, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 998, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "multiFuzzyCLI.py", line 145, in <module>
    main()
  File "multiFuzzyCLI.py", line 132, in main
    outputData = pd.concat(p.starmap(fuzzyCompare, args))
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 276, in starmap
    return self._map_async(func, iterable, starmapstar, chunksize).get()
  File "C:\Users\...\AppData\Local\Programs\Python\Python37-32\lib\multiprocessing\pool.py", line 657, in get
    raise self._value
KeyError: 0
I know what a KeyError is; I just don't understand how it arises in this case.
Thanks
You get a KeyError because you try to index every DataFrame with an index starting from 0, but np.array_split maintains the original Index of each split.
To properly index the i-th row of a DataFrame you should always use DataFrame.iloc, as this works for any Index, not just a RangeIndex that begins at 0. So you need to change all of your selections to be of the form:
data2["Company"].iloc[j] # Not data2["Company"][j]
Worked Example
import pandas as pd
import numpy as np

df = pd.DataFrame({'CompanyName': list('abcdefghij')})
df_split = np.array_split(df, 4)

# For the first split this works, as we get lucky: the index starts from 0
data2 = df_split[0]
for j in range(len(data2)):
    print(data2['CompanyName'][j])
# a
# b
# c

# Later slices fail; `df_split[1].index` is RangeIndex(start=3, stop=6, step=1)
data2 = df_split[1]
for j in range(len(data2)):
    print(data2['CompanyName'][j])
# KeyError: 0

# Instead, properly select with `.iloc`
for j in range(len(data2)):
    print(data2['CompanyName'].iloc[j])
# d
# e
# f
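If you would rather keep the plain [j] lookups inside fuzzyCompare, an alternative is to reset each split's index before handing it to the pool; a minimal sketch of that variant:

import numpy as np
import pandas as pd

df = pd.DataFrame({'CompanyName': list('abcdefghij')})

# reset_index(drop=True) gives every split a fresh RangeIndex starting at 0
splits = [part.reset_index(drop=True) for part in np.array_split(df, 4)]
print(splits[1]['CompanyName'][0])  # d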
I am receiving the following KeyError. I have a large data set (around 10 million records) and I am trying to filter only the records whose 'tags' field contains a key word. I can compare for exact matches easily, but filtering for contained key words seems to be quite difficult, and every method I have tried from SO throws an error. I am new to Pandas, so please forgive me if I am committing a cardinal sin. (I took Big Data in university and we worked mostly in Spark; I realize the code is a bit hacky right now, I'm just trying to get it to function.)
Notes:
1. The data is stored across quarterly files, so I am iterating over the files and concatenating the results (which is the reason for the index and the counter).
2. I commented out the lines that allow me to parse for exact matches (#is_goodwill = data_frame['tag'] == word_of_interest and #good_will_relation = data_frame[is_goodwill]).
Goal: Filter for records containing the key word word_of_interest.
It does not have to be an exact match to the key word, but rather contain it. The code is below the error.
Error
Traceback (most recent call last):
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "parsePandasSECData.py", line 64, in <module>
    main()
  File "parsePandasSECData.py", line 42, in main
    good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
    return self._getitem_column(key)
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
    return self._get_item_cache(key)
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
    values = self._data.get(item)
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\internals.py", line 3843, in get
    loc = self.items.get_loc(item)
  File "C:\Users\tyler\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'tags'
Code
import pandas as pd
import os.path
import time
import csv

def main():
    start_time = time.time()
    file_path = "C:/Users/TA/Desktop/Data/"
    word_of_interest = "ShareholdersEquity"
    NUM_FILE_NAME = "number.csv"
    SUB_FILE_NAME = "subnumber.csv"
    quarterly_list = ['Q1', 'Q2', 'Q3', 'Q4']
    all_concat_data = None
    pd.set_option('display.max_row', 1000)
    for counter in range(9, 19):
        for index in range(len(quarterly_list)):
            # iterates over all file locations
            num_file_path = file_path + quarterly_list[index] + str(counter) + '/' + NUM_FILE_NAME
            sub_file_path = file_path + quarterly_list[index] + str(counter) + '/' + SUB_FILE_NAME
            if os.path.exists(num_file_path) and os.path.exists(sub_file_path):
                print('Starting ' + quarterly_list[index] + str(counter) + ' Data')
                # Load data
                data_frame = pd.read_csv(num_file_path,
                                         dtype={'adsh': str, 'tag': str, 'version coreg': str, 'ddate': int, 'qtrs': int, 'uom': str, 'value': float, 'footnote': str},
                                         header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # Comparative data
                transaction_descriptions = pd.read_csv(sub_file_path, dtype={'adsh': str}, header=0, delimiter='\t', low_memory=False, encoding='ISO-8859-1')
                # is_goodwill = data_frame['tag'] == word_of_interest
                # good_will_relation = data_frame[is_goodwill]
                good_will_relation = data_frame[data_frame['tags'].str.contains(word_of_interest)]
                captured_data = good_will_relation.merge(transaction_descriptions, how='inner', left_on='adsh', right_on='adsh')
                if all_concat_data is not None:
                    all_concat_data = pd.concat([all_concat_data, captured_data])
                else:
                    all_concat_data = captured_data
            else:
                print(quarterly_list[index] + str(counter) + ' Does not exist...Skipping')
    print('Starting Writer operation')
    writer = pd.ExcelWriter('output.xlsx')
    all_concat_data.to_excel(writer, 'Sheet1')
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
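One thing worth checking against the traceback: the dtype mapping passed to pd.read_csv and the commented-out exact-match line both reference a column named 'tag', while the failing filter asks for 'tags', which is exactly the key the KeyError reports. Assuming the column really is named 'tag', the substring filter would read:

good_will_relation = data_frame[data_frame['tag'].str.contains(word_of_interest, na=False)]

Here na=False (a standard str.contains parameter) keeps rows with a missing tag from producing NaN in the boolean mask.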