I have several JSON files in a local directory, and I read all of them with this code:
import os
import pandas as pd

path_to_json = 'C:/Users/../Desktop/NewData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except Exception:
        return dict()

list_of_df = []
for i in range(len(json_files)):
    try:
        file_name = json_files[i]
        # join the directory so the file is found regardless of the working directory
        df = pd.read_json(os.path.join(path_to_json, file_name), lines=True)
        df = df[['something']]
        df = df['something'].apply(func)
        df = pd.json_normalize(df)
        df = pd.DataFrame(df[["something", "something1"]])
        df['Index'] = 'weather5' + str(6+i)
    except Exception:
        pass
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index', 'something', 'something1']]
df.head()
The names of the JSON files I read are weather56, weather57, weather58, weather59, weather60, and weather61.
I am using the line df['Index'] = 'weather5' + str(6+i) to label each file's rows in the dataframe. However, it seems I am not labelling them properly, as they now appear in the dataframe as:
Index
weather56
weather57
weather58
weather59
weather510
weather511
How can I adjust the line df['Index'] = 'weather5' + str(6+i) so the labels match the JSON file names?
df['Index'] = 'weather5' + str(6+i)
As i goes from 0 to 5, the corresponding values generated are going to be
weather56 // 'weather5' + str(6 + 0)
weather57
weather58
weather59
weather510 // 'weather5' + str(6 + 4) = 'weather5' + '10'
weather511
If you change the line to
df['Index'] = 'weather' + str(56+i)
it should appear as -
weather56
weather57
weather58
weather59
weather60
weather61
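A more robust alternative (a sketch, not part of the original answer): since json_files already holds the actual file names, you can derive the label directly from each name instead of reconstructing it from the loop counter, which stays correct even if files are added or renamed:

for file_name in json_files:
    df = pd.read_json(os.path.join(path_to_json, file_name), lines=True)
    # ... same transformations as above ...
    # use the file name itself (minus the .json extension) as the label
    df['Index'] = os.path.splitext(file_name)[0]
    list_of_df.append(df)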
As I am new to Python, I need some help comparing two XML files.
These are the conditions:
To print the common fullPath and Name values (fullPath and Name are attributes present in the XML files) between the two XML files.
To print the values which are present only in the first file and not in the second file.
To print the values which are present only in the second file and not in the first file.
Later, I have to write this output to an Excel file with a different sheet for each condition.
For example: 1st condition in sheet 1, 2nd condition in sheet 2, 3rd condition in sheet 3 of the same Excel file.
Can anyone please help me with code that satisfies the conditions I have mentioned above?
This is the code I have tried:
from lxml import etree

Base = etree.parse('Base.xml')
Target = etree.parse('Target.xml')

Base_fullPath = Base.xpath("//Member/@fullPath")
Target_fullPath = Target.xpath("//Member/@fullPath")
Base_name = Base.xpath("//Member/@name")
Target_name = Target.xpath("//Member/@name")

def match(Base_fullPath, Target_fullPath, Base_name, Target_name):
    Base_fullPath_set = set(Base_fullPath)
    Target_fullPath_set = set(Target_fullPath)
    Base_name_set = set(Base_name)
    Target_name_set = set(Target_name)
    # compare the sets explicitly; a tuple such as (a & b, c & d) is always truthy
    if (Base_fullPath_set & Target_fullPath_set) or (Base_name_set & Target_name_set):
        y = Base_fullPath_set & Target_fullPath_set
        z = Base_name_set & Target_name_set
        # "with" flushes and closes the files before they are read back later
        with open('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv', 'w') as x:
            print("common details Full Path: \n", *y, sep='\n', file=x)
        with open('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv', 'w') as x:
            print("\n common details Name: \n", *z, sep='\n', file=x)
    else:
        print("No Matches Found")

match(Base_fullPath, Target_fullPath, Base_name, Target_name)

def non_match_elements(list_base, list_target):
    non_match_base = []
    non_match_target = []
    for i in list_base:
        if i not in list_target:
            non_match_base.append(i)
    for i in list_target:
        if i not in list_base:
            non_match_target.append(i)
    # return both lists together; a second return statement would never be reached
    return non_match_base, non_match_target

list_base = Base.xpath("//Member/@*")
list_target = Target.xpath("//Member/@*")

non_match_base, non_match_target = non_match_elements(list_base, list_target)

with open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', 'w') as x:
    print("\n Base Details: \n", *non_match_base, sep='\n', file=x)

with open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', 'w') as x:
    print("\n Target Details: \n", *non_match_target, sep='\n', file=x)

import pandas as pd

df = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv')
df1 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv')
df2 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', delimiter=';;', on_bad_lines='skip', engine='python')
df3 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', delimiter=';', on_bad_lines='skip', engine='python')

with pd.ExcelWriter("C:\\Users\\pvl\\Desktop\\New folder\\combined.xlsx") as writer:
    df1.to_excel(writer, sheet_name="Common_name", index=False)
    df2.to_excel(writer, sheet_name="base_Details", index=False)
    df3.to_excel(writer, sheet_name="target_Details", index=False)
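A leaner route to the three-sheet requirement (a minimal sketch under the same assumptions about the XML layout, skipping the intermediate CSV files entirely) is to compute the three sets with set operations and write each one straight to its own sheet:

import pandas as pd
from lxml import etree

Base = etree.parse('Base.xml')
Target = etree.parse('Target.xml')

base_paths = set(Base.xpath("//Member/@fullPath"))
target_paths = set(Target.xpath("//Member/@fullPath"))

common = sorted(base_paths & target_paths)       # condition 1
only_base = sorted(base_paths - target_paths)    # condition 2
only_target = sorted(target_paths - base_paths)  # condition 3

with pd.ExcelWriter('combined.xlsx') as writer:
    pd.DataFrame({'fullPath': common}).to_excel(writer, sheet_name='Common', index=False)
    pd.DataFrame({'fullPath': only_base}).to_excel(writer, sheet_name='Only_in_Base', index=False)
    pd.DataFrame({'fullPath': only_target}).to_excel(writer, sheet_name='Only_in_Target', index=False)

The same three-set pattern works for the name attribute.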
I am currently having an issue with the Apache NiFi processor ExecuteStreamCommand and the implementation of a Python script.
I wrote the code below to transform 50 different CSV files into JSON. Afterwards I am going to write those JSON files to HDFS.
import json
import pandas as pd

df = pd.read_csv(r'***.csv',
                 sep='\t',
                 skiprows=2)
df = df.dropna(axis=1, how='all')
df = df.drop_duplicates(keep='first')
del df['Charge']
df = df.rename(columns={df.columns[0]: "Zeitstempel", df.columns[1]: "Maschine"})
df.columns = map(str.lower, df.columns)
df['zeitstempel'] = pd.to_datetime(df['zeitstempel'], format='%d.%m.%y, %X')
df['zeitstempel'] = df['zeitstempel'].astype(str)

# strip a leading underscore from any column name
columns = list(df.columns)
for column in range(len(columns)):
    if str(columns[column]).startswith('_'):
        columns[column] = columns[column][1:]
df.columns = columns

machine = df["maschine"][0]
day = str(df["zeitstempel"][0])[5:7]
month = str(df["zeitstempel"][0])[8:10]
year = str(df["zeitstempel"][0])[0:4]

fileName = machine + "_" + year + "_" + month + "_" + day + ".json"
filePath = "***" + fileName

df.to_json(filePath, orient='records', date_format='iso', date_unit='s', lines=True)
The script works fine when I run it locally, but how do I need to change the input and output for NiFi?
The NiFi-flow is as follow: ListFile > FetchFile > ExecuteStreamCommand > PutHDFS.
I tried the code as follows:
#!/usr/bin/env python2
import json
import sys  # needed for sys.stdin / sys.stdout
import pandas as pd

df = pd.read_csv(sys.stdin,
                 sep='\t',
                 skiprows=2)
df = df.dropna(axis=1, how='all')
df = df.drop_duplicates(keep='first')
del df['Charge']
df = df.rename(columns={df.columns[0]: "Zeitstempel", df.columns[1]: "Maschine"})
df.columns = map(str.lower, df.columns)
df['zeitstempel'] = pd.to_datetime(df['zeitstempel'], format='%d.%m.%y, %X')
df['zeitstempel'] = df['zeitstempel'].astype(str)

columns = list(df.columns)
for column in range(len(columns)):
    if str(columns[column]).startswith('_'):
        columns[column] = columns[column][1:]
df.columns = columns

machine = df["maschine"][0]
day = str(df["zeitstempel"][0])[5:7]
month = str(df["zeitstempel"][0])[8:10]
year = str(df["zeitstempel"][0])[0:4]

fileName = machine + "_" + year + "_" + month + "_" + day + ".json"

df.to_json(sys.stdout, orient='records', date_format='iso', date_unit='s', lines=True)
And configured the processor like this (the configuration screenshot is not included here):
Thank you in advance from Germany!
Nicko
Configure your ExecuteStreamCommand processor something like this:
Also please check the official docs - ExecuteStreamCommand
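As a rough guide (the property names come from the stock processor; the interpreter and script paths are placeholders you would adapt):

Command Path: python
Command Arguments: /path/to/transform_csv.py
Ignore STDIN: false

With a setup along these lines, the incoming FlowFile content is piped to the script's sys.stdin, and whatever the script writes to sys.stdout becomes the content of the outgoing FlowFile, which PutHDFS then writes out.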
I have a list of dataframes and am attempting to export each one to a folder on disk using the pandas .to_csv() method. However, only the last item in the list of dataframes is being written to disk as a .csv file.
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path(r'C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs = []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_id = df.iloc[0, 37]
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)
except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way you name your exported files.
After running through the loop, county_id will be equal to the last county_id, i.e. the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are named with the same county_id and date; in other words, each one overwrites the previous.
To avoid this, you can create a new list called county_ids and then use the last loop to change the name of the saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path(r'C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs, county_ids = [], []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        # remember this frame's county_id alongside the frame itself
        county_ids.append(df.iloc[0, 37])
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}/ADDR_{county_ids[i]}_{date}.csv', index=False)
except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
Obviously I cannot test this - if you do run it, there may be lines that need tweaking. However, I'd write the code something like the below. Basically I'd call a function to do the replacement as I open each file, and write out immediately.
If you can get it working, it will probably be faster, and it reads slightly better as there are fewer lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'

today = datetime.date.today()
date = today.strftime('%Y%m%d')

def updateFrame(f):
    new_df = pd.DataFrame()
    new_df['Original Addr string'] = f['StreetConc']
    new_df['Addr #'] = f['AddNum']
    new_df['Prefix'] = f['StPreDir']
    new_df['Street Name'] = f['StName']
    new_df['StreetType'] = f['StType']
    new_df['Suffix'] = f['StDir']
    new_df['Multi-Unit'] = ''
    new_df['City'] = f['City']
    new_df['Zip Code'] = f['PostCode']
    new_df['4'] = f['PostalExt']
    new_df['County'] = f['CountyID']
    new_df['Addr Type'] = ''
    new_df['Precint Part Name'] = ''
    new_df['Lat'] = f['X']
    new_df['Long'] = f['Y']
    # better way to replace without looping the rows...
    new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
    return new_df

for file in os.listdir(CSV_Folder):
    working_file = str(CSV_Folder) + '/' + file
    if working_file.endswith('.csv'):
        try:
            df = pd.read_csv(working_file)
            county_id = str(df.iloc[0, 37])
            # the function returns a frame so you can treat it as such...
            updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
        except FileNotFoundError:
            print(f'{file} not found in {CSV_Folder}')
        except PermissionError:
            print('Check syntax of paths')
        else:
            print('Process Complete')
I am attempting to convert some columns to a date format but am not having any luck. Here is my code:
from datetime import date, datetime
from utils import misc_errors
from os import listdir, remove
from os.path import isfile, join
from pathlib import Path
import pandas as pd
import shutil
import csv
import io
import codecs

# getting paths that will be used later
path = str(Path().absolute()) + '\\'
files = []
fetch = r'C:\...' + '\\'  # path redacted
net = r'C:\...' + '\\'    # path redacted

# getting the names of the files needed to copy
allfiles = [f for f in listdir(fetch) if isfile(join(fetch, f))]
for name in allfiles:
    if name.endswith('csv'):
        files.append(name)

for file_name in files:
    # copy the file
    shutil.copy2(fetch + file_name, path + file_name)
    # get the date for later
    file_date = date.today().strftime("%Y%m%d")
    # Reading the data from the csv file
    #file_df = pd.read_csv(file_name, sep=',', quotechar='"', thousands=',', encoding='Latin-1')
    file_df = pd.read_csv(file_name, sep=',', delimiter=',', quotechar='"', thousands=',', encoding='Latin-1', dtype='object', low_memory=False, skiprows=5)
    file_df.columns = [col.strip() for col in file_df.columns]
    # populate the count column
    total = len(file_df.index)
    count = []
    for i in range(0, total):
        count.append('1')
    file_df["count()"] = count
    # get a list of the headers for use later
    headers = file_df.columns.values.tolist()
    file_df.fillna('', inplace=True)
    if 'project' in file_name:
        # remove all duplicates from the projects file
        file_df = file_df.drop_duplicates(keep='first')
        file_final = "PROJECTS.FULL." + file_date
        supplier = []
        for i in range(0, total):
            supplier.append('Unclassified')
        file_df["Suppliers - ERP Supplier ID"] = supplier
        file_df["Suppliers - ERP Supplier"] = supplier
        file_df = file_df.apply(lambda x: pd.Series([str(x[i]).replace("\n", '') for i in range(0, len(x))], index=headers), axis=1)
        num_headers = [r"sum(Annual Spend Amount)", r"sum(Total Contract Value Amount)"]
        for header in num_headers:
            file_df[header] = ['{0:.0f}'.format(float(file_df[header][i])) if file_df[header][i] == 0 else '{0:,.2f}'.format(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0, len(file_df[header]))]
        header = r"sum(% of Total Contract Value in US)"
        file_df[header] = [int(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0, len(file_df[header]))]
        header = "Reporting Year"
        file_df[header] = [int(float(file_df[header][i])) if file_df[header][i] != '' else '' for i in range(0, len(file_df[header]))]
        word_headers = ["Description", "Key Considerations", "Key Highlights / Value Statement", "Status Update"]
        for header in word_headers:
            file_df = misc_errors(file_df, header)
        file_df.columns = [c.replace("–", "-") for c in file_df]
    file_headers = ["Begin Date", "End Date - Date", "Estimated Completion Date - Date",
                    "Anticipated T&O Legal Engagement Date - Date", "Benefits Start Date - Date", "Benefits End Date - Date"]
    pd.to_datetime(file_headers['Begin Date'], errors='ignore')
    file_df.to_csv(file_final, index=False, encoding="latin-1")
    remove(file_name)
    shutil.copy2(path + file_final, net + file_final)
I am trying to convert the columns listed in file_headers (near the bottom) to dates.
Updated and added the full error log here:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-3-f04094ba593a> in <module>
79 "Anticipated T&O Legal Engagement Date - Date","Benefits Start Date - Date", "Benefits End Date - Date" ]
80
---> 81 pd.to_datetime(file_headers['Begin Date'], errors='ignore')
82
83 file_df.to_csv(file_final, index=False, encoding="latin-1")
TypeError: list indices must be integers or slices, not str
Still not working though. Thanks for all the help so far, and let me know if there is anything else I can do here.
You may want to check the content of your Date column. Try to loop over it and spot the non-working values:
for row in df.itertuples():
    try:
        pd.to_datetime(row.Date)
    except Exception:
        print(row.Date)
Then think about what you want to do with them.
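Separately, the TypeError in the log comes from indexing the file_headers list with a string; you want to index the dataframe and assign the converted columns back. A minimal sketch of that fix (errors='coerce' turns unparseable values into NaT so they are easy to spot):

for header in file_headers:
    # index the dataframe, not the list of header names,
    # and assign the converted column back to the frame
    file_df[header] = pd.to_datetime(file_df[header], errors='coerce')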
I am combining very large data sets using Python. The script works completely fine. However, there is one specific field that may or may not have a comma inside of it. Does anyone know how to remove the comma? FYI, this is how the data is collected; it cannot be removed on collection. The field in question is the "NAME" field.
I have tried to implement a sep=r',(?!\s)' lookahead, and that screws my data up even more.
THANKS!
import csv
import shutil
import os
import pandas as pd
from os import path

def combinecsv(source_folder):
    all_files = os.listdir(source_folder)
    master_df = None
    for anyfile in all_files:
        if anyfile.lower().endswith(".csv"):
            file_path = path.join(source_folder, anyfile)
            print("opening file path: {}".format(file_path))
            df = pd.read_csv(file_path)
            if master_df is None:
                master_df = df
            else:
                master_df = master_df.append(df)

    new_df = pd.DataFrame()
    new_df["MSG_TYPE"] = master_df["MSG_TYPE"]
    new_df["MMSI"] = master_df["MMSI"]
    new_df["NAME"] = master_df.apply(lambda row: check_for_none(row["NAME"]), axis=1)
    new_df["LAT_AVG"] = master_df["LAT_AVG"]
    new_df["LON_AVG"] = master_df["LON_AVG"]
    new_df["PERIOD"] = master_df.apply(lambda row: convert_period(row["PERIOD"]), axis=1)
    new_df["SPEED_KNOTS"] = master_df.apply(lambda row: check_for_none(row["SPEED_KNOTS"]), axis=1)
    new_df["COG_DEG"] = master_df.apply(lambda row: check_for_none(row["COG_DEG"]), axis=1)
    new_df["SHIP_AND_CARGO_TYPE"] = master_df.apply(lambda row: check_for_none(row["SHIP_AND_CARGO_TYPE"]), axis=1)
    new_df["DRAUGHT"] = master_df.apply(lambda row: check_for_none(row["DRAUGHT"]), axis=1)
    new_df["LEN"] = master_df.apply(lambda row: combine_bowstern(row["DIM_BOW"], row["DIM_STERN"]), axis=1)
    # axis=1 traverses rows, not columns

    new_folder = path.join(source_folder, "output")
    if not path.exists(new_folder):
        os.mkdir(new_folder)
    new_csvpath = path.join(new_folder, "output.csv")
    print("saving csv to {}".format(new_csvpath))
    new_df.to_csv(new_csvpath, index=False, quoting=csv.QUOTE_NONNUMERIC)

def check_for_none(value):
    if value == 'None':
        return ""
    else:
        return value

def convert_period(period):
    y = str(period[2:4])
    m = str(period[5:7])
    d = str(period[8:10])
    t = str(period[11:16])
    periodnewformat = "{}/{}/{} {}".format(d, m, y, t)
    return periodnewformat

def combine_bowstern(bow, stern):
    bow_int = 0
    stern_int = 0
    if bow != "None":
        bow_int = int(bow)
    if stern != "None":
        stern_int = int(stern)
    return bow_int + stern_int

if __name__ == "__main__":
    source_folder = r'C:\Users\MTTA Standalone\Desktop\Code\csvcombine'
    combinecsv(source_folder)
Here is a sample of the data set, with and without the comma:
MSG_TYPE,MMSI,NAME,IMO_NUMBER,CALL_SIGN,LAT_AVG,LON_AVG,PERIOD,SPEED_KNOTS,COG_DEG,HEADING_DEG,NAV_STATUS,NAV_SENSOR,SHIP_AND_CARGO_TYPE,DRAUGHT,DIM_BOW,DIM_STERN,DIM_PORT,DIM_STARBOARD,MMSI_COUNTRY_CD,RECEIVER
1,249830000,ZIM LUANDA,9403229,9HA2029,37.825850,-74.340755,2018-08-01 00:00:00.000,11.5,196.4,198,0,1,71,10.9,197,63,21,11,MT,D05MN-HR-CHIBS1
1,256819000,IOLCOS, DESTINY,9486049,9HA2936,36.833089,-75.672449,2018-08-01 00:00:00.000,9.7,93.1,95,0,1,70,14.4,199,30,13,24,MT,D05MN-NC-MAMBS1
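One way to repair this before handing the files to pandas (a sketch, assuming the stray comma can only appear in the NAME column, as in the sample): pre-process each file with the csv module, and whenever a row has one more field than the header, glue the NAME field back together with the field that follows it:

import csv

NAME_IDX = 2  # NAME is the third column in the sample header

def repair_file(src_path, dst_path):
    with open(src_path, newline='') as src, open(dst_path, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        header = next(reader)
        writer.writerow(header)
        for row in reader:
            # a row that is one field too long means NAME was split on
            # its embedded comma; merge the pieces and drop the comma
            while len(row) > len(header):
                row[NAME_IDX] = row[NAME_IDX] + row[NAME_IDX + 1]
                del row[NAME_IDX + 1]
            writer.writerow(row)

Running repair_file on each CSV before read_csv turns 'IOLCOS, DESTINY' into 'IOLCOS DESTINY' (the second piece keeps its leading space), and the frames then combine cleanly.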