Python pandas: generate an Excel file per 1000 rows - python

I have one Excel file with 38,000 rows. I would like a new Excel file to be created for every 1,000 rows, so at the end I should have 38 Excel files. This is my very simple script in Python:
import pandas as pd
import os
import deepl

WD = r'C:\Users\Admin\XXX\\'
for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file
        sheet_names = pd.ExcelFile(FILE).sheet_names
        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)
            for col in df.columns.to_list():
                df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
            auth_key = 'XX'
            translator = deepl.Translator(auth_key)
            df['TRANSLATE'] = df['COLUMN TO TRANSLATE'].apply(
                lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)
            cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
            df = df.reindex(columns=cn)
            df.to_excel(r'C:\Users\Admin\\FINAL_FILE.xlsx', index=False)
Do you have any idea?
Thank you very much!

Use DataFrame.groupby with a helper array created by numpy.arange and integer division by N:
import numpy as np

N = 1000
for val, df1 in df.groupby(np.arange(len(df)) // N):
    df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
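To see why this chunks the frame, note that np.arange(len(df)) // N maps row positions 0..len(df)-1 onto consecutive group labels. A tiny demonstration with N = 2:

import numpy as np

# Row positions 0..4 with N = 2 yield the labels [0 0 1 1 2]:
# rows 0-1 form chunk 0, rows 2-3 form chunk 1, row 4 forms chunk 2.
print(np.arange(5) // 2)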
EDIT: For processing in chunks of 1000 rows use:
for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file
        sheet_names = pd.ExcelFile(FILE).sheet_names
        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)
            for val, df1 in df.groupby(np.arange(len(df)) // N):
                for col in df1.columns.to_list():
                    df1[col] = df1[col].map({True: '', False: ''}).fillna(df1[col])
                auth_key = 'XX'
                translator = deepl.Translator(auth_key)
                df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(
                    lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)
                cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
                df1 = df1.reindex(columns=cn)
                df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)

In the end, it's this script that solved my problem :-). But thank you @jezrael for your help:
auth_key = 'xxx'
translator = deepl.Translator(auth_key)

FILE = r"XXX\File.xlsx"
df = pd.read_excel(FILE)

nor = df.shape[0]  # number of rows
breakpoint = 0
chunk_size = 1000
iteration_number = int(round(nor / chunk_size))
for y in range(0, iteration_number + 1):
    df1 = df.iloc[breakpoint:breakpoint + chunk_size, :]
    if not df1.empty:
        # print(df1)
        df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(
            lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)
        cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
        df1 = df1.reindex(columns=cn)
        df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{y}.xlsx', index=False)  # {y}, not the undefined {val}
    breakpoint = breakpoint + chunk_size
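A small aside on the chunk count: int(round(nor / chunk_size)) plus the extra iteration happens to cover every row, but math.ceil states the intent directly. A minimal equivalent sketch:

import math

# Number of chunks needed to cover every row, e.g. 38 000 rows -> 38 chunks.
iteration_number = math.ceil(nor / chunk_size)
for y in range(iteration_number):
    df1 = df.iloc[y * chunk_size:(y + 1) * chunk_size, :]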

Related

Read different JSON files in sequence

I have several JSON files in my local directory, and I read all of them with this code:
path_to_json = 'C:/Users/../Desktop/NewData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except:
        return dict()

list_of_df = []
for i in range(len(json_files)):
    try:
        file_name = json_files[i]
        df = pd.read_json(file_name, lines=True)
        df = df[['something']]
        df = df['something'].apply(func)
        df = pd.json_normalize(df)
        df = pd.DataFrame(df[["something", "something1"]])
        df['Index'] = 'weather5' + str(6+i)
    except:
        pass
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index', 'something', 'something1']]
df.head()
The names of the JSON files that I read are weather56, weather57, weather58, weather59, weather60 and weather61.
I am using the line df['Index'] = 'weather5' + str(6+i) to label them in the dataframe. However, it seems that I do not read them properly, as they now appear in the dataframe as:
Index
weather56
weather57
weather58
weather59
weather510
weather511
How to adjust this line df['Index'] = 'weather5' + str(6+i) to read the JSON files with their names?
df['Index'] = 'weather5' + str(6+i)
As i goes from 0 to 5, the corresponding values generated are going to be
weather56   # 'weather5' + str(6 + 0)
weather57
weather58
weather59
weather510  # 'weather5' + str(6 + 4) = 'weather5' + '10'
weather511
If you change the line to
df['Index'] = 'weather' + str(56+i)
it should appear as -
weather56
weather57
weather58
weather59
weather60
weather61
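Alternatively, since the label is just the file name without its extension, you could derive it from the asker's existing json_files list instead of reconstructing it arithmetically. A small sketch of that variant:

import os

# Inside the loop, label each frame with its actual file name, e.g. 'weather56'.
df['Index'] = os.path.splitext(json_files[i])[0]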

Transform CSV to JSON with Apache NiFi ExecuteStreamCommand - Python

I am currently having an issue with the Apache NiFi processor ExecuteStreamCommand and the implementation of a Python script.
I wrote the code below to transform 50 different CSV files into JSON. Afterwards I am going to write those JSON files to HDFS.
import json
import pandas as pd

df = pd.read_csv(r'***.csv',
                 sep='\t',
                 skiprows=2)
df = df.dropna(axis=1, how='all')
df = df.drop_duplicates(keep='first')
del df['Charge']
df = df.rename(columns={df.columns[0]: "Zeitstempel", df.columns[1]: "Maschine"})
df.columns = map(str.lower, df.columns)
df['zeitstempel'] = pd.to_datetime(df['zeitstempel'], format='%d.%m.%y, %X')
df['zeitstempel'] = df['zeitstempel'].astype(str)

# Strip a leading underscore from column names.
columns = list(df.columns)
for column in range(len(columns)):
    if str(columns[column]).startswith('_'):
        columns[column] = columns[column][1:]
df.columns = columns

machine = df["maschine"][0]
day = str(df["zeitstempel"][0])[5:7]
month = str(df["zeitstempel"][0])[8:10]
year = str(df["zeitstempel"][0])[0:4]

fileName = machine + "_" + year + "_" + month + "_" + day + ".json"
filePath = "***" + fileName
df.to_json(filePath, orient='records', date_format='iso', date_unit='s', lines=True)
The script works fine on my local directory, but how do I need to change the input and output for NiFi?
The NiFi flow is as follows: ListFile > FetchFile > ExecuteStreamCommand > PutHDFS.
I tried the code as follows:
#!/usr/bin/env python2
import sys
import json
import pandas as pd

df = pd.read_csv(sys.stdin,
                 sep='\t',
                 skiprows=2)
df = df.dropna(axis=1, how='all')
df = df.drop_duplicates(keep='first')
del df['Charge']
df = df.rename(columns={df.columns[0]: "Zeitstempel", df.columns[1]: "Maschine"})
df.columns = map(str.lower, df.columns)
df['zeitstempel'] = pd.to_datetime(df['zeitstempel'], format='%d.%m.%y, %X')
df['zeitstempel'] = df['zeitstempel'].astype(str)

columns = list(df.columns)
for column in range(len(columns)):
    if str(columns[column]).startswith('_'):
        columns[column] = columns[column][1:]
df.columns = columns

machine = df["maschine"][0]
day = str(df["zeitstempel"][0])[5:7]
month = str(df["zeitstempel"][0])[8:10]
year = str(df["zeitstempel"][0])[0:4]

fileName = machine + "_" + year + "_" + month + "_" + day + ".json"
df.to_json(sys.stdout, orient='records', date_format='iso', date_unit='s', lines=True)
And configured the processor like this (the screenshot from the original post is not reproduced here):
Thank you in advance from Germany!
Nicko
Configure your ExecuteStreamCommand processor something like this (the original answer's screenshot is not reproduced here):
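As a rough sketch of what the screenshot likely showed (the property names come from the processor's documentation; the paths are assumptions for illustration):

Command Path:       /usr/bin/python             # full path to your Python interpreter (assumption)
Command Arguments:  /path/to/transform.py       # hypothetical location of the script above
Ignore STDIN:       false                       # the FlowFile content must reach sys.stdin

With Output Destination Attribute left empty, the script's stdout becomes the content of the outgoing FlowFile, which PutHDFS then writes.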
Also please check the official docs - ExecuteStreamCommand

looping through a list of dataframes, writing each element of that list to a new .csv file on disk

I have a list of dataframes and am attempting to export each to a folder on disk using the pandas DataFrame.to_csv method. However, only the last item in the list of dataframes is being written to disk as a .csv.
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs = []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_id = df.iloc[0, 37]
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)
except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way you name your exported file.
After the loop has run, county_id holds the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are named with the same county_id and date; in other words, each one overwrites the previous.
To avoid this, you can build a list called county_ids and use it in the last loop to name each saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')

try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs, county_ids = [], []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_ids.append(df.iloc[0, 37])
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_ids[i]}_{date}.csv', index=False)
except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
Obviously I cannot test this - if you run it there may be lines that need tweaking. However, I'd structure the code something like the below: call a function to transform each frame as it's opened, and write it out immediately.
If you can get it working it will probably be faster, and it reads slightly better as there are fewer lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path

CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'
today = datetime.date.today()
date = today.strftime('%Y%m%d')

def updateFrame(f):
    new_df = pd.DataFrame()
    new_df['Original Addr string'] = f['StreetConc']
    new_df['Addr #'] = f['AddNum']
    new_df['Prefix'] = f['StPreDir']
    new_df['Street Name'] = f['StName']
    new_df['StreetType'] = f['StType']
    new_df['Suffix'] = f['StDir']
    new_df['Multi-Unit'] = ''
    new_df['City'] = f['City']
    new_df['Zip Code'] = f['PostCode']
    new_df['4'] = f['PostalExt']
    new_df['County'] = f['CountyID']
    new_df['Addr Type'] = ''
    new_df['Precint Part Name'] = ''
    new_df['Lat'] = f['X']
    new_df['Long'] = f['Y']
    # better way to replace without looping the rows...
    new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
    return new_df

for file in os.listdir(CSV_Folder):
    working_file = str(CSV_Folder) + '/' + file
    if working_file.endswith('.csv'):
        try:
            df = pd.read_csv(working_file)
            county_id = str(df.iloc[0, 37])
            # the function returns a frame so you can treat it as such...
            updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
        except FileNotFoundError:
            print(f'{file} not found in {CSV_Folder}')
        except PermissionError:
            print('Check syntax of paths')
        else:
            print('Process Complete')

Output is returning NaN - how do I get rid of the NaN without dropping the whole row?

I am trying to combine columns from Excel sheets, and when I combine the columns it places a NaN where the column is empty. How do I get rid of the NaN without dropping the whole row?
import os
import pandas as pd
import numpy as np

path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-4:] == 'xlsx']

df = pd.DataFrame()
for f in files_xls:
    qw = pd.read_excel(f)
    df = df.append(qw)

cf = df.iloc[:, df.columns.str.contains('address1|address2|city|state|zip|Location Address', case=False)]
vf = df['address1'].map(str) + '-' + df['address2'].map(str) + '-' + df['city'].map(str) + '-' + df['state'].map(str) + '-' + df['zip'].map(str)
export_csv = vf.to_csv('dataframe.csv', index=None, header=True)
Use the replace function. Note that the replacement has to happen before the columns are concatenated; otherwise map(str) has already turned each NaN into the literal string 'nan':
import os
import pandas as pd
import numpy as np

path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-4:] == 'xlsx']

df = pd.DataFrame()
for f in files_xls:
    qw = pd.read_excel(f)
    df = df.append(qw)

# Replace NaN with an empty string before building the combined column.
df = df.replace(np.nan, '', regex=True)

cf = df.iloc[:, df.columns.str.contains('address1|address2|city|state|zip|Location Address', case=False)]
vf = df['address1'].map(str) + '-' + df['address2'].map(str) + '-' + df['city'].map(str) + '-' + df['state'].map(str) + '-' + df['zip'].map(str)
export_csv = vf.to_csv('dataframe.csv', index=None, header=True)
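An equivalent, slightly more targeted sketch (not from the original answer): fill only the address columns and join them in one step, which avoids touching unrelated columns:

cols = ['address1', 'address2', 'city', 'state', 'zip']
vf = df[cols].fillna('').astype(str).agg('-'.join, axis=1)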

Extra Comma in specific data field using pandas

I am combining very large data sets using Python. The script works completely fine. However, there is one specific field that may or may not have a comma inside of it. Does anyone know how to remove the comma? FYI, this is how the data is collected; it cannot be removed on collection. The field it is in is the "NAME" field.
I have tried to implement a sep=r',(?!\s)' lookahead, and that screws my data up even more.
Thanks!
import csv
import shutil
import os
import pandas as pd
from os import path

def combinecsv(source_folder):
    all_files = os.listdir(source_folder)
    master_df = None
    for anyfile in all_files:
        if anyfile.lower().endswith(".csv"):
            file_path = path.join(source_folder, anyfile)
            print("opening file path: {}".format(file_path))
            df = pd.read_csv(file_path)
            if master_df is None:
                master_df = df
            else:
                master_df = master_df.append(df)

    new_df = pd.DataFrame()
    new_df["MSG_TYPE"] = master_df["MSG_TYPE"]
    new_df["MMSI"] = master_df["MMSI"]
    new_df["NAME"] = master_df.apply(lambda row: check_for_none(row["NAME"]), axis=1)
    new_df["LAT_AVG"] = master_df["LAT_AVG"]
    new_df["LON_AVG"] = master_df["LON_AVG"]
    new_df["PERIOD"] = master_df.apply(lambda row: convert_period(row["PERIOD"]), axis=1)
    new_df["SPEED_KNOTS"] = master_df.apply(lambda row: check_for_none(row["SPEED_KNOTS"]), axis=1)
    new_df["COG_DEG"] = master_df.apply(lambda row: check_for_none(row["COG_DEG"]), axis=1)
    new_df["SHIP_AND_CARGO_TYPE"] = master_df.apply(lambda row: check_for_none(row["SHIP_AND_CARGO_TYPE"]), axis=1)
    new_df["DRAUGHT"] = master_df.apply(lambda row: check_for_none(row["DRAUGHT"]), axis=1)
    new_df["LEN"] = master_df.apply(lambda row: combine_bowstern(row["DIM_BOW"], row["DIM_STERN"]), axis=1)
    # axis=1 traverses rows, not columns

    new_folder = path.join(source_folder, "output")
    if not path.exists(new_folder):
        os.mkdir(new_folder)
    new_csvpath = path.join(new_folder, "output.csv")
    print("saving csv to {}".format(new_csvpath))
    new_df.to_csv(new_csvpath, index=False, quoting=csv.QUOTE_NONNUMERIC)

def check_for_none(value):
    if value == 'None':
        return ""
    else:
        return value

def convert_period(period):
    y = str(period[2:4])
    m = str(period[5:7])
    d = str(period[8:10])
    t = str(period[11:16])
    periodnewformat = "{}/{}/{} {}".format(d, m, y, t)
    return periodnewformat

def combine_bowstern(bow, stern):
    bow_int = 0
    stern_int = 0
    if bow != "None":
        bow_int = int(bow)
    if stern != "None":
        stern_int = int(stern)
    return bow_int + stern_int

if __name__ == "__main__":
    source_folder = r'C:\Users\MTTA Standalone\Desktop\Code\csvcombine'
    combinecsv(source_folder)
Here is a sample of the data set with and without the comma (the second data row has the extra comma inside NAME):
MSG_TYPE,MMSI,NAME,IMO_NUMBER,CALL_SIGN,LAT_AVG,LON_AVG,PERIOD,SPEED_KNOTS,COG_DEG,HEADING_DEG,NAV_STATUS,NAV_SENSOR,SHIP_AND_CARGO_TYPE,DRAUGHT,DIM_BOW,DIM_STERN,DIM_PORT,DIM_STARBOARD,MMSI_COUNTRY_CD,RECEIVER
1,249830000,ZIM LUANDA,9403229,9HA2029,37.825850,-74.340755,2018-08-01 00:00:00.000,11.5,196.4,198,0,1,71,10.9,197,63,21,11,MT,D05MN-HR-CHIBS1
1,256819000,IOLCOS, DESTINY,9486049,9HA2936,36.833089,-75.672449,2018-08-01 00:00:00.000,9.7,93.1,95,0,1,70,14.4,199,30,13,24,MT,D05MN-NC-MAMBS1
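No answer is included in this excerpt, but one possible approach (an untested sketch, not from the original thread): since the header fixes the expected number of columns, any row that splits into extra fields can have the surplus tokens merged back into NAME (column index 2) before pandas parses it:

import csv
import pandas as pd

EXPECTED_COLS = 21  # column count from the header row of the sample above

def repair_rows(csv_path):
    # Merge surplus tokens back into NAME for rows that split on an
    # unquoted comma inside the NAME field. Hypothetical helper name.
    with open(csv_path, newline='') as fh:
        for row in csv.reader(fh):
            extra = len(row) - EXPECTED_COLS
            if extra > 0:
                row[2:3 + extra] = [','.join(row[2:3 + extra])]
            yield row

rows = list(repair_rows('sample.csv'))  # hypothetical file name
df = pd.DataFrame(rows[1:], columns=rows[0])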
