I am combining very large data sets using Python. The script works completely fine, except there is one specific row that may or may not have a comma inside of it. Does anyone know how to remove the comma? FYI, this is how the data is collected; it cannot be removed at collection time. The field it appears in is the ["NAME"] field.
I have tried to implement a sep=r',(?!\s)' look-ahead and that screws my data up even more.
THANKS!
import csv
import shutil
import os
import pandas as pd
from os import path
def combinecsv(source_folder):
    all_files = os.listdir(source_folder)
    master_df = None
    for anyfile in all_files:
        if anyfile.lower().endswith(".csv"):
            file_path = path.join(source_folder, anyfile)
            print("opening file path: {}".format(file_path))
            df = pd.read_csv(file_path)
            if master_df is None:
                master_df = df
            else:
                master_df = master_df.append(df)

    new_df = pd.DataFrame()
    new_df["MSG_TYPE"] = master_df["MSG_TYPE"]
    new_df["MMSI"] = master_df["MMSI"]
    new_df["NAME"] = master_df.apply(lambda row: check_for_none(row["NAME"]), axis=1)
    new_df["LAT_AVG"] = master_df["LAT_AVG"]
    new_df["LON_AVG"] = master_df["LON_AVG"]
    new_df["PERIOD"] = master_df.apply(lambda row: convert_period(row["PERIOD"]), axis=1)
    new_df["SPEED_KNOTS"] = master_df.apply(lambda row: check_for_none(row["SPEED_KNOTS"]), axis=1)
    new_df["COG_DEG"] = master_df.apply(lambda row: check_for_none(row["COG_DEG"]), axis=1)
    new_df["SHIP_AND_CARGO_TYPE"] = master_df.apply(lambda row: check_for_none(row["SHIP_AND_CARGO_TYPE"]), axis=1)
    new_df["DRAUGHT"] = master_df.apply(lambda row: check_for_none(row["DRAUGHT"]), axis=1)
    new_df["LEN"] = master_df.apply(lambda row: combine_bowstern(row["DIM_BOW"], row["DIM_STERN"]), axis=1)
    # axis traverses rows not columns

    new_folder = path.join(source_folder, "output")
    if not path.exists(new_folder):
        os.mkdir(new_folder)
    new_csvpath = path.join(new_folder, "output.csv")
    print("saving csv to {}".format(new_csvpath))
    new_df.to_csv(new_csvpath, index=False, quoting=csv.QUOTE_NONNUMERIC)

def check_for_none(df):
    if (df) == 'None':
        return ""
    else:
        return (df)

def convert_period(period):
    y = str(period[2:4])
    m = str(period[5:7])
    d = str(period[8:10])
    t = str(period[11:16])
    periodnewformat = "{}/{}/{} {}".format(d, m, y, t)
    return periodnewformat

def combine_bowstern(bow, stern):
    bow_int = 0
    stern_int = 0
    if bow != "None":
        bow_int = int(bow)
    if stern != "None":
        stern_int = int(stern)
    return bow_int + stern_int

if __name__ == "__main__":
    source_folder = r'C:\Users\MTTA Standalone\Desktop\Code\csvcombine'
    combinecsv(source_folder)
Here is a sample of the data set, with and without the comma:
MSG_TYPE,MMSI,NAME,IMO_NUMBER,CALL_SIGN,LAT_AVG,LON_AVG,PERIOD,SPEED_KNOTS,COG_DEG,HEADING_DEG,NAV_STATUS,NAV_SENSOR,SHIP_AND_CARGO_TYPE,DRAUGHT,DIM_BOW,DIM_STERN,DIM_PORT,DIM_STARBOARD,MMSI_COUNTRY_CD,RECEIVER
1,249830000,ZIM LUANDA,9403229,9HA2029,37.825850,-74.340755,2018-08-01 00:00:00.000,11.5,196.4,198,0,1,71,10.9,197,63,21,11,MT,D05MN-HR-CHIBS1
1,256819000,IOLCOS, DESTINY,9486049,9HA2936,36.833089,-75.672449,2018-08-01 00:00:00.000,9.7,93.1,95,0,1,70,14.4,199,30,13,24,MT,D05MN-NC-MAMBS1
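One possible approach (a minimal sketch, not tested against your full data set; repair_csv, EXPECTED_COLS and NAME_IDX are names made up for illustration): the header fixes the expected number of columns, so you can pre-process each file with the csv module and, whenever a row comes back with one field too many, glue the spill-over piece back onto the NAME column before handing the cleaned file to pd.read_csv:
import csv

EXPECTED_COLS = 21  # number of fields in the header row
NAME_IDX = 2        # position of the NAME column

def repair_csv(in_path, out_path):
    """Re-join rows whose NAME field was split by an unquoted comma."""
    with open(in_path, newline='') as src, open(out_path, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        for row in reader:
            while len(row) > EXPECTED_COLS:
                # 'IOLCOS' + ' DESTINY' -> 'IOLCOS DESTINY' (comma removed)
                row[NAME_IDX] = row[NAME_IDX] + row[NAME_IDX + 1]
                del row[NAME_IDX + 1]
            writer.writerow(row)
Calling repair_csv on each input file first (or re-joining with ',' instead if you want to keep the comma) should leave every row with the expected 21 fields.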
Related
I have one excel file with 38 000 rows. I would like an excel file to be created after every 1000 rows. At the end I should have 38 excel files. This is my very simple script in Python:
import pandas as pd
import os
import deepl
WD = r'C:\Users\Admin\XXX\\'
for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file
        sheet_names = pd.ExcelFile(FILE).sheet_names
        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)
            for col in df.columns.to_list():
                df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
            auth_key = 'XX'
            translator = deepl.Translator(auth_key)
            df['TRANSLATE'] = df['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,
                target_lang="CS") if type(x) == str else x)
            cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
            df = df.reindex(columns=cn)
            df.to_excel(r'C:\Users\Admin\\FINAL_FILE.xlsx', index=False)
Do you have any idea?
Thank you very much!!
Use DataFrame.groupby with a helper array created by numpy.arange and integer division by N:
import numpy as np

N = 1000
for val, df1 in df.groupby(np.arange(len(df)) // N):
    df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
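For illustration (my own toy example, not part of the original answer): with N = 3 and a 7-row frame, np.arange(len(df)) // N yields [0, 0, 0, 1, 1, 1, 2], so groupby splits the frame into chunks of at most N rows:
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': range(7)})
print(np.arange(len(df)) // 3)      # [0 0 0 1 1 1 2]
for val, chunk in df.groupby(np.arange(len(df)) // 3):
    print(val, len(chunk))          # chunks of 3, 3 and 1 rows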
EDIT: To process the file 1000 rows at a time, use:
for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file
        sheet_names = pd.ExcelFile(FILE).sheet_names
        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)
            for val, df1 in df.groupby(np.arange(len(df)) // N):
                for col in df1.columns.to_list():
                    df1[col] = df1[col].map({True: '', False: ''}).fillna(df1[col])
                auth_key = 'XX'
                translator = deepl.Translator(auth_key)
                df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,
                    target_lang="CS") if type(x) == str else x)
                cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
                df1 = df1.reindex(columns=cn)
                df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
In the end, it's this script that solved my problem :-). But thank you @jezrael for your help:
auth_key = 'xxx'
translator = deepl.Translator(auth_key)

FILE = r"XXX\File.xlsx"
df = pd.read_excel(FILE)

nor = df.shape[0]
breakpoint = 0
chunk_size = 1000
iteration_number = int(round(nor / chunk_size))

for y in range(0, iteration_number + 1):
    df1 = df.iloc[breakpoint:breakpoint + chunk_size, :]
    if df1.empty == False:
        # print(df1)
        df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)
        cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
        df1 = df1.reindex(columns=cn)
        df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{y}.xlsx', index=False)
    breakpoint = breakpoint + chunk_size
As I am new to Python, I need some help to compare two XML files.
These are the conditions:
To print the common fullPath and Name (fullPath and Name are attributes present in the XML files) between the two XML files.
To print the values which are present only in the first file and not in the second file.
To print the values which are present only in the second file and not in the first file.
Later, I have to print this output to an Excel file with different sheets,
for example (1st condition in sheet 1, 2nd condition in sheet 2, 3rd condition in sheet 3 of the same Excel file).
Can anyone please help me with code that satisfies the above conditions?
This is the code which I have tried:
from lxml import etree

Base = etree.parse('Base.xml')
Target = etree.parse('Target.xml')

Base_fullPath = Base.xpath("//Member/@fullPath")
Target_fullPath = Target.xpath("//Member/@fullPath")
Base_name = Base.xpath("//Member/@name")
Target_name = Target.xpath("//Member/@name")

def match(Base_fullPath, Target_fullPath, Base_name, Target_name):
    Base_fullPath_set = set(Base_fullPath)
    Target_fullPath_set = set(Target_fullPath)
    Base_name_set = set(Base_name)
    Target_name_set = set(Target_name)
    if (Base_fullPath_set & Target_fullPath_set, Base_name_set & Target_name_set):
        x = open('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv', 'w')
        y = (Base_fullPath_set & Target_fullPath_set)
        z = (Base_name_set & Target_name_set)
        print("common details Full Path: \n", *y, sep='\n', file=x)
        print("\n")
        x = open('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv', 'w')
        print("\n common details Name: \n", *z, sep='\n', file=x)
    else:
        print("No Matches Found")

match(Base_fullPath, Target_fullPath, Base_name, Target_name)

def non_match_elements(list_base, list_target):
    non_match_base = []
    non_match_target = []
    for i in list_base:
        if i not in list_target:
            non_match_base.append(i)
    for i in list_target:
        if i not in list_base:
            non_match_target.append(i)
    return non_match_base
    return non_match_target

list_base = Base.xpath("//Member/@*")
list_target = Target.xpath("//Member/@*")

non_match_base = non_match_elements(list_base, list_target)
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', 'w')
print("\n Base Details: \n", *non_match_base, sep='\n', file=x)

non_match_target = non_match_elements(list_target, list_base)
x = open('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', 'w')
print("\n Target Details: \n", *non_match_target, sep='\n', file=x)

import pandas as pd

df = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_FullPath.csv')
df1 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Common_name.csv')
df2 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_base.csv', delimiter=';;', on_bad_lines='skip', engine='python')
df3 = pd.read_csv('C:\\Users\\pvl\\Desktop\\New folder\\Present_in_target.csv', delimiter=';', on_bad_lines='skip', engine='python')

with pd.ExcelWriter("C:\\Users\\pvl\\Desktop\\New folder\\combined.xlsx") as writer:
    df1.to_excel(writer, sheet_name="Common_name", index=False)
    df2.to_excel(writer, sheet_name="base_Details", index=False)
    df3.to_excel(writer, sheet_name="target_Details", index=False)
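A minimal sketch of one way to meet the three conditions (not the poster's code; it assumes every Member element carries fullPath and name attributes, and the file and sheet names are illustrative): collect (fullPath, name) pairs from each file, compare the sets directly, and write each result to its own sheet with pandas.ExcelWriter, skipping the intermediate CSV files.
from lxml import etree
import pandas as pd

base = etree.parse('Base.xml')
target = etree.parse('Target.xml')

# One (fullPath, name) pair per Member element in each file.
base_pairs = {(m.get('fullPath'), m.get('name')) for m in base.xpath('//Member')}
target_pairs = {(m.get('fullPath'), m.get('name')) for m in target.xpath('//Member')}

common = sorted(base_pairs & target_pairs)        # condition 1
only_base = sorted(base_pairs - target_pairs)     # condition 2
only_target = sorted(target_pairs - base_pairs)   # condition 3

cols = ['fullPath', 'name']
with pd.ExcelWriter('combined.xlsx') as writer:
    pd.DataFrame(common, columns=cols).to_excel(writer, sheet_name='Common', index=False)
    pd.DataFrame(only_base, columns=cols).to_excel(writer, sheet_name='Only_in_base', index=False)
    pd.DataFrame(only_target, columns=cols).to_excel(writer, sheet_name='Only_in_target', index=False)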
I have different JSON files in my local directory and I read all of them with this code
path_to_json = 'C:/Users/../Desktop/NewData'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except:
        return dict()

list_of_df = []
for i in range(len(json_files)):
    try:
        file_name = json_files[i]
        df = pd.read_json(file_name, lines=True)
        df = df[['something']]
        df = df['something'].apply(func)
        df = pd.json_normalize(df)
        df = pd.DataFrame(df[["something", "something1"]])
        df['Index'] = 'weather5' + str(6+i)
    except:
        pass
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index', 'something', 'something1']]
df.head()
The names of the JSON files that I read are weather56, weather57, weather58, weather59, weather60, weather61.
I am using the line df['Index'] = 'weather5' + str(6+i) to label each one in the dataframe, but it seems that I do not label them properly, as they now appear in the dataframe as:
Index
weather56
weather57
weather58
weather59
weather510
weather511
How can I adjust the line df['Index'] = 'weather5' + str(6+i) so that the Index matches the JSON file names?
df['Index'] = 'weather5' + str(6+i)
As i goes from 0 to 5, the corresponding values generated are going to be
weather56 // '5' + str(6 + 0)
weather57
weather58
weather59
weather510 // '5' + str(6 + 4) := '5' + '10'
weather511
If you change the line to
df['Index'] = 'weather' + str(56+i)
it should appear as:
weather56
weather57
weather58
weather59
weather60
weather61
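A further note (my own suggestion, not part of the answer above): since the loop already has each file's name in json_files[i], you can also take the index straight from the file name, which stays correct however the files are numbered:
import os

# e.g. 'weather56' from 'weather56.json'
df['Index'] = os.path.splitext(json_files[i])[0]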
I have a list of dataframes and am attempting to export each one to a folder on disk using the pandas DataFrame.to_csv method. However, only the last item in the list of dataframes is being written to disk as a .csv.
Please see code below:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs = []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_id = df.iloc[0, 37]
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_id}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
new_dfs contains the correct number of dataframes. However, when looping through the new list of dataframes and calling .to_csv on each item in the list, only the last item in the list is written to the disk.
The problem lies in the way in which you name your exported file.
After running through the loop, county_id will be equal to the last county_id, or the county_id of the last iterated df.
Since the name of your exported dataframe is {Output}\ADDR_{county_id}_{date}.csv, all the exported files are named with the same county_id and date, or in other words, they keep overwriting each other.
To avoid this, you can create a new list called county_ids and then use the last loop to change the name of the saved file. This would be your resulting code:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path('C:\PA_Boundaries\Tests')
Output = r'C:/PA_Boundaries/test_output'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
try:
    dfs = []
    for file in os.listdir(CSV_Folder):
        df = pd.read_csv(CSV_Folder / file)
        dfs.append(df)

    new_dfs, county_ids = [], []
    for df in dfs:
        new_df = pd.DataFrame()
        new_df['Original Addr string'] = df['StreetConc']
        new_df['Addr #'] = df['AddNum']
        new_df['Prefix'] = df['StPreDir']
        new_df['Street Name'] = df['StName']
        new_df['StreetType'] = df['StType']
        new_df['Suffix'] = df['StDir']
        new_df['Multi-Unit'] = ''
        new_df['City'] = df['City']
        new_df['Zip Code'] = df['PostCode']
        new_df['4'] = df['PostalExt']
        new_df['County'] = df['CountyID']
        new_df['Addr Type'] = ''
        new_df['Precint Part Name'] = ''
        new_df['Lat'] = df['X']
        new_df['Long'] = df['Y']

        replaced_address_names = []
        for index, row in new_df.iterrows():
            new_row = row['Original Addr string'].replace(',', ' ')
            replaced_address_names.append(new_row)
        new_df['Original Addr string'] = replaced_address_names

        county_ids.append(df.iloc[0, 37])
        new_dfs.append(new_df)

    for i in range(len(new_dfs)):
        new_dfs[i].to_csv(f'{Output}\ADDR_{county_ids[i]}_{date}.csv', index=False)

except FileNotFoundError:
    print(f'{file} not found in {CSV_Folder}')
except PermissionError:
    print('Check syntax of paths')
else:
    print('Process Complete')
Obviously I cannot test this - if you do run it there may be lines that need tweaking. However, I'd write the code something like the below. Basically, I'd call a function to do the replacement as I open each file and write it out immediately.
If you can get it working it will probably be faster, and it reads slightly better as there are fewer lines.
Example:
import pandas as pd
import os
import datetime
from pathlib import Path
CSV_Folder = Path(r'C:/PA_Boundaries/Tests')
Output = r'C:/PA_Boundaries/test_output/'
today = datetime.date.today()
date = today.strftime('%Y%m%d')
def updateFrame(f):
    new_df = pd.DataFrame()
    new_df['Original Addr string'] = f['StreetConc']
    new_df['Addr #'] = f['AddNum']
    new_df['Prefix'] = f['StPreDir']
    new_df['Street Name'] = f['StName']
    new_df['StreetType'] = f['StType']
    new_df['Suffix'] = f['StDir']
    new_df['Multi-Unit'] = ''
    new_df['City'] = f['City']
    new_df['Zip Code'] = f['PostCode']
    new_df['4'] = f['PostalExt']
    new_df['County'] = f['CountyID']
    new_df['Addr Type'] = ''
    new_df['Precint Part Name'] = ''
    new_df['Lat'] = f['X']
    new_df['Long'] = f['Y']
    # better way to replace without looping the rows...
    new_df['Original Addr string'] = new_df['Original Addr string'].str.replace(',', ' ')
    return new_df

for file in os.listdir(CSV_Folder):
    working_file = str(CSV_Folder) + '/' + file
    if working_file.endswith('.csv'):
        try:
            df = pd.read_csv(working_file)
            county_id = str(df.iloc[0, 37])
            # the function returns a frame so you can treat it as such...
            updateFrame(df).to_csv(f'{Output}ADDR_{county_id}_{date}.csv', index=False)
        except FileNotFoundError:
            print(f'{file} not found in {CSV_Folder}')
        except PermissionError:
            print('Check syntax of paths')
        else:
            print('Process Complete')
So, toward the end of my first file (we'll call it file.py):
def get_excel_data(self):
    """Places excel data into pandas dataframe"""
    # excel_data = pandas.read_excel(self.find_file())
    for extracted_archive in self.find_file():
        excel_data = pandas.read_excel(extracted_archive)
        # print(excel_data)
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")

        total_records = 0
        num_valid_records = 0
        num_invalid_records = 0
        for row in excel_data.itertuples():
            mrn = row.MRN
            total_records += 1
            if mrn in ("", " ", "N/A", "NaT", "NaN", None) or math.isnan(mrn):
                # print(f"Invalid record: {row}")
                num_invalid_records += 1
                # total_invalid = num_invalid_records + dup_count
                excel_data = excel_data.drop(excel_data.index[row.Index])
                # continue
            else:
                # print(mrn)  # outputs all MRN ids
                for row in excel_data.itertuples():
                    num_valid_records += 1
                    continue

        with open("./logs/metrics.csv", "a", newline="\n") as f:
            csv_writer = DictWriter(f, ['date', 'total_records', 'processed', 'skipped', 'success_rate'])
            # csv_writer.writeheader()
            currentDT = datetime.datetime.now()
            success_rate = num_valid_records / total_records * 100
            csv_writer.writerow(dict(date=currentDT,
                                     total_records=total_records,
                                     processed=num_valid_records,
                                     skipped=num_invalid_records,
                                     success_rate=num_valid_records / total_records * 100))
    return self.clean_data_frame(excel_data)

def clean_data_frame(self, data_frame):
    """Cleans up dataframes"""
    for col in data_frame.columns:
        if "date" in col.lower():
            data_frame[col] = pandas.to_datetime(data_frame[col],
                                                 errors='coerce', infer_datetime_format=True)
            data_frame[col] = data_frame[col].dt.date
    data_frame['MRN'] = data_frame['MRN'].astype(int).astype(str)
    return data_frame

def get_mapping_data(self):
    map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
    columns = pandas.DataFrame(columns=map_data.columns.tolist())
    return pandas.concat([map_data, columns])
In my second file, second_file.py, I would like to keep that end state and do another iteration, for instance:
def process_records(self, records, map_data, completed=None, errors=None):
    """Code to execute after webdriver initialization."""
    series_not_null = False
    try:
        num_attempt = 0
        for record in data_frame.itertuples():  # not working
            print(record)
            series_not_null = True
            mrn = record.MRN
            self.navigate_to_search(num_attempt)
            self.navigate_to_member(mrn)
            self.navigate_to_assessment()
            self.add_assessment(record, map_data)
            self.driver.switch_to.parent_frame()  # not working
            sleep(.5)
            error_flag = self.close_member_tab(self.driver, mrn, error_flag)
    except Exception as exc:
        if series_not_null:
            errors = self.process_series_error(exc)
    return completed, error
Both files have import pandas.
You can save your dataframe in a pickle file like this. It is also worth noting that you can store almost anything in a pickle file. Here is a link to some info: pickle info.
import pandas as pd
import pickle
x = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

# this will create a file called pickledata.p that will store the data frame
with open('pickledata.p', 'wb') as fh:  # notice that you need the 'wb' for the dump
    pickle.dump(x, fh)

# to load the file do this
with open('pickledata.p', 'rb') as fh:  # you need to use 'rb' to read
    df = pickle.load(fh)

# you can now use df like a normal dataframe
print(df)
You don't actually need the '.p' extension for a pickle file, I just like it.
So you save your dataframe at the end of script one, and then load it in at the start of script two.
Use DataFrame.to_pickle and pandas.read_pickle:
To persist
df.to_pickle('./dataframe.pkl')
To load
df = pd.read_pickle('./dataframe.pkl')
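Putting the two pieces together (a minimal sketch; the file name dataframe.pkl and the variable names are assumptions, not from the original code): save the frame at the end of the first script and read it back at the top of the second.
# end of file.py: persist the frame produced by get_excel_data()
excel_data.to_pickle('./dataframe.pkl')

# start of second_file.py: reload it and carry on
import pandas as pd
data_frame = pd.read_pickle('./dataframe.pkl')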