I've got the following Python script that opens a single CSV file, rebuilds a subset of the columns by slicing each row, and finally writes the result to a new CSV.
I would like the script to open all of the .csv files in a directory, perform the same actions on each, and output a single final CSV file.
FYI: I'm trying to avoid pandas.
import csv

projects = []
count = 0
with open('timesheets.csv') as csvfile:
    timesheets = csv.reader(csvfile, delimiter=',')
    for rows in timesheets:
        # Skip the first three rows; once count reaches 3, process the remaining rows.
        if count < 3:
            count += 1
            continue
        # Columns 1,8,9,13,15,18
        columns1 = rows[1:2]
        columns8_9 = rows[8:10]
        columns13 = rows[13:14]
        columns15 = rows[15:15]
        columns18 = rows[18:19]
        project = columns1 + columns8_9 + columns13 + columns15 + columns18
        # Append each line as a separate list to the projects list.
        # You end up with multiple lists within a list.
        projects.append(project)

# Remove the last list in the projects list, since it is empty and causes errors.
projects = projects[:-1]

newlist = []
# Strip the first 8 characters from each line[1] and line[3]
for lists in projects:
    engineer = lists[1]
    engineer = engineer[8:]
    lists[1] = engineer
    employee = lists[3]
    employee = employee[8:]
    lists[3] = employee
    newlist.append(lists)

# Replace the first list with the following list, which effectively changes the column names.
newlist[0] = ['Project Name', 'Line Manager', 'Element', 'Employee', 'Hours']

with open('output.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerows(newlist)
Put the CSV files in a list and iterate over it. This will help you:

import glob

files_list = glob.glob("path/*.csv")
for file in files_list:
    # your remaining code goes here...
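To make that concrete, here is a minimal sketch of how the original script could be wrapped in that loop, assuming every file has the same three header rows and the same trailing empty row, and writing one combined output.csv; the "path/*.csv" pattern is a placeholder:

import csv
import glob

projects = []
for path in glob.glob("path/*.csv"):  # placeholder pattern; point it at your directory
    with open(path, newline='') as csvfile:
        timesheets = csv.reader(csvfile)
        for count, rows in enumerate(timesheets):
            if count < 3:  # skip the three header rows, as in the original
                continue
            # the same column slicing as the original script
            projects.append(rows[1:2] + rows[8:10] + rows[13:14] + rows[15:15] + rows[18:19])
    projects = projects[:-1]  # drop the trailing empty row of each file

for lists in projects:
    lists[1] = lists[1][8:]  # strip the first 8 characters, as in the original
    lists[3] = lists[3][8:]

# the original overwrote its first kept row with the column names;
# for a combined file, insert a single header row instead
projects.insert(0, ['Project Name', 'Line Manager', 'Element', 'Employee', 'Hours'])

with open('output.csv', 'w', newline='') as outfile:
    csv.writer(outfile).writerows(projects)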
For a current project, I am planning to run several iterations of the script below and to save the results in a different CSV file for each iteration (the CSV part is at the end of the script).
The code currently shows the relevant results in the terminal, but it only creates empty CSV files. I have spent days trying to figure out how to solve this but cannot get to a solution. Is there anyone who can help?
Note: I have updated the code in accordance with user recommendations, but the original issue still persists.
import string
import json
import csv
import pandas as pd
import datetime
from dateutil.relativedelta import *
import numpy as np
import matplotlib.pyplot as plt

# Loading and reading dataset
file = open("Glassdoor_A.json", "r")
data = json.load(file)
df = pd.json_normalize(data)
df['Date'] = pd.to_datetime(df['Date'])

# Allocate periods for individual CSV file names
periods = pd.period_range('2009Q1', '2018Q4', freq='Q')
ts = pd.Series(np.random.randn(40), periods)
type(ts.index)
intervals = ts.index

# Create individual empty files with headers
for i in intervals:
    name = 'Glassdoor_A_' + 'Text Main_' + str(i)
    with open(name + '.csv', 'w', newline='') as file:
        writer = csv.writer(file)

# Create an empty dictionary
d = dict()

# Filtering by date
start_date = pd.to_datetime('2009-01-01')
end_date = pd.to_datetime('2009-03-31')
last_end_date = pd.to_datetime('2017-12-31')
mnthBeg = pd.offsets.MonthBegin(3)
mnthEnd = pd.offsets.MonthEnd(3)

while end_date <= last_end_date:
    filtered_dates = df[df.Date.between(start_date, end_date)]
    n = len(filtered_dates.index)
    print(f'Date range: {start_date.strftime("%Y-%m-%d")} - {end_date.strftime("%Y-%m-%d")}, {n} rows.')
    if n > 0:
        print(filtered_dates)
    start_date += mnthBeg
    end_date += mnthEnd

    # Processing Text Main section
    for index, row in filtered_dates.iterrows():
        line = row['Text Main']
        # Remove the leading spaces and newline character
        line = line.split(' ')
        line = [val.strip() for val in line]
        # Convert the characters in line to lowercase to avoid case mismatch
        line = [val.lower() for val in line]
        # Remove the punctuation marks from the line
        line = [val.translate(val.maketrans("", "", string.punctuation)) for val in line]
        print(line)
        # Split the line into words
        # words = [val.split(" ") for val in line]
        # print(words)
        # Iterate over each word in line
        for word in line:
            # Check if the word is already in dictionary
            if word in d.keys():
                # Increment count of word by 1
                d[word] = d[word] + 1
            else:
                # Add the word to dictionary with count 1
                d[word] = 1
            print(d)

# Print the contents of dictionary
for key in list(d.keys()):
    print(key, ":", d[key])
    # Count the total number of words
    total = sum(d.values())
    percent = d[key] / total
    print(d[key], total, percent)

# Save as CSV file
while end_date <= last_end_date:
    for index, row in filtered_dates.iterrows():
        for i in data:
            name = 'Glassdoor_A_' + str(i)
            with open(name + '.csv', 'a', newline='') as file:
                writer.writerow(["Word", "Occurrences", "Percentage"])
                writer.writerows([key, d[key], percent] for key in list(d.keys()))
Regarding your inner loop which writes the CSV files:

# Create individual file names
for i in data:
    name = 'Glassdoor_A_' + str(i)
    # Save output in CSV file
    with open(name + '.csv', 'w', newline='') as file:
        ...

This is executed for each iteration of the outer loop for index, row in filtered_dates.iterrows():, so each iteration will overwrite the previously created files. Try opening with mode 'a' (append) instead, and write the headers with empty data outside of these two loops.
Without getting into the details of what you're calculating and writing out, the way to make it append data to the outfiles would be:
Create the files with just the headers at the start of the script.
The last inner loop should write to the files in append mode.
So, at the start of your script, add:
data = json.load(file)

# Create individual empty files with headers
for i in data:
    name = 'Glassdoor_A_' + str(i)
    with open(name + '.csv', 'w', newline='') as file:
        writer = csv.writer(file)  # you probably don't need to use the csv module for the first part
        writer.writerow(["Text Main Words", "Text Main Occurrences"])
        # nothing else here for now
Then at the end of your script, in the innermost loop where you're writing out the data, do:

while end_date <= last_end_date:
    ...
    for index, row in filtered_dates.iterrows():
        ...
        for i in data:
            name = 'Glassdoor_A_' + str(i)
            with open(name + '.csv', 'a', newline='') as file:  # note the 'append' mode
                writer = csv.writer(file)
                writer.writerows([occurrence])
Btw, that last line writer.writerows([occurrence]) should probably be writer.writerows(list(occurrence)) if occurrence is not already a list of tuples or a list of lists with two elements in each inner list.
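As a quick aside, writerow takes a single row (a sequence of cells) while writerows takes an iterable of such rows; a tiny self-contained illustration of the difference:

import csv, io

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(["Word", "Occurrences", "Percentage"])     # one row
writer.writerows([["alpha", 3, 0.75], ["beta", 1, 0.25]])  # an iterable of rows
print(buf.getvalue())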
import csv

cred = open("AllCredits.csv", "r")
creader = csv.reader(cred)
pur = open("AllPurchases.csv", "r")
preader = csv.reader(pur)
out = open("output.txt", "r+")

for row in creader:
    tn = ...       # current phone number
    crednum = ...  # number of rows with that phone number
    for row in preader:
        purnum = ...  # number of rows with that phone number
    if crednum != 2 * purnum:
        out.write(str(tn) + "\n")

cred.close()
pur.close()
out.close()
For both files I am only looking at the first column (the 0th), which holds phone numbers. The files are sorted by phone number, so any duplicates are next to each other. I need to know how many rows share a given phone number in the cred file, and then how many rows with that same phone number there are in the pur file, repeating this for every group of duplicate phone numbers across the two files.
Example:
Credits File
TN,STUFF,THINGS
2476,hseqer,trjar
2476,sthrtj,esreet
3654,rstrhh,trwtr
Purchases File
TN,STUFF,THINGS
2476,hseher,trjdr
3566,sthztj,esrhet
3654,rstjhh,trjtr
What I would need to know with this example is that there are 2 instances of 2476 in the credits file versus 1 in the purchases file, and that there is 1 instance of 3654 in the credits file versus 1 in the purchases file. I need to check every phone number in the cred file and get the number of occurrences in both files; if there are phone numbers in the pur file that are not in the cred file, I don't need to count them. (But if there are 2 of a number in cred and none in pur, I do need purnum to come back as 0.) Note that the real files are 5,000 KB and 13,000 KB in size and have tens of thousands of lines.
I'm a serious newbie to Python, so I'm not sure of the best way to go about this. Looping in Python is definitely different from what I'm used to (I mostly use C++).
I will edit to add anything needed, so please let me know if anything needs clarification. This isn't like any project I've ever had to do before, so the explanation may not be ideal.
EDIT: I think I skipped an important factor because it was implied by my sample code. I only need those counts to compare them, not to print them. If crednum != 2*purnum, then I want to print that phone number and only that phone number; otherwise it shouldn't appear in the output file. I never need to print the counts themselves, just use them to decide which phone numbers to print.
import csv

cred = open("AllCredits.csv", "r")
creader = csv.reader(cred)
pur = open("AllPurchases.csv", "r")
preader = csv.reader(pur)
out = open("output.txt", "r+")

def x(reader):                # function takes in a reader
    dictionary = {}           # a dict is a python data type of key/value pairs
    for row in reader:        # for each row in the reader
        number = row[0]       # take the first element in the row (the number)
        if number == 'TN':    # skip the headers
            continue
        number = int(number)  # convert it to a number now ('TN' cannot be converted, which is why we do it after)
        if number in dictionary:                     # if the number appeared already
            dictionary[number] = dictionary[number] + 1  # increment it
        else:
            dictionary[number] = 1                   # else store it in the dictionary as 1
    return dictionary         # return the dictionary

def assertDoubles(credits, purchases):
    outstr = ''
    for key in credits:
        crednum = credits[key]
        # .get(key, 0) returns 0 when the number never occurs in purchases,
        # which avoids a KeyError for numbers that are in cred but not in pur
        if crednum != 2 * purchases.get(key, 0):
            outstr += str(key) + '\n'
            print(key)
    out.write(outstr)

credits = x(creader)
purchases = x(preader)
assertDoubles(credits, purchases)
#print(credits)
#print('-------')
#print(purchases)

cred.close()
pur.close()
out.close()
I wrote some code. It essentially stores the number you're checking for duplicates as a key in a dictionary, and the value it stores is the number of occurrences of that number within the file. It skips the first line (headers).
Output is the following:
{2476: 2, 3654: 1}
-------
{2476: 1, 3654: 1, 3566: 1}
New code above simply outputs:
3654
EDIT: I updated the code to fix what you are referring to.
Since you're not interested in new entries, all you need is to run through the first file and collect all the entries in its first column (counting them in the process). Then run through the second file, check whether each of its first-column entries was collected in the first step, and if so, count those as well. You cannot avoid running the necessary number of loops to read all the lines of both files, but you can use a hashmap (dict) for blazingly fast lookups afterwards, so:
import csv
import collections

c_phones = collections.defaultdict(int)  # initiate a 'counter' dict to save us some typing

with open("AllCredits.csv", "r") as f:   # open the file for reading
    reader = csv.reader(f)               # create a CSV reader
    next(reader)                         # skip the first row (header)
    for row in reader:                   # iterate over the rest
        c_phones[row[0]] += 1            # increase the count of the current phone
Now that you have the counts of all the phone numbers from the first file stored in the c_phones dictionary, you can clone it with the counters reset, so you can count the occurrences of those same numbers in the second CSV file:
p_phones = {key: 0 for key in c_phones}  # reset the phone counter for purchases

with open("AllPurchases.csv", "r") as f:  # open the file for reading
    reader = csv.reader(f)                # create a CSV reader
    next(reader)                          # skip the first row (header)
    for row in reader:                    # iterate over the rest
        if row[0] in p_phones:            # we're only interested in phones from both files
            p_phones[row[0]] += 1         # increase the counter
And now that you have both dictionaries with both counts, you can easily iterate over them to print out the results:

for key in c_phones:
    print("{:<15} Credits: {:<4} Purchases: {:<4}".format(key, c_phones[key], p_phones[key]))
Which, with your example data, will yield:
3654 Credits: 1 Purchases: 1
2476 Credits: 2 Purchases: 1
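Since the EDIT above says you only need the numbers where the counts mismatch (crednum != 2*purnum), a short follow-up using the two dictionaries could look like this; output.txt is the file name from the question:

with open("output.txt", "w") as out:
    for phone, crednum in c_phones.items():
        if crednum != 2 * p_phones[phone]:  # p_phones holds a 0 for every cred number with no purchases
            out.write(phone + "\n")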
To help with my understanding, I've broken this problem into smaller, more manageable tasks:
Read phone numbers from the first column of two sorted csv files.
Find duplicate numbers that appear in both lists of phone numbers.
Reading the phone numbers is a reusable function, so let's separate it:
def read_phone_numbers(file_path):
    file_obj = open(file_path, 'r')
    reader = csv.reader(file_obj)
    next(reader)  # skip the header row so 'TN' isn't counted as a phone number
    phone_numbers = []
    for row in reader:
        phone_numbers.append(row[0])
    file_obj.close()
    return phone_numbers
For the task of finding duplicates a set() is a useful tool. From the python docs:
A set is an unordered collection with no duplicate elements.
def find_duplicates(credit_nums, purchase_nums):
    phone_numbers = set(credit_nums)  # the unique credit numbers
    duplicates = []
    for phone_number in phone_numbers:
        credit_count = credit_nums.count(phone_number)
        purchase_count = purchase_nums.count(phone_number)
        if credit_count > 0 and purchase_count > 0:
            duplicates.append({
                'phone_number': phone_number,
                'credit_count': credit_count,
                'purchase_count': purchase_count,
            })
    return duplicates
And to put it all together:
def main(credit_csv_path, purchase_csv_path, out_csv_path):
    credit_nums = read_phone_numbers(credit_csv_path)
    purchase_nums = read_phone_numbers(purchase_csv_path)
    duplicates = find_duplicates(credit_nums, purchase_nums)
    with open(out_csv_path, 'w') as file_obj:
        writer = csv.DictWriter(
            file_obj,
            fieldnames=['phone_number', 'credit_count', 'purchase_count'],
        )
        writer.writerows(duplicates)
If you need to process files that are hundreds of times larger, you can look into the collections.Counter class.
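For instance, a rough sketch of the counting step with collections.Counter, assuming the same layout as the question (phone number in the first column, one header row):

import csv
from collections import Counter

def count_phone_numbers(file_path):
    with open(file_path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        return Counter(row[0] for row in reader)

credit_counts = count_phone_numbers('AllCredits.csv')
purchase_counts = count_phone_numbers('AllPurchases.csv')
# a Counter returns 0 for missing keys, so numbers absent from purchases count as 0
mismatches = [tn for tn in credit_counts if credit_counts[tn] != 2 * purchase_counts[tn]]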
The way I understand your situation is that you have two files, namely cred and pur.
Now, for each TN in cred, find whether the same TN exists in pur; return the count if it exists, or 0 if it doesn't.
You can use pandas, and the algorithm can be as below:
Agg pur by TN and count.
For each row in cred, get the count, else 0.
Below is the example:
import pandas as pd

# read the csv
# i create my own as suggested in your desc
cred = pd.DataFrame(
    dict(
        TN = [2476, 2476, 3654],
        STUFF = ['hseqer', 'sthrtj', 'rstrhh'],
        THINGS = ['trjar', 'esreet', 'trwtr']
    ),
    columns = ['TN', 'STUFF', 'THINGS']
)

pur = pd.DataFrame(
    dict(
        TN = [2476, 3566, 3654, 2476],
        STUFF = ['hseher', 'sthztj', 'rstjhh', 'hseher'],
        THINGS = ['trjdr', 'esrhet', 'trjtr', 'trjdr']
    ),
    columns = ['TN', 'STUFF', 'THINGS']
)

dfpur = pur.groupby('TN').TN.count()  # agg and count (step 1)

# step 2
count = []
for row, tnval in enumerate(cred.TN):
    if cred.at[row, 'TN'] in dfpur.index:
        count.append(dfpur[tnval])
    else:
        count.append(0)
There you go! You have your counts in the list.
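As a side note, the same lookup can be done without the explicit loop; a sketch using Series.map, where TNs missing from pur become NaN and are then filled with 0:

# map each TN in cred to its count in pur; unmatched TNs become NaN, then 0
count = cred['TN'].map(dfpur).fillna(0).astype(int).tolist()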
The code is supposed to find duplicates by comparing FirstName, LastName, and Email. All duplicates should be written to the Dupes.csv file, and all uniques should be written to Deduplicated.csv, but this is currently not happening.
Example:
If row A shows up in Original.csv 10 times, the code writes A1 to deduplicated.csv, and it writes A2 - A10 to dupes.csv.
This is incorrect. A1 - A10 should ALL be written to the dupes.csv file, leaving only unique rows in deduplicated.csv.
Another strange behavior is that A2 - A10 are all getting written to dupes.csv TWICE!
I would really appreciate any and all feedback as this is my first professional python script and I'm feeling pretty disheartened.
Here is my code:
import csv

def read_csv(filename):
    the_file = open(filename, 'r', encoding='latin1')
    the_reader = csv.reader(the_file, dialect='excel')
    table = []
    # As long as the table row has values we will add it to the table
    for row in the_reader:
        if len(row) > 0:
            table.append(tuple(row))
    the_file.close()
    return table

def create_file(table, filename):
    join_file = open(filename, 'w+', encoding='latin1')
    for row in table:
        line = ""
        # build up the new row - no comma on the last item, so add it separately
        for i in range(len(row) - 1):
            line += row[i] + ","
        line += row[-1]
        # adds the string to the new file
        join_file.write(line + '\n')
    join_file.close()

def main():
    original = read_csv('Contact.csv')
    print('finished read')
    # hold duplicate values
    dupes = []
    # holds all of the values without duplicates
    dedup = set()
    # pairs to know if we have seen a match before
    pairs = set()
    for row in original:
        #if row in dupes:
        #    dupes.append(row)
        if (row[4], row[5], row[19]) in pairs:
            dupes.append(row)
        else:
            pairs.add((row[4], row[5], row[19]))
            dedup.add(row)
    print('finished first parse')
    # go through and add in one more of each duplicate
    seen = set()
    for row in dupes:
        if row in seen:
            continue
        else:
            dupes.append(row)
            seen.add(row)
    print('writing files')
    create_file(dupes, 'duplicate_leads.csv')
    create_file(dedup, 'deduplicated_leads.csv')

if __name__ == '__main__':
    main()
You should look into the pandas module for this, it will be extremely fast, and much easier than rolling your own.
import pandas as pd

x = pd.read_csv('Contact.csv')
# use the names of the columns you want to check
duplicates = x.duplicated(['row4', 'row5', 'row19'], keep=False)
x[duplicates].to_csv('duplicates.csv')   # write duplicates
x[~duplicates].to_csv('uniques.csv')     # write uniques
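If you'd rather stay with the standard library, the same keep-every-copy behavior takes two passes: first count each (FirstName, LastName, Email) key, then route every row whose key occurs more than once to the dupes file. A minimal sketch, reusing the column indices 4, 5, and 19 and the file names from the question:

import csv
from collections import Counter

def split_dupes(in_path, dupes_path, uniques_path, key_cols=(4, 5, 19)):
    with open(in_path, newline='', encoding='latin1') as f:
        rows = [row for row in csv.reader(f, dialect='excel') if row]

    # first pass: count how often each key occurs
    counts = Counter(tuple(row[i] for i in key_cols) for row in rows)

    # second pass: all copies of a duplicated key go to dupes, the rest to uniques
    with open(dupes_path, 'w', newline='', encoding='latin1') as d, \
         open(uniques_path, 'w', newline='', encoding='latin1') as u:
        dupes_writer, uniques_writer = csv.writer(d), csv.writer(u)
        for row in rows:
            key = tuple(row[i] for i in key_cols)
            (dupes_writer if counts[key] > 1 else uniques_writer).writerow(row)

split_dupes('Contact.csv', 'duplicate_leads.csv', 'deduplicated_leads.csv')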
I need help sorting a list from a text file. I'm reading a .txt and then adding some data, then sorting it by population change %, then lastly, writing that to a new text file.
The only thing that's giving me trouble now is the sort function. I think the for statement syntax is what's giving me issues -- I'm unsure where in the code I would add the sort statement and how I would apply it to the output of the for loop statement.
The population change data I am trying to sort by is the [1] item in the list.
#Read file into script
NCFile = open(r"C:\filelocation\NC2010.txt")
#Save a write file
PopulationChange = open(r"C:\filelocation\Sorted_Population_Change_Output.txt", "w")
#Read everything into lines, except for first (header) row
lines = NCFile.readlines()[1:]
#Pull relevant data and create population change variable
for aLine in lines:
    dataRow = aLine.split(",")
    countyName = dataRow[1]
    population2000 = float(dataRow[6])
    population2010 = float(dataRow[8])
    popChange = ((population2010 - population2000) / population2000) * 100
    outputRow = countyName + ", %.2f" % popChange + "%\n"
    PopulationChange.write(outputRow)
NCFile.close()
PopulationChange.close()
You can fix your issue with a couple of minor changes. Split the line as you read it in and loop over the sorted lines:
lines = [aLine.split(',') for aLine in NCFile][1:]

#Pull relevant data and create population change variable
for dataRow in sorted(lines, key=lambda row: row[1]):
    population2000 = float(dataRow[6])
    population2010 = float(dataRow[8])
    ...
However, if this is a csv you might want to look into the csv module. In particular, DictReader will read in the data as a list of dictionaries based on the header row. I'm making up the field names below, but you should get the idea. You'll notice I sort the data based on 'countyName' as it is read in:

from csv import DictReader, DictWriter

with open(r"C:\filelocation\NC2010.txt") as NCFile:
    reader = DictReader(NCFile)
    data = sorted(reader, key=lambda row: row['countyName'])

for row in data:
    population2000 = float(row['population2000'])
    population2010 = float(row['population2010'])
    popChange = ((population2010 - population2000) / population2000) * 100
    row['popChange'] = "{0:.2f}".format(popChange)

with open(r"C:\filelocation\Sorted_Population_Change_Output.txt", "w", newline='') as PopulationChange:
    # extrasaction='ignore' drops the columns that aren't in fieldnames
    # instead of raising a ValueError
    writer = DictWriter(PopulationChange, fieldnames=['countyName', 'popChange'],
                        extrasaction='ignore')
    writer.writeheader()
    writer.writerows(data)

This will give you a 2 column csv of ['countyName', 'popChange']. You would need to replace these with your correct fieldnames.
You need to read all of the lines in the file before you can sort them. I've created a list called change to hold tuple pairs of the population change and the county name. This list is sorted and then saved.
with open("NC2010.txt") as NCFile:
    lines = NCFile.readlines()[1:]

change = []
for line in lines:
    row = line.split(",")
    county_name = row[1]
    population_2000 = float(row[6])
    population_2010 = float(row[8])
    pop_change = ((population_2010 / population_2000) - 1) * 100
    change.append((pop_change, county_name))

change.sort()

output_rows = []
[output_rows.append("{0}, {1:.2f}\n".format(pair[1], pair[0]))
 for pair in change]

with open("Sorted_Population_Change_Output.txt", "w") as PopulationChange:
    PopulationChange.writelines(output_rows)
I used a list comprehension to generate the output rows, which swaps each pair back into the desired order, i.e. county name first.
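Incidentally, because tuples compare element by element, change.sort() orders primarily by pop_change; if you wanted the largest change first instead, you could sort in reverse:

change.sort(reverse=True)  # largest population change first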
I have two sets of data. Both have a little over 13,000 rows, and one of them (the one I open as a CSV in the main function) has two columns that I need to match up to the other file (opened as a text file and put into a list of dictionaries in the example_05() function).
They are from the same source, and I need to make sure the data stays aligned when I add the last two parameters to each row in the list of dicts, because the .csv file has about 20 extra rows compared to the list of dicts, so the .csv file must contain extra or null data.
To delete these anomalous rows, I'm comparing the indices of the list of Q* values from the .csv file to the {'Q*': ...} value in each dictionary within the list of dictionaries (each dictionary is a row), looking for mismatches, because they should be the same; when a mismatch is found I delete the item from the mass list before adding it to the list of dictionaries at the end of example_05().
When I try to compare them, I get an 'IndexError: list index out of range' at this line:
if row10['Q*'] != Q_list_2[check_index]:
Can anybody tell me why? Here's example_05() and the main function:
def example_05(filename):
    with open(filename, 'r') as file:
        data = file.readlines()
    header, data = data[0].split(), data[1:]
    # ...... convert each line to a dict, using header words as keys
    global kept
    kept = []
    for line in data:
        line = [to_float(term) for term in line.split()]
        kept.append(dict(zip(header, line)))
    del mass_list[0]
    mass_list_2 = [to_float(j) for j in mass_list]
    del Q_list[0]
    Q_list_2 = [to_float(k) for k in Q_list]
    print "Number in Q_list_2 list = "
    print len(Q_list_2)
    check_index = 0
    delete_index = 0
    for row10 in kept:
        if row10['Q*'] != Q_list_2[check_index]:
            del mass_list_2[delete_index]
            del Q_list_2[delete_index]
            check_index += 1
            delete_index += 1
        else:
            check_index += 1
            delete_index += 1
            continue
    k_index = 0
    for d in kept:
        d['log_10_m'] = mass_list_2[k_index]
        k_index += 1
    print "Number in mass_list_2 list = "
    print len(mass_list_2)

if __name__ == '__main__':
    f = open('MagandMass20150401.csv')
    csv_f = csv.reader(f)
    mag_list = []
    mass_list = []
    Q_list = []
    for row in csv_f:
        mag_list.append(row[17])
        mass_list.append(row[18])
        Q_list.append(row[15])
    del csv_f
    f.close()
    example_05('summ20150401.txt')
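For what it's worth, the likely cause is that check_index advances on every row of kept, while del Q_list_2[delete_index] shrinks the list on every mismatch; as soon as kept has more rows than the shrinking Q_list_2, the index runs past the end. (Note also that after a deletion the next element shifts into the current position, so the indices arguably shouldn't advance on that branch.) A tiny illustration of the effect:

values = [10, 20, 30]
i = 0
for _ in range(4):    # more iterations than remaining elements
    print(values[i])  # raises IndexError once i reaches len(values)
    del values[0]     # the list shrinks while i keeps growing
    i += 1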