import csv

TextFileContent = open('tickets.txt')

with open('example4.csv', 'w') as csvfile:
    fieldnames = ['Author', 'ticket number', 'Revision']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for TextLine in TextFileContent:
        if 'Revision:' in TextLine:
            tmp = TextLine.replace('Revision:', "")
            print(tmp)
            writer.writerow({'Revision': tmp})
        elif 'Author:' in TextLine:
            tmp = TextLine.replace("Author:", "")
            print(tmp)
            writer.writerow({'Author': tmp})
        elif 'Contributes to:' in TextLine:
            tmp = TextLine.replace("Contributes to:", "")
            print(tmp)
            writer.writerow({'ticket number': tmp})
Hi all, I have written the Python script above to extract the "Author", "Ticket" and "Revision" details from a text file and then fill that information into a CSV file.
I can now extract all the information, but the data is not filled into the CSV file correctly.
The text file content looks like this:
Revision: 22904
Author: Userx
Contributes to: CF-1159
Revision: 22887
Author: Usery
Contributes to: CF-955
Revision: 22884
Author: UserZ
Contributes to: CPL-7768
I want the result in the CSV file to look like this:
Author ticket number Revision
Userx CF-1159 22904
Usery CF-955 22887
UserZ CPL-7768 22884
Your code writes a row as soon as it finds any field instead of waiting until it has read a full set of fields. The following edit waits for a full set and then writes the row.
with open('/tmp/out.csv', 'w') as csvfile:
    fieldnames = ['Author', 'ticket number', 'Revision']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    row = {}
    for TextLine in TextFileContent:
        if 'Revision:' in TextLine:
            row['Revision'] = TextLine.replace('Revision: ', "")
        elif 'Author:' in TextLine:
            row['Author'] = TextLine.replace("Author: ", "")
        elif 'Contributes to:' in TextLine:
            row['ticket number'] = TextLine.replace("Contributes to: ", "")
        if len(row) == len(fieldnames):
            writer.writerow(row)
            row = {}
Note that this will not function correctly unless all records contain all fields.
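A hedged variant that tolerates incomplete records, assuming each record starts with a 'Revision:' line as in the sample data: flush the current row whenever a new record begins and once more after the loop, so missing fields are simply left blank.
import csv

# Hedged sketch, assuming each record starts with a 'Revision:' line as in the sample data.
with open('tickets.txt') as TextFileContent, open('/tmp/out.csv', 'w', newline='') as csvfile:
    fieldnames = ['Author', 'ticket number', 'Revision']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    row = {}
    for TextLine in TextFileContent:
        if 'Revision:' in TextLine:
            if row:  # a new record begins: write whatever fields we collected
                writer.writerow(row)
            row = {'Revision': TextLine.replace('Revision:', '').strip()}
        elif 'Author:' in TextLine:
            row['Author'] = TextLine.replace('Author:', '').strip()
        elif 'Contributes to:' in TextLine:
            row['ticket number'] = TextLine.replace('Contributes to:', '').strip()
    if row:  # flush the last record
        writer.writerow(row)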
Related
I'm currently attempting to create a program that can read a CSV, determine whether a substring is included in one of the columns of each row, and, if it isn't present, write certain columns out to a new CSV. I have the code for this much, but the CSV I need to run the program against has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data; it can only view the CSV in a read-only format, which doesn't let me use it. I know pandas has a chunk-size feature, but I don't know how to implement it with the rest of my code.
def reading(csv_input):
    originalLength = 0
    rowCount = 0
    with open(f'Web Report {csv_input}', 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
        dropCount = 0
        data = pd.read_csv(csv_input, chunksize=100000)
        df = pd.DataFrame(data,
                          columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
                                   'URL Category', 'Action', 'Action Description'])
        originalLength = len(df.index)
        for line in range(originalLength):
            dataLine = df.loc[line]
            x = dataLine.get(key='Action')
            if x == 0:
                siteName = dataLine.get(key='Site Name')
                if 'dbk' in siteName:
                    dropCount = dropCount + 1
                elif 'ptc' in siteName:
                    dropCount = dropCount + 1
                elif 'wcf' in siteName:
                    dropCount = dropCount + 1
                elif 'google' in siteName:
                    dropCount = dropCount + 1
                else:
                    writer.writerow([line,  # Original Index
                                     df.loc[line].get(key='URL Category'),  # Original URL Category
                                     df.loc[line].get(key='User IP'),  # Original User IP
                                     df.loc[line].get(key='Site Name')])  # Original Site Name
                    rowCount = rowCount + 1
            else:
                dropCount = dropCount + 1
        file.close()
    print("Input: " + str(csv_input))
    print("Output: " + str(file.name))
    print("Original Length: " + str(originalLength))
    print("Current Length: " + str(rowCount))
    print("Drop Count: " + str(dropCount) + "\n")
    return df
If you use csv to write the file, then you could also use it to read the input row by row.
import csv

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:  # read row by row
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
If you want to use pandas with chunks, then you should use a for-loop for this.
And when you write with pandas, you need append mode without headers.
import pandas as pd

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create new file with headers
            df.to_csv('output.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv

# --- create some data ---

data = {
    'A': range(0, 10),
    'B': range(10, 20),
    'C': range(20, 30),
}  # columns

df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)

# --- read and write with `pandas` ---

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create new file with headers
            df.to_csv('output_pandas.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output_pandas.csv', mode='a', header=False)

# --- read and write with `csv` ---

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
Doc: read_csv(), to_csv()
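The same chunked pattern could be applied to the original question. This is a hedged sketch on my part: the column names and drop words are taken from the question's code, while the file names and chunk size are placeholders.
import pandas as pd

drop_words = ('dbk', 'ptc', 'wcf', 'google')
first = True

for chunk in pd.read_csv('input.csv', chunksize=100000):
    # keep rows where Action == 0 and 'Site Name' contains none of the drop words
    mask = (chunk['Action'] == 0) & ~chunk['Site Name'].str.contains('|'.join(drop_words), na=False)
    kept = chunk.loc[mask, ['URL Category', 'User IP', 'Site Name']]
    # the first chunk creates the file with headers, later chunks append without them
    kept.to_csv('output.csv', mode='w' if first else 'a', header=first)
    first = False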
I'm using Python 2.7 (I know....) and cannot import any PyPI modules (eyeroll) - work controls downloads, upgrades etc.
I am trying to edit data within a CSV file that I created from a feature class. The feature-class-to-CSV export works fine.
I have tried 'with open' to access the CSV, but it keeps erroring with RuntimeError: cannot open 'file'.
I have also reverted to using arcpy.da.SearchCursor; it errors with IOError: 'no such file or directory'. The file does exist and opens in ArcMap and Notepad++.
(I previously had a working version that edited the data using a search cursor but did not maintain the data order - so now I create the CSV FIRST instead of at the end.)
Here is the script snippet:
def fc_to_csv():
    fields = ["ID", "ADMIN_ORG", "RTE_CN", "GIS_MILES"]
    with open('{}.csv'.format(roadcore), 'wb') as outf:
        dw = csv.DictWriter(outf, fieldnames=fields)
        dw.writeheader()
        with arcpy.da.SearchCursor(fc, fields) as rows:
            for row in rows:
                if row[3] == 0:
                    pass
                elif row[3] == None:
                    pass
                else:
                    dw.writerow(dict(zip(fields, row)))
    print "File written: {}".format(roadcore)

#********FORMATTING LABELS********
def newlabel():
    with open(roadcore, 'rb') as csvfile:
        rc_reader = csv.reader(csvfile, delimiter=',')
        for row in rc_reader:
        # with arcpy.da.SearchCursor(roadcore, fields) as sc:
        #     for row in sc:
            if row[0].startswith(' '):
                labels.append('{}'.format(
                    row[0]
                    .replace(' ', '')))
            elif row[1].startswith('01'):
                labels.append('{}'.format(
                    row[0]
                    .lstrip(ascii_letters)
                    .replace(' ', '')
                    .replace('.', '-')
                    .lstrip('0')))
The DBF export isn't working either - but I guess I'll tackle one issue at a time. The full code is:
fc = (r'D:\JJ_Development\Projects\Basemap\Transportation\Roads.gdb\RoadCore_Existing')
fields = ["ID", "ADMIN_ORG", "RTE_CN", "GIS_MILES"]  # Don't use object id - when data is updated this can change.
identical = ["ID", "ADMIN_ORG", "RTE_CN"]
labels = []
rd_names = (' ROAD', ' STREET', ' DRIVE', ' BLUFF', ' COVE', ' LOOP', ' LANE', ' RD', ' LN', ' DR')
# ADJUST R08 To skip Puerto Rico... all named roads.

#********Initial FS Roadcore Existing Data Cleanup********
#rc_existing = os.path.join(curr_dir, 'roadcore')
roadcore = os.path.join(curr_dir, 'rdlabels')

def fc_to_csv():
    fields = ["ID", "ADMIN_ORG", "RTE_CN", "GIS_MILES"]
    with open('{}.csv'.format(roadcore), 'wb') as outf:
        dw = csv.DictWriter(outf, fieldnames=fields)
        dw.writeheader()
        with arcpy.da.SearchCursor(fc, fields) as rows:
            for row in rows:
                if row[3] == 0:
                    pass
                elif row[3] == None:
                    pass
                else:
                    dw.writerow(dict(zip(fields, row)))
    print "File written: {}".format(roadcore)

#********FORMATTING LABELS********
def newlabel():
    with open(roadcore, 'rb') as csvfile:
        rc_reader = csv.reader(csvfile, delimiter=',')
        for row in rc_reader:
        # with arcpy.da.SearchCursor(roadcore, fields) as sc:
        #     for row in sc:
            if row[0].startswith(' '):
                labels.append('{}'.format(
                    row[0]
                    .replace(' ', '')))
            elif row[1].startswith('01'):
                labels.append('{}'.format(
                    row[0]
                    .lstrip(ascii_letters)
                    .replace(' ', '')
                    .replace('.', '-')
                    .lstrip('0')))
            elif row[1].startswith('02'):
                labels.append('{}'.format(row[0].lstrip(ascii_letters).replace(' ', '').replace('.', '-').lstrip('0')))
            elif row[1].startswith('03'):
                labels.append('{}'.format(
                    row[0]
                    .lstrip(ascii_letters)
                    .replace(' ', '')
                    .replace('.', '-')
                    .lstrip('0')))
            elif row[1].startswith('04'):
                labels.append('{}'.format(
                    row[0]
                    .lstrip(ascii_letters)
                    .replace('-ADMIN', '')
                    .replace('-A-ADMIN', '')
                    .replace('-PARKING', '')[2:]
                    .lstrip('0')
                    .replace('.', '-')))
            elif row[1].startswith('05'):
                labels.append('{}'.format(
                    row[0].rstrip('0')
                    .lstrip('0')
                    .lstrip(ascii_letters)
                    .replace('.', '-')))
            elif row[1].startswith('0501'):
                # Note: 0501 forest label data entered in unicode.
                labels.append('{}'.format(
                    row[0].rstrip('0').split('.', 1)
                    .replace(' ', '')))
            elif row[1].startswith('06'):
                labels.append('{}'.format(
                    row[0].rstrip('0')
                    .replace('.', '-')
                    .lstrip(ascii_letters)))
            elif row[1].startswith('06'):
                labels.append('{}'.format(
                    row[0].rstrip(ascii_letters)))
            elif row[1].startswith('08'):
                labels.append('{}'.format(
                    row[0].lstrip(ascii_letters)))
                # Note: R08 label data entered in unicode.
            elif row[1].startswith('09'):
                # Note: R09 label data entered in unicode.
                labels.append('{}'.format(
                    row[0].lstrip('0')
                    .replace('.', '-')
                    .lstrip(ascii_letters)))
            elif row[1].startswith('10'):
                labels.append('{}'.format(
                    row[0]
                    .lstrip('0')
                    .replace('.', '-')
                    .rstrip('0')
                    .lstrip(ascii_letters)))
            else:
                labels.append(row[0])
    # print(labels)
    print('Label formatting done')

#********CONVERT CSV TO DBASE TABLE IN GDB********
def cleanup_labels():
    with open('roadcore', 'rb') as csvfile:
        rc_reader = csv.reader(csvfile, delimiter=',')  # , quotechar='|'
        for row in rc_reader:
            if row[0].startswith('-'):
                labels.append('{}'.format(row[0].replace('-', '')))
            elif row[0].endswith('-'):
                labels.append('{}'.format(row[0].replace('-', '')))
            elif row[0].endswith(rd_names):
                labels.append('{}'.format(row[0].strip(ascii_letters)))
    csvfile.close()

# Python 2.7 will not go directly from a list to a dbf. Python 3 has added a dbf module, step not needed in future versions.
#********CONVERT CSV TO DBASE TABLE IN GDB********
def dbfexport():
    roadsgdb = arcpy.CreateFileGDB_management(curr_dir, 'Roads_labels.gdb')
    print('gdb created')
    arcpy.TableToDBASE_conversion('roadcore', roadsgdb)
    # for filenames in os.listdir(curr_dir):
    #     if filenames.endswith('.csv'):
    #         arcpy.TableToTable_conversion(os.path.join(curr_dir, 'roadcore'), roadsgdb, os.path.splitext(filenames)[0] + ".dbf")
    print('dBASE created')

if __name__ == '__main__':
    fc_to_csv()  # functions
    newlabel()
    # cleanup_labels()
    # dbfexport()
    end = time.time()
    print(end - start)
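One possible cause suggested by the code itself: fc_to_csv() writes to '{}.csv'.format(roadcore), while newlabel() opens roadcore without the '.csv' extension, which would produce exactly the 'no such file or directory' error described above. A minimal hedged sketch of that change (everything else unchanged):
# Hedged sketch: open the path that fc_to_csv() actually wrote (roadcore + '.csv').
def newlabel():
    with open('{}.csv'.format(roadcore), 'rb') as csvfile:
        rc_reader = csv.reader(csvfile, delimiter=',')
        next(rc_reader)  # skip the header row written by DictWriter
        for row in rc_reader:
            ...  # label formatting as in the original newlabel()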
Is there a way I can use Python to take my animals.txt file results, convert them to CSV, and format them differently?
Currently the animals.txt file looks like this:
ID:- 512
NAME:- GOOSE
PROJECT NAME:- Random
REPORT ID:- 30321
REPORT NAME:- ANIMAL
KEYWORDS:- ['"help,goose,Grease,GB"']
ID:- 566
NAME:- MOOSE
PROJECT NAME:- Random
REPORT ID:- 30213
REPORT NAME:- ANIMAL
KEYWORDS:- ['"Moose, boar, hansel"']
I would like the CSV file to present it as:
ID, NAME, PROJECT NAME, REPORT ID, REPORT NAME, KEYWORDS
Followed by the results underneath each header
Here is a script I have written:
import re
import csv

with open("animals.txt") as f:
    text = f.read()

data = {}
keys = ['ID', 'NAME', 'PROJECT NAME', 'REPORT ID', 'REPORT NAME', 'KEYWORDS']
for k in keys:
    data[k] = re.findall(r'%s:- (.*)' % k, text)

csv_file = 'out.csv'
with open(csv_file, 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=keys)
    writer.writeheader()
    for x in data:
        writer.writerow(x)
An easy way to do this is to parse with a regex and store the results in a dict, just before you write the final CSV:
import re

# `text` is your input text
data = {}
keys = ['ID', 'NAME', 'PROJECT NAME', 'REPORT ID', 'REPORT NAME', 'KEYWORDS']
for k in keys:
    # ^ anchor with re.M so 'ID' does not also match the 'REPORT ID' lines
    data[k] = re.findall(r'^%s:- (.*)' % k, text, re.M)
And to CSV:
import csv

csv_file = 'out.csv'
with open(csv_file, 'w') as csvfile:
    writer = csv.writer(csvfile, quoting=csv.QUOTE_NONE, escapechar='\\')
    writer.writerow(data.keys())
    for i in range(len(data[keys[0]])):
        writer.writerow([data[k][i] for k in keys])
Output in csv:
ID,NAME,PROJECT NAME,REPORT ID,REPORT NAME,KEYWORDS
512,GOOSE,Random,30321,ANIMAL,['\"help\,goose\,Grease\,GB\"']
566,MOOSE,Random,30213,ANIMAL,['\"Moose\, boar\, hansel\"']
Note that I used the re.M multiline mode (with a ^ anchor) because there's a trick in your text: without it, the ID pattern would also match the REPORT ID lines. The default row writing also needed to be adjusted.
It also uses \ to escape the quotes.
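As an additional hedged sketch (my addition, assuming the same animals.txt input): the question's original DictWriter approach can also be made to work by zipping the parallel lists into one dict per record.
import csv
import re

# Hedged sketch: build one dict per record from the parallel lists, then write with DictWriter.
with open("animals.txt") as f:
    text = f.read()

keys = ['ID', 'NAME', 'PROJECT NAME', 'REPORT ID', 'REPORT NAME', 'KEYWORDS']
# ^ anchor with re.M so 'ID' does not also match the 'REPORT ID' lines
data = {k: re.findall(r'^%s:- (.*)' % k, text, re.M) for k in keys}

with open('out.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=keys)
    writer.writeheader()
    # one row per record: take the i-th value of every column
    for values in zip(*(data[k] for k in keys)):
        writer.writerow(dict(zip(keys, values)))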
This should work:
fname = 'animals.txt'
with open(fname) as f:
    content = f.readlines()
content = [x.strip() for x in content]

output = 'ID, NAME, PROJECT NAME, REPORT ID, REPORT NAME, KEYWORDS\n'
line_output = ''
for i in range(0, len(content)):
    if content[i]:
        line_output += content[i].split(':-')[-1].strip() + ','
    elif not content[i] and not content[i - 1]:
        output += line_output.rstrip(',') + '\n'
        line_output = ''
output += line_output.rstrip(',') + '\n'
print(output)
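If the result should end up in a file rather than printed, the assembled string can simply be written out (my addition; the filename is a placeholder):
# Hedged follow-up: write the assembled CSV string to a file.
with open('out.csv', 'w') as f:
    f.write(output)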
Here is the same idea in AutoIt (www.autoitscript.com):
Global $values_A = StringRegExp(FileRead("json.txt"), '[ID|NAME|KEYWORDS]:-\s(.*)?', 3)
For $i = 0 To UBound($values_A) - 1 Step +6
    FileWrite('out.csv', $values_A[$i] & ',' & $values_A[$i + 1] & ',' & $values_A[$i + 2] & ',' & $values_A[$i + 3] & ',' & $values_A[$i + 4] & ',' & $values_A[$i + 5] & @CRLF)
Next
I am new to Python and am using version 2.7.1 as part of Hyperion FDMEE.
I have a file in which I need to reorder the columns and, in the same file, split one column into three.
Source file:
ACCOUNT;UD1;UD2;UD3;PERIOD;PERIOD;AMOUNT
QTY;032074;99953;53;2017.07.31;2017.07.31;40.91
COGS;032074;99953;53;2017.07.31;2017.07.31;-7488.36
TURNOVER;032074;99953;53;2017.07.31;2017.07.31;505.73
QTY;032075;99960;60;2017.07.31;2017.07.31;40.91
COGS;032075;99960;60;2017.07.31;2017.07.31;-7488.36
TURNOVER;032075;99960;60;2017.07.31;2017.07.31;505.73
I have managed to reorder the columns with this script:
infilename = fdmContext["OUTBOXDIR"] + "/Targit_1707.dat"
outfilename = fdmContext["OUTBOXDIR"] + "/TargitExport.csv"

import csv

infile = open(infilename, 'r')
outfile = open(outfilename, 'w+')
for line in infile:
    column = line.split(';')
    outfile.write(column[1] + ";" + column[2] + ";" + column[3] + ";" + column[4] + ";" + column[0] + ";" + str(column[6].strip('\n')) + ";201701" + "\n")
outfile.close()
infile.close()
Producing the result:
UD1;UD2;UD3;PERIOD;ACCOUNT;AMOUNT;201701
032074;99953;53;2017.07.31;QTY;40.91;201701
032074;99953;53;2017.07.31;COGS;-7488.36;201701
032074;99953;53;2017.07.31;TURNOVER;505.73;201701
032075;99960;60;2017.07.31;QTY;40.91;201701
032075;99960;60;2017.07.31;COGS;-7488.36;201701
032075;99960;60;2017.07.31;TURNOVER;505.73;201701
However, I am struggling to transpose the Account column (QTY, COGS, TURNOVER) into separate columns, as in the example below:
UD1;UD2;UD3;PERIOD;QTY;COGS;TURNOVER;201701
032074;99953;53;2017.07.31;40.91;-7488.36;505.73;201701
032075;99960;60;2017.07.31;40.91;-7488.36;505.73;201701
Any suggestions would be very much appreciated.
Use a dict, for instance:
import csv

# infile and outfile are the input and output files opened as in the question
fieldnames = infile.readline()[:-1]
fieldnames = fieldnames.split(';')[1:5] + ['QTY', 'COGS', 'TURNOVER']
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
writer.writeheader()

record_dict = {}
for i, line in enumerate(infile):
    if not line:
        break
    line = line[:-1].split(';')

    # Assign column data every 1, 2, 3 lines
    mod_row = (i % 3) + 1
    if mod_row == 1:
        record_dict['QTY'] = line[6]
        record_dict['UD1'] = line[1]
        # ... and so on
    if mod_row == 2:
        record_dict['COGS'] = line[6]
    if mod_row == 3:
        record_dict['TURNOVER'] = line[6]
        writer.writerow(record_dict)
        record_dict = {}
Output:
UD1,UD2,UD3,PERIOD,QTY,COGS,TURNOVER
032074,,,,40.91,-7488.36,505.73
032075,,,,40.91,-7488.36,505.73
Tested with Python: 3.4.2
Read about:
Python » 3.6.1 Documentation: csv.DictWriter
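As a hedged, more complete alternative (my addition, not part of the answer above): group each record on UD1/UD2/UD3/PERIOD in a dict so the three ACCOUNT rows collapse into one pivoted output row, regardless of ordering. The file names are placeholders.
import csv

# Hedged sketch: collect the three ACCOUNT rows of each UD1/UD2/UD3/PERIOD group
# into one dict, then write a single pivoted row per group.
rows = {}
with open('Targit_1707.dat') as infile:
    reader = csv.reader(infile, delimiter=';')
    next(reader)  # skip the source header line
    for row in reader:
        if len(row) < 7:  # skip blank or short lines
            continue
        account, ud1, ud2, ud3, period, _, amount = row[:7]
        rows.setdefault((ud1, ud2, ud3, period), {})[account] = amount.strip()

with open('TargitExport.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=';')
    writer.writerow(['UD1', 'UD2', 'UD3', 'PERIOD', 'QTY', 'COGS', 'TURNOVER', '201701'])
    for (ud1, ud2, ud3, period), amounts in rows.items():
        writer.writerow([ud1, ud2, ud3, period,
                         amounts.get('QTY', ''),
                         amounts.get('COGS', ''),
                         amounts.get('TURNOVER', ''),
                         '201701'])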
While going through the examples of the Dedupe library in Python, which is used for record deduplication, I found that it creates a Cluster ID column in the output file which, according to the documentation, indicates which records refer to each other. However, I am not able to work out the relation between the Cluster ID and how it helps in finding duplicate records. If anyone has an insight into this, please explain it to me. This is the code for the deduplication.
# This can run either as a python2 or python3 code
from future.builtins import next

import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

input_file = 'data/csv_example_input_with_true_ids.csv'
output_file = 'data/csv_example_output1.csv'
settings_file = 'data/csv_example_learned_settings'
training_file = 'data/csv_example_training.json'

# Clean or process the data
def preProcess(column):
    try:
        column = column.decode('utf-8')
    except AttributeError:
        pass
    column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column

# Read in the data from CSV file:
def readData(filename):
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
    return data_d

print('importing data ...')
data_d = readData(input_file)

if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]
    deduper = dedupe.Dedupe(fields)
    deduper.sample(data_d, 15000)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print('starting active labeling...')
    dedupe.consoleLabel(deduper)
    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

threshold = deduper.threshold(data_d, recall_weight=1)

print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)

print('# duplicate sets', len(clustered_dupes))

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id": cluster_id,
            "canonical representation": canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)
    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)
Thanks in advance
You're right, the Cluster ID isn't used for anything.
You should look at the Cluster ID as the output of the dedupe execution. Dedupe is not interested in merging your records. Its core focus is to attempt to identify records that are likely similar.
It does this by assigning rows that it thinks are similar with the same Cluster ID.
It is your job as the Software Engineer to then use that data in an intelligent way and decide how you want to merge that data (if at all).
If my input is the following:
my output will be something like the following:
So, remember: your input number of records should always match dedupe's output number of records. The only difference is that you have a new column, "Cluster ID", that you can now use to "group" your likely duplicates.
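For illustration, a hedged sketch (not part of the original answer) of one way to use that Cluster ID column: read dedupe's output with pandas and group the rows it marked as likely duplicates. The file and column names follow the example code above.
import pandas as pd

# Hedged sketch: group the output rows by the Cluster ID that dedupe assigned.
out = pd.read_csv('data/csv_example_output1.csv')
for cluster_id, group in out.groupby('Cluster ID'):
    if len(group) > 1:  # clusters with more than one record are the likely duplicates
        print('--- cluster', cluster_id, '---')
        print(group[['Site name', 'Address', 'Phone']])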