How to find a data in CSV file with python - python

I have a problem: I want to search for data in a CSV file with Python.
My code looks like this:
#search process area
area_proses = []
sg1 = []
sg2 = []
sg3 = []
avg = []
#input number you want to search
number = raw_input('Masukan id Spesific Goal\n')
#read csv, and split on "," the line
csv_file = csv.reader(open('C:/xampp_2/htdocs/SkripsiV2/fuzzy/download.csv', "rb"), delimiter=",")
#loop through csv list
for row in csv_file:
area_proses.append(row[1])
sg1.append(row[2])
sg2.append(row[3])
sg3.append(row[4])
avg.append(row[5])
#if current rows 1nd value is equal to input, print that row
if number == row[0]:
#masukan data
print(area_proses,sg1,sg2,sg3,avg)
My problem is that when I search for id 11 the output looks like this:
(['area_proses', 'Service Delivery'], ['sg1', '3.71'], ['sg2', '3.48'], ['sg3',
'3.30'], ['avg', '3.50'])
but when I search for id 12 the output looks like this:
(['area_proses', 'Service Delivery', 'Incident Resolution and Prevention'], ['sg
1', '3.71', '3.83'], ['sg2', '3.48', '3.65'], ['sg3', '3.30', '3.70'], ['avg', '
3.50', '3.73'])
How can I solve this problem?
Download.csv
"id","area_proses","sg1","sg2","sg3","avg","fuzzy",
"11","Service Delivery","3.71","3.48","3.30","3.50","0.00000000000",
"12","Incident Resolution and Prevention","3.83","3.65","3.70","3.73","0.00000000000",
"13","Service System Development","3.93","3.29","3.26","3.49","0.00000000000",
"14","Service System Transition","3.00","3.43","0.00","3.22","0.00000000000",
"15","Strategic Service Management","3.48","3.86","0.00","3.67","0.00000000000",
"16","Configuration Management","3.14","3.57","0.00","3.36","0.00000000000",
"17","Measurement and Analysis","2.93","3.18","0.00","3.06","0.00000000000",

Try using the pandas library. Install it, then do:
import pandas as pd
df = pd.read_csv('csv_file.csv')
df[df['id'] == number]

Just change 'rb' to 'r' (in Python 3 the csv module needs a file opened in text mode) and print only the row that matches:
# Text mode ("r") is what the Python 3 csv module expects. The context
# manager closes the file even if reading stops early with an error.
with open(file_loc, "r") as fopn:
    csv_file = csv.reader(fopn)
    for row in csv_file:
        # Blank lines come back as empty lists, so check the row is
        # non-empty before comparing its first column with the wanted id.
        if row and number == row[0]:
            print(row)

Related

Slice data frame without pandas based on user input

I'm trying to finish my code, but I have a problem: how do I slice a data frame based on the user's input? Is there any way to do this without pandas?
def dataSet_read():
enter = input('Enter file path:')
csvreader = csv.reader(open(enter))
head_inp = input('Has the file headers? Select Y or N:\n').upper()
header = []
if head_inp == 'Y':
header = next(csvreader)
print('\nFile headers:\n\n', header)
elif head_inp == 'N':
print("'\nFile doesn't have headers")
else:
print('Incorrect selection!!!')
sys.exit()
with open(str(enter), "r") as csvfile:
reader_variable = csv.reader(csvfile, delimiter = ",")
rows_inp = input("\nPlease provide range which you'd like to see using ',', otherwise all dataframe will open all dataset.\n")
if rows_inp == '':
for row in reader_variable:
print(row)
else:
print("????")
Cast the reader to a list; then you can slice it just like a normal list.
enter = input('Enter file path:')
# input() returns a string; slice bounds must be integers, so convert once.
rows_inp = int(input("slice"))
with open(enter, 'r') as f:
    reader_variable = csv.reader(f)
    # A csv reader is a one-shot iterator; a list can be sliced and indexed.
    reader_list = list(reader_variable)
    for row in reader_list[:rows_inp]:  # if you want slice the whole data
        current_date = row[:rows_inp]  # if you want slice per row
        print(current_date)
I found the way to get what I need, maybe it's not the best approach but works :)
with open(str(enter), "r") as csvfile:
reader_variable = csv.reader(csvfile, delimiter = ",")
rows_inp = input("\nPlease provide range which you'd like to see using ',', otherwise all dataframe will open all dataset.\n")
if rows_inp == '':
for row in reader_variable:
print(row)
else:
i, j = rows_inp.split(',')
reader_list = list(reader_variable)
print(reader_list[int(i):int(j)+1])

Is there a way to read and alter the contents of a huge csv file in PyCharm?

I'm attempting to create a program currently that can read a csv, determine if a substring is included in one of the columns of each row, and if it isn't present, rewrites certain columns to a new csv. I have the code down for this much- but the csv I need to use the program for has well over 3 million rows. I use PyCharm and currently I'm not able to process this much data. It can only view the csv in a read-only format which doesn't allow me to use it. I know pandas has a chunk size feature but I don't know how to implement this with the rest of my code.
def reading(csv_input):
originalLength = 0
rowCount = 0
with open(f'Web Report {csv_input}', 'w') as file:
writer = csv.writer(file)
writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
dropCount = 0
data = pd.read_csv(csv_input, chunksize=100000)
df = pd.DataFrame(data,
columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
'URL Category', 'Action', 'Action Description'])
originalLength = len(df.index)
for line in range(originalLength):
dataLine = df.loc[line]
x = dataLine.get(key='Action')
if x == 0:
siteName = dataLine.get(key='Site Name')
if 'dbk' in siteName:
dropCount = dropCount + 1
elif 'ptc' in siteName:
dropCount = dropCount + 1
elif 'wcf' in siteName:
dropCount = dropCount + 1
elif 'google' in siteName:
dropCount = dropCount + 1
else:
writer.writerow([line, # Original Index
df.loc[line].get(key='URL Category'), # Original URL Category
df.loc[line].get(key='User IP'), # Original User IP
df.loc[line].get(key='Site Name')]) # Original Site Name
rowCount = rowCount + 1
else:
dropCount = dropCount + 1
file.close()
print("Input: " + str(csv_input))
print("Output: " + str(file.name))
print("Original Length: " + str(originalLength))
print("Current Length: " + str(rowCount))
print("Drop Count: " + str(dropCount) + "\n")
return df
If you use csv to write file then you could use it also to read row by row.
import csv
# newline='' lets the csv module handle line endings itself; without it the
# writer produces an extra blank line after every row on Windows.
with open('input.csv', newline='') as infile, \
        open('output.csv', 'w', newline='') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)
    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)
    # process rows
    for row in csv_reader:  # read row by row
        # keep only rows whose first column holds an even number
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
If you want to use pandas with chunk then you should use for-loop for this.
And when you write with pandas then you need append mode without headers.
import pandas as pd
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create new file with headers
df.to_csv('output.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv
# --- create some data ---
data = {
'A': range(0,10),
'B': range(10,20),
'C': range(20,30),
} # columns
df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)
# --- read and write with `pandas` ---
first = True
for df in pd.read_csv('input.csv', chunksize=1): # read row by row
# keep only rows with even index
if df.index % 2 == 0:
print('--- row ---')
print(df)
if first:
# create empty with headers
df.to_csv('output_pandas.csv', mode='w')
first = False
else:
# append to existing file without headers
df.to_csv('output_pandas.csv', mode='a', header=False)
# --- read and write with `csv` ---
# newline='' lets the csv module handle line endings itself; without it the
# writer produces an extra blank line after every row on Windows.
with open('input.csv', newline='') as infile, \
        open('output.csv', 'w', newline='') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)
    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)
    # process rows
    for row in csv_reader:
        # keep only rows whose first column holds an even number
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
Doc: read_csv(), to_csv()

Returning a row that matches specified condition, and edit particular columns in row. Then write to csv file with changed row

I'm writing a Python script that works with two csv files. Let's call them csv1.csv (the original file to read) and csv2.csv (an exact copy of csv1). The goal is to find the row and column in the csv file that correspond to the modified user-defined input.
csv format:(continues for about 2-3 thousand lines)
record LNLIM, ID_CO,OD_DV,ID_LN, ST_LN, ZST_LN, ID_LNLIM,LIMIT1_LNLIM, LIMIT2_LNLIM, LIMIT3_LNLIM
LNLIM, 'FPL', 'SOUT', '137TH_LEVEE_B', 'B', '137TH_AV', 'LEVEE', 'A', 1000, 1100, 1200
LNLIM, 'FPL', 'SOUT', '137TH_DAVIS_B', 'A', '137TH_AV', 'NEWTON', 'A', 1000, 1100, 1200
...
Let's say that the user is looking for 137TH_AV and NEWTON. I want to be able to go row by row and compare the two columns/row indices ST_LN and ZST_LN. If both columns match what the user inputted then I want to capture which row in the csv file that happened on, and use that information to edit the remaining columns LIMIT1_LNLIM LIMIT2_LNLIM LIMIT3_LNLIM on that row with new analog values.
I want to get the 3 new values provided by the user and edit a specific row, and a specific row element. Once I've found the place to replace the number values I want to overwrite csv2.csv with this edit.
Determining where the line segment is located in the array
import sys
import csv
import os
import shutil
LineSectionNames = []
ScadaNames = []
with open('Vulcan_Imp_Summary.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
LineSectionName = row[1]
ScadaName = row[29]
LineSectionNames.append(LineSectionName)
ScadaNames.append(ScadaName)
#Reformatting arrays for accurate references
LineSectionNames = [character.replace('\xa0', ' ') for character in LineSectionNames]
LineSectionNames = [character.replace('?', '-') for character in LineSectionNames]
ScadaNames = [character.replace('\xa0', ' ') for character in ScadaNames]
#Setting Line Section name as key and Scada name as value
ScadaDict = {}
for i in range(len(LineSectionNames)):
ScadaDict[LineSectionNames[i]] = ScadaNames[i]
#Prompt user for grammatical name of Line Section
print ('Enter the Line Section Name: (Example = Goulds-Princeton) \n')
user_input = input()
#Reference user input to dictionary value to convert input into SCADA format
def reformat():
print ('Searching for Line Section...' + user_input)
if user_input in ScadaDict:
value = ScadaDict[user_input]
print ('\n\t Match!\n')
else:
print ('The Line Section name you have entered was incorrect. Try again. \n Example = Goulds-Princeton')
reformat()
# Copying the exported file from Genesys
path = 'I://PSCO//DBGROUP//PatrickL//'
shutil.copyfile(path + 'lnlim_import.csv', path + 'lnlim_import_c.csv')
#Using the SCADA format to search through csv file
print ('Searching csv file for...' + user_input)
# Reading the copied file
record_lnlims = []
id_cos = []
id_dvs = []
id_lines = []
id_lns = []
st_lns = []
zst_lns = []
id_lnlims = []
limit1_lnlims = []
limit2_lnlims = []
limit3_lnlims = []
with open('lnlim_import_c.csv', 'r') as copy:
reader = csv.reader(copy)
for row in reader:
record_lnlim = row[0]
id_co = row[1]
id_dv = row[2]
id_line = row[3]
id_ln = row[4]
st_ln = row[5]
zst_ln = row[6]
id_lnlim = row[7]
limit1_lnlim = row[8]
limit2_lnlim = row[9]
limit3_lnlim = row[10]
record_lnlims.append(record_lnlim)
id_cos.append(id_co)
id_dvs.append(id_dv)
id_lines.append(id_line)
id_lns.append(id_ln)
st_lns.append(st_ln)
zst_lns.append(zst_ln)
id_lnlims.append(id_lnlim)
limit1_lnlims.append(limit1_lnlim)
limit2_lnlims.append(limit2_lnlim)
limit3_lnlims.append(limit3_lnlim)
#Reformatting the user input from GOULDS-PRINCETON to 'GOULDS' and 'PRINCETON'
input_split = user_input.split('-', 1)
st_ln1 = input_split[0]
zst_ln1 = input_split[1]
st_ln2 = st_ln1.upper()
zst_ln2 = zst_ln1.upper()
st_ln3 = "'" + str(st_ln2) + "'"
zst_ln3 = "'" + str(zst_ln2) + "'"
#Receiving analog values from user
print ('\n\t Found! \n')
print ('Enter the Specified Emergency Rating (A) for 110% for 7 minutes: ')
limit1_input = input()
print ('Enter the Specified Emergency Rating (A) for 120% for 7 minutes: ')
limit2_input = input()
print ('Enter the Specified Emergency Rating (A) for 130% for 5 minutes: ')
limit3_input = input()
Whenever I print the row_index it prints the initialized value of 0.
i = 0
row_index = 0
for i in range(len(st_lns)):
if st_ln3 == st_lns[i] and zst_ln3 == zst_lns[i]:
row_index = i
print(row_index)
limit1_input = limit1_lnlims[row_index]
limit2_input = limit2_lnlims[row_index]
limit3_input = limit3_lnlims[row_index]
csv_list = []
csv_list.append(record_lnlims)
csv_list.append(id_cos)
csv_list.append(id_dvs)
csv_list.append(id_lines)
csv_list.append(st_lns)
csv_list.append(zst_lns)
csv_list.append(id_lnlims)
csv_list.append(limit1_lnlims)
csv_list.append(limit2_lnlims)
csv_list.append(limit3_lnlims)
#Editing the csv file copy to implement new analog values
with open('lnlim_import_c.csv', 'w') as edit:
for x in zip(csv_list):
edit.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\n".format(x))

Python if statement - exclude "--"

I'm reading data from a netcdf file, I've noticed some fields are written to my file as "--".
When I print the content variable to the console its value is:
[-33.939999, 151.03918, masked, masked, masked, masked, masked, masked, masked]
I've tried the code block below to check that the row doesn't contain masked or "--" before writing its values. It's not working!
How do I exclude or check for those values?
Tried:
if "masked" not in content:
outputwriter.writerow(content)
print content
UPDATE
with open(r'C:/output.csv', 'wb') as csvFile:
outputwriter = csv.writer(csvFile, delimiter=',')
for date_val in date_strings:
header.append(date_val)
outputwriter.writerow(header)
for lat_index, lat in enumerate(lats):
for lon_index, lon in enumerate(lons):
content = [lat,lon]
for time_index, time in enumerate(times[:]):
data = value[time_index,lat_index,lon_index]
content.append(data)
#outputwriter.writerow(content)
temp = content
contentVal = (set(temp)-set(exclude))
for item in contentVal:
outputwriter.writerow(item)
print item
UPDATE 2
from netCDF4 import Dataset, num2date
import csv
filename = "C:/netcdf.nc"
nc = Dataset(filename, 'r', Format='NETCDF4')
lats = nc.variables['latitude'][:]
lons = nc.variables['longitude'][:]
sfc = nc.variables['Min_SFC'][:]
times = nc.variables['time']
dates = num2date(times[:],times.units)
date_strings = [date.strftime('%d-%m-%Y') for date in dates]
header = ['Latitude', 'Longitude']
exclude = ['masked','--']
with open(r'C:/output.csv', 'wb') as csvFile:
outputwriter = csv.writer(csvFile, delimiter=',')
for date_val in date_strings:
header.append(date_val)
outputwriter.writerow(header)
for lat_index, lat in enumerate(lats):
for lon_index, lon in enumerate(lons):
content = [lat,lon]
for time_index, time in enumerate(times[:]):
data = sfc[time_index,lat_index,lon_index]
content.append(data)
contentVal = (set(content)-set(exclude))
print "Content Val"
print contentVal
print "Content"
print content
outputwriter.writerow(contentVal)
You need to use a for loop.
# A value is kept only when it differs from BOTH markers. With "or" the test
# is always true, because no value can equal "masked" and "--" at once.
for i in content:
    if i != "masked" and i != "--":
        # writerow() expects a sequence, so wrap the single value in a list.
        outputwriter.writerow([i])
        print(i)
I've never used outputwriter, although if your items are in a list, then it's possible to make another list to exclude values:
l = [-33.939999, 151.03918, '--', 'masked', 'masked']
x = ['masked','--']
content = (set(l)-set(x))
for item in content:
outputwriter.writerow(item)
print item
Output:
151.03918
-33.939999

How to extract column and row in csv using python

I have this input in a file.csv
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
I wanted to write a simple program to find the city with the lowest rainfall which is Missouri in this case. How can I do that using Python csv reader?
I can try extract the items but unfortunately the first row of the file has to be there.
I wanted to have something like count[Missouri]=300
count[Amsterdam]=1212 etc.. so that I can do a minimum and reference back to print the city.
Please advise. Thanks.
import csv
def main():
with open('file.csv', 'rb') as inf:
data = [(int(row['rainfall']), row['']) for row in csv.DictReader(inf)]
data.sort()
print data[0]
if __name__=="__main__":
main()
returns
(300, 'Missouri')
One way to do this would be to use the csv module's DictReader class to write a function to extract the column of data. DictReader will take care of handling the first row of field names automatically. The built-in min() function can then be used to determine the item with the smallest value in the column.
import csv
def csv_extract_col(csvinput, colname, key):
    """Build a dictionary from two named columns of a CSV stream.

    csvinput: iterable of CSV text lines (for example an open file); its
        first line supplies the field names
    colname: name of the column whose cells become the dictionary values
    key: name of the column whose cells become the dictionary keys
    """
    records = csv.DictReader(csvinput)
    # Rows are consumed in file order, so a repeated key keeps its last value.
    return {record[key]: record[colname] for record in records}
if __name__=='__main__':
import StringIO
csvdata = """\
"","min","max","rainfall","days_clear" # field name row
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
csvfile = StringIO.StringIO(csvdata)
rainfall = csv_extract_col(csvfile, 'rainfall', '')
print rainfall
# {'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
print min(rainfall.iteritems(), key=lambda r: float(r[1]))
# ('Missouri', '300')
import StringIO
import csv
example = """"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"LA",10,20,1000,54
"""
data_in = StringIO.StringIO(example)
#data_in = open('mycsvdata.csv')
def _to_int_if_numeric(text):
    """Return text as an int when it is a plain integer, else unchanged."""
    if text.lstrip('-').isdigit():
        try:
            return int(text)
        except ValueError:
            # The digit check also lets through cells such as '--5' or '²'
            # that int() rejects; keep those as strings instead of crashing.
            return text
    return text


def read_data(data_in):
    """Read a CSV stream into a dict of per-row dicts keyed by first column.

    data_in: iterable of CSV text lines (an open file or StringIO). The
        first non-empty row is taken as the header.

    Returns a dict mapping each row's first cell to a dict of the remaining
    header names and that row's remaining cells. Cells that look like
    integers are converted to int; everything else stays a string.
    """
    reader = csv.reader(data_in)
    cols = []
    results = {}
    for row in reader:
        if not cols:
            # Header not seen yet: this row provides the column names.
            cols = row
            continue
        if not row:
            # Blank lines yield an empty list and have no key cell; skip them.
            continue
        row = [_to_int_if_numeric(x) for x in row]
        results[row[0]] = dict(zip(cols[1:], row[1:]))
    return results
data = read_data(data_in)
min(data.items(),key=lambda x: x[1].get('rainfall'))
Returns
('Missouri', {'max': 10, 'days_clear': 23, 'rainfall': 300, 'min': -2})
To read from a file, you need to remove all code that deals with a string:
reader = csv.reader(open('file.csv', 'rb'))
rainfall = csv_extract_col(reader, 'rainfall', '')
Update: Sorry, it needs a bit more work than that. The first arg of csv_extract_col will be used as the first arg of csv.DictReader so (in this case) it should be an open file object, and should never be a csv.reader instance. See below:
import csv
### def csv_extract_col(csvinput, colname, key):
### exactly as provided by #martineau
if __name__ == '__main__':
import sys
filename, data_col_name, key_col_name = sys.argv[1:4]
input_file_object = open(filename, 'rb')
result_dict = csv_extract_col(input_file_object, data_col_name, key_col_name)
print result_dict
print min(result_dict.iteritems(), key=lambda r: float(r[1]))
Results:
command-prompt>\python27\python joj_csv.py joj.csv rainfall ""
{'Amsterdam': '1212', 'LA': '1000', 'Missouri': '300'}
('Missouri', '300')
command-prompt>\python27\python joj_csv.py joj.csv days_clear ""
{'Amsterdam': '34', 'LA': '54', 'Missouri': '23'}
('Missouri', '23')
Update 2 in response to comment """there must be something i missed out.. i tried.. [what looks like #martineau's function] with the above main function you define. Then in my shell, i define python rainfall "". But it gives me KeyError: 'rainfall'"""
Two possibilities:
(1) You made a mistake patching the pieces of source code together. Check your work.
(2) Your file doesn't have the expected heading row contents. Try some debugging e.g. change #martineau's code so that you can insert a print statement etc. to show what the csv.DictReader thinks about your heading row:
reader = csv.DictReader(csvinput)
print "fieldnames", reader.fieldnames
assert colname in reader.fieldnames
assert key in reader.fieldnames
for row in reader:
If you are still stuck, show us ALL of your code plus the full traceback and error message -- either edit your question or put it up on pastebin or dropbox; DON'T put it into a comment!!
My code for cases in which there are several cities having the same minimum or several cities having the same maximum:
import csv
def minmax_col(filename,key,colname):
    """Find the smallest and largest value of a column and who holds them.

    filename: path of a CSV file whose first row is a header; text cells
        must be quoted, because unquoted cells are parsed as floats
    key: name of the column identifying each row (for example the city)
    colname: name of the numeric column to analyse

    Returns (key, (maxi, limax), (mini, limin)), where limax and limin list
    the key-column values of every row tied for the maximum and minimum.
    For a file with no data rows the extremes stay at -inf / +inf and both
    lists are empty.
    """
    mini = float('inf')
    maxi = float('-inf')
    # Two separate lists: a chained "limin = limax = []" would make both
    # names refer to the same list object.
    limin = []
    limax = []
    # Text mode, because csv.DictReader cannot parse a file opened with 'rb'
    # on Python 3.
    with open(filename, 'r') as csvfile:
        rid = csv.DictReader(csvfile,
                             fieldnames=None,
                             quoting=csv.QUOTE_NONNUMERIC)
        for row in rid:
            value = row[colname]
            if value == maxi:
                limax.append(row[key])
            elif value > maxi:
                # New maximum: restart the list of tied rows.
                maxi = value
                limax = [row[key]]
            if value == mini:
                limin.append(row[key])
            elif value < mini:
                # New minimum: restart the list of tied rows.
                mini = value
                limin = [row[key]]
    return (key,(maxi,limax),(mini,limin))
key = 'rainfall'
city,(Ma,liMa),(mi,limi) = minmax_col('filename.csv','',key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==',Ma,' cities :',', '.join(liMa)
print 'minimum==',mi,' cities :',', '.join(limi)
print
key = 'min'
city,(Ma,liMa),(mi,limi) = minmax_col('filename.csv','',key)
print 'Cities analysed on ' + repr(key) + ' parameter :'
print 'maximum==',Ma,' cities :',', '.join(liMa)
print 'minimum==',mi,' cities :',', '.join(limi)
On a file like that:
"","min","max","rainfall","days_clear"
"Missouri",-2,10,300,23
"Amsterdam",-3,5,1212,34
"Oslo",-2,8,800,12
"LA",10,20,1000,54
"Kologoro",28,45,1212,1
the result is
Cities analysed according the 'rainfall' parameter :
maximum== 1212.0 cities : Amsterdam, Kologoro
minimum== 300.0 cities : Missouri
Cities analysed according the 'min' parameter :
maximum== 28.0 cities : Kologoro
minimum== -3.0 cities : Amsterdam

Categories