Convert text file into csv format - python

The original file format is like this
ID DC_trip
AC A9999
SY DC,Foggy_bottom,22201,H_St.
SY DC,Smithsonian,12345,14th_St.
//
ID ...
AC ...
SY ...
SY ...
SY ...
I want to convert it to .csv file format and transform it into
DC_trip,A9999,DC,Foggy_bottom,22201,H_St.
DC_trip,A9999,DC,Smithsonian,12345,14th_St.
.
.
.
I tried to use if statement and elif.....
# NOTE(review): assuming `lines` is a str, find() returns -1 (truthy) when the
# text is absent and 0 (falsy) when it sits at the very start, so these tests
# do not mean "the line contains ID/SY".
if lines.find('ID'):
# The slice is evaluated but its result is neither stored nor printed.
lines[5:]
elif lines.find('SY'):
lines[5:]
If I use this way, each time I can only get one value.
Could someone give me some recommendation?
Thank you

Assuming the data in the original file is tab separated, you can use the csv module, and do this:
import csv

data = []
# Extract the second column (the value after the ID/AC/SY tag) of every
# row of the input file and store it in data.
with open('input', newline='') as in_file:
    csv_reader = csv.reader(in_file, delimiter='\t')
    for row in csv_reader:
        # Separator lines such as '//' and blank lines have no second
        # field; skip them instead of raising IndexError.
        if len(row) > 1:
            data.append(row[1])
# The first two values in data (the ID and AC values) form the prefix
# for every line of the output file.
# NOTE: this treats the whole file as a single record; the ID/AC values of
# any later record would be written out as if they were SY values.
prefix = ','.join(data[:2])
# Prepend the prefix to each of the remaining values
# and write it out to the output file ('w': the file is created/overwritten).
with open('output', 'w') as op_file:
    for item in data[2:]:
        op_file.write('{},{}\n'.format(prefix, item))
If the data in the original file is delimited by space, you would replace the first part with:
data = []
# Same input file as the tab-separated version, but split on any whitespace.
with open('input') as in_file:
    for line in in_file:
        fields = line.split()
        # '//' separators and blank lines have no second field; skipping
        # them here avoids an IndexError.
        if len(fields) > 1:
            data.append(fields[1])

Related

How to copy a given column of several text files into a csv, each column from each file in one csv column

I'm quite an amateur regarding python and I'm stuck.
So I managed to write a script to extract the second column of floats of a list of text files, from the line 5025 to the end, and create a list containing all these columns to export them into a csv.
My problem is that in the csv, all the columns from each file are pasted in one single column in the csv. So, what I wanted is that each column of each file in my list to be pasted in a different column of the csv (if I have 4 files to process, I would like to have 4 columns in the csv, one per file).
So here is what I have now:
#!/usr/bin/python3
import numpy as np
# Return the lines of file_path that come after the first 5025, as raw strings.
def read_csv(file_path):
with open(file_path, "r") as f:
lines = f.readlines()
return lines[5025:len(lines)] #ignoring 5025 first lines
# Parse the second whitespace-separated field of each remaining line as a float.
def extract_second_column(file_path): #returns second column
lines = read_csv(file_path)
second_col = []
for elem in lines:
elem = elem.split()
second_col.append(float(elem[1]))
return second_col
# Build a list holding one inner list of floats per input file, in the order
# the paths are given (so each file is one *row* of the result).
def combined_array(file_path_list): #combines all columns to one array
all_values = []
for file_path in file_path_list:
col_data = extract_second_column(file_path)
all_values.append(col_data)
return all_values
# NOTE(review): data has one row per file, and delimiter="\n" separates the
# values inside each row with newlines, so every value ends up on its own
# line, i.e. in a single column of the output file.
def write_csv(data, csv_name ): #writes the array to a csv file
np.savetxt(csv_name, data, delimiter="\n")
#Now the magic happens:
file_name_list = ["GLN-46_Coul.xvg","GLN-46_LJ.xvg","GLU-102_Coul.xvg","GLU-102_LJ.xvg"]
data = combined_array(file_name_list) #array containing all columns
write_csv(data, "ENERGIES.csv") #writing to csv file
I will appreciate whatever suggestion! I'm aware that the code looks ugly but what I need right now is something that works.

Reading CSV file from stdin in Python and modifying it

I need to read a CSV file from stdin and output only the rows whose value in the specified column equals the specified word. My input is like this:
2
Kashiwa
Name,Campus,LabName
Shinichi MORISHITA,Kashiwa,Laboratory of Omics
Kenta Naai,Shirogane,Laboratory of Functional Analysis in Silico
Kiyoshi ASAI,Kashiwa,Laboratory of Genome Informatics
Yukihide Tomari,Yayoi,Laboratory of RNA Function
My output should be like this:
Name,Campus,LabName
Shinichi MORISHITA,Kashiwa,Laboratory of Omics
Kiyoshi ASAI,Kashiwa,Laboratory of Genome Informatics
I need to sort out the people whose values in column#2 == Kashiwa and not output first 2 lines of stdin in stdout.
So far I just tried to read from stdin into csv but I am getting each row as a list of strings (as expected from csv documentation). Can I change this?
#!usr/bin/env python3
import sys
import csv
# Read every line of stdin into a list; each entry keeps its trailing newline.
data = sys.stdin.readlines()
# csv.reader accepts any iterable of lines and yields each row as a list of
# strings, which is why print() below shows Python list syntax.
for line in csv.reader(data):
print(line)
Output:
['2']
['Kashiwa']
['Name', 'Campus', 'LabName']
['Shinichi MORISHITA', 'Kashiwa', 'Laboratory of Omics']
['Kenta Naai', 'Shirogane', 'Laboratory of Functional Analysis in
Silico']
['Kiyoshi ASAI', 'Kashiwa', 'Laboratory of Genome Informatics']
['Yukihide Tomari', 'Yayoi', 'Laboratory of RNA Function']
Can someone give me some advice on reading stdin into CSV and manipulating the data later (outputting only needed values of columns, swapping the columns, etc.,)?
#!/usr/bin/env python3
import csv
import sys


def filter_rows(lines):
    """Return the header row followed by the rows that match the request.

    ``lines`` is an iterable of text lines laid out as:

    1. the 1-based number of the column to test,
    2. the word that column must equal,
    3. the CSV header row,
    4. the CSV data rows.

    Returns a list of rows (each a list of strings), header first.
    Raises ValueError if the column number is not a positive integer.
    """
    lines = iter(lines)
    column = int(next(lines)) - 1  # the input is 1-based, lists are 0-based
    if column < 0:
        raise ValueError('column number must be 1 or greater')
    # Strip the line ending: 'Kashiwa\n' would never equal a parsed cell.
    word = next(lines).strip()
    # Parse the header with csv too, so it is a list of cells, not a string.
    reader = csv.reader(lines)
    header = next(reader)
    matches = [
        row for row in reader
        if len(row) > column and row[column] == word  # short/blank rows never match
    ]
    return [header] + matches


if __name__ == '__main__':
    # csv.writer re-joins the cells with plain commas and quotes where needed.
    writer = csv.writer(sys.stdout, lineterminator='\n')
    writer.writerows(filter_rows(sys.stdin))
Use pandas to read and manage your data in a DataFrame:
import pandas as pd

# File location
infile = r'path/file'
# Load the file, skipping the two preamble lines so that the
# "Name,Campus,LabName" row is used as the header.
df = pd.read_csv(infile, skiprows=2)
# Keep only the rows whose Campus value is exactly 'Kashiwa'.
# Column labels are case-sensitive, so the label is 'Campus', not 'campus'.
df = df[df['Campus'] == 'Kashiwa']
You can perform all kinds of edits; for example, sort your DataFrame simply by:
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# It returns a new, sorted DataFrame, so assign the result back.
df = df.sort_values(by='your column')
Check the Pandas documentation for all the possibilities.
This is one approach.
Ex:
import csv

# Print the header row and every row whose second cell is 'Kashiwa'.
with open(filename) as source:
    rows = csv.reader(source)
    # Discard the two preamble lines that precede the header.
    for _ in range(2):
        next(rows)
    header = next(rows)
    print(header)
    kashiwa_rows = (record for record in rows if record[1] == 'Kashiwa')
    for record in kashiwa_rows:
        print(record)
Output:
['Name', 'Campus', 'LabName']
['Shinichi MORISHITA', 'Kashiwa', 'Laboratory of Omics']
['Kiyoshi ASAI', 'Kashiwa', 'Laboratory of Genome Informatics']
import csv
import sys

# csv.reader needs an iterable of lines. sys.stdin yields one line at a time,
# whereas a single string (e.g. from readline()) would be parsed per character.
reader = csv.reader(sys.stdin)
next(reader, None)  # skip the column-number line
next(reader, None)  # skip the search-word line
header = next(reader, None)

out = []
if header is not None:
    sys.stdout.write(','.join(header) + '\n')
    for line in reader:
        # The comparison is case-sensitive; the data spells it 'Kashiwa'.
        # Rows with fewer than two cells (e.g. blank lines) cannot match.
        if len(line) > 1 and line[1] == 'Kashiwa':
            # print(...) would do the same as writing to sys.stdout here.
            sys.stdout.write(','.join(line) + '\n')
            out.append(line)
# The same matches once more, this time as the repr of a list of rows.
sys.stdout.write(str(out) + '\n')

Delete data in csv file using python?

I have two csv files with a single column of data. How can I remove data in the second csv file in-place by comparing it with the data in the first csv file? For example:
import csv
# NOTE(review): "rb" mode and the print statement below are Python 2 idioms.
reader1 = csv.reader(open("file1.csv", "rb"))
# NOTE(review): the trailing `f` on the next line is not valid syntax,
# presumably a paste artifact.
reader = csv.reader(open("file2.csv", "rb"))f
for line in reader:
# `line in reader1` advances the reader1 iterator while it searches, so rows
# it has already passed are no longer available to later checks.
if line in reader1:
print line
if both files are just single columns, then you could use set to remove the differences. However, this presumes that the entries in each file do not need to be duplicated and their order doesn't really matter.
# Since each file is a single column, unroll each file into a flat list.
# csv.reader yields [] for blank lines, so skip those rather than index them.
dat1 = [x[0] for x in reader1 if x]
dat2 = [y[0] for y in reader if y]
# Take the set difference: the entries of file1 that do not appear in file2.
# (For file2 without file1's entries, use set(dat2).difference(dat1) instead.)
dat1_without_dat2 = set(dat1).difference(dat2)

Edit a piece of data inside a csv

I have a csv file looking like this
34512340,1
12395675,30
56756777,30
90673412,45
12568673,25
22593672,25
I want to be able to edit the data after the comma from python and then save the csv.
Does anybody know how I would be able to do this?
This bit of code below will write a new line, but not edit:
# Mode "a" opens the file for appending: everything written goes to the end
# of the file, so existing rows are never modified.
f = open("stockcontrol","a")
f.write(code)
Here is a sample, which adds 1 to the second column:
import csv

# Copy data.csv to output.csv, adding 1 to the second column of every row.
# Both files are opened in text mode with newline='' as the csv module
# expects on Python 3 (a binary 'wb' handle cannot accept the str rows).
with open('data.csv', newline='') as infile, \
        open('output.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        if not row:
            continue  # blank line: nothing to transform
        # Transform the second column, which is row[1]
        row[1] = int(row[1]) + 1
        writer.writerow(row)
Notes
The csv module correctly parses the CSV file--highly recommended
By default, each row is parsed as text, which is why I converted the value to an integer: int(row[1])
Update
If you really want to edit the file "in place", then use the fileinput module:
import fileinput

# With inplace=True, standard output is redirected into data.csv while the
# file is being read, so every print() below rewrites one line of the file.
# The context manager closes the input and restores stdout when done.
with fileinput.input('data.csv', inplace=True) as lines:
    for line in lines:
        fields = line.strip().split(',')
        # Lines without a second field (e.g. blank lines) are written back
        # unchanged rather than raising IndexError half-way through the file.
        if len(fields) > 1:
            fields[1] = str(int(fields[1]) + 1)  # "Update" second column
        print(','.join(fields))  # Write the line back to the file, in place
You can use python pandas to edit the column you want for e.g increase the column number by n:
import pandas

# The sample file has no header row, so name the columns explicitly;
# otherwise the first data row would be consumed as the header.
data_df = pandas.read_csv('input.csv', header=None,
                          names=['column1', 'column2'])
# Assign the result back to the column. Rebinding data_df itself would
# replace the whole table with just this one column.
# `n` is the amount to add.
data_df['column2'] = data_df['column2'] + n
print(data_df)
for adding 1 replace n by 1.

Use Python to split a CSV file with multiple headers

I have a CSV file that is being constantly appended. It has multiple headers and the only common thing among the headers is that the first column is always "NAME".
How do I split the single CSV file into separate CSV files, one for each header row?
here is a sample file:
"NAME","AGE","SEX","WEIGHT","CITY"
"Bob",20,"M",120,"New York"
"Peter",33,"M",220,"Toronto"
"Mary",43,"F",130,"Miami"
"NAME","COUNTRY","SPORT","NUMBER","SPORT","NUMBER"
"Larry","USA","Football",14,"Baseball",22
"Jenny","UK","Rugby",5,"Field Hockey",11
"Jacques","Canada","Hockey",19,"Volleyball",4
"NAME","DRINK","QTY"
"Jesse","Beer",6
"Wendel","Juice",1
"Angela","Milk",3
If the size of the csv files is not huge -- so all can be in memory at once -- just use read() to read the file into a string and then use a regex on this string:
import re

# Load the whole file into one string.
with open(ur_csv) as source:
    text = source.read()

# Each match runs from a line starting with "NAME" up to, but not including,
# the next such line (or the end of the text).
section_pattern = re.compile(r'(^"NAME".*?)(?=^"NAME"|\Z)', re.S | re.M)
number = 0
for match in section_pattern.finditer(text):
    number += 1
    # Sections are written to consecutively numbered files: 1.csv, 2.csv, ...
    with open('/path/{}.csv'.format(number), 'w') as fout:
        fout.write(match.group(1))
If the size of the file is a concern, you can use mmap to create something that looks like a big string but is not all in memory at the same time.
Then use the mmap string with a regex to separate the csv chunks like so:
import mmap
import re

# Open in binary mode: the memory map exposes the file's raw bytes.
with open(ur_csv, 'rb') as f:
    # Length 0 maps the whole file (mmap raises ValueError for an empty file).
    mf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    # The map is a bytes-like object, so the pattern must be a bytes pattern
    # and each matched chunk is bytes.
    chunks = re.finditer(rb'(^"NAME".*?)(?=^"NAME"|\Z)', mf, re.S | re.M)
    for i, chunk in enumerate(chunks, 1):
        # 'wb' because chunk.group(1) is bytes, not str.
        with open('/path/{}.csv'.format(i), 'wb') as fout:
            fout.write(chunk.group(1))
In either case, this will write all the chunks in files named 1.csv, 2.csv etc.
Copy the input to a new output file each time you see a header line. Something like this (not checked for errors):
# Split yourfile.csv into part1.csv, part2.csv, ...; a new part is started at
# every line that begins with "NAME".
source_lines = open("yourfile.csv","r").readlines()
current = None
next_part = 1
for text in source_lines:
    starts_section = text.startswith('"NAME"')
    # Finish the previous part before opening the next one.
    if starts_section and current is not None:
        current.close()
    if starts_section:
        current = open("part%d.csv" % (next_part,), "w")
        next_part += 1
    current.write(text)
current.close()
The above will break if the input does not begin with a header line or if the input is empty.
You can use the Python csv package to read your source file and write multiple csv files based on the rule that if element 0 in your row == "NAME", spawn off a new file. Something like this...
import csv


def split_sections(rows):
    """Group CSV rows into sections, one per header row.

    A new section starts at every row whose first cell is "NAME". Rows that
    appear before the first header are kept together in a leading section
    rather than dropped. Blank rows (empty lists) are skipped.

    ``rows`` is any iterable of rows (lists of cells), e.g. a csv.reader.
    Returns a list of sections, each a list of rows.
    """
    sections = []
    for row in rows:
        if not row:
            continue  # csv.reader yields [] for blank lines
        if row[0] == "NAME" or not sections:
            sections.append([])
        sections[-1].append(row)
    return sections


if __name__ == "__main__":
    outfile_name = "out_%d.csv"  # %d is replaced by the section number
    # newline='' lets the csv module handle line endings itself.
    with open('nameslist.csv', newline='') as csvfile:
        sections = split_sections(csv.reader(csvfile, delimiter=','))
    # Writing after the whole input is grouped ensures the last section is
    # written too and no empty file is produced for the first header.
    for out_num, section in enumerate(sections, 1):
        with open(outfile_name % out_num, 'w', newline='') as csvout:
            csv.writer(csvout).writerows(section)
P.S. I haven't actually tested this but that's the general concept
Given the other answers, the only modification that I would suggest would be to open using csv.DictReader. pseudo code would be like this. Assuming that the first line in the file is the first header
Note that this assumes that there is no blank line or other indicator between the entries so that a 'NAME' header occurs right after data. If there were a blank line between appended files then you could use that as an indicator to use infile.fieldnames() on the next row. If you need to handle the inputs as a list, then the previous answers are better.
# Pseudo code: copy rows into numbered output files, starting a new file each
# time a row's NAME field is itself 'NAME' (i.e. an embedded header row).
# NOTE(review): 'rb'/'wb' are the Python 2 modes for csv files.
ifile = open(filename, 'rb')
# NOTE(review): `cvs.Dictreader` is presumably meant to be csv.DictReader.
infile = cvs.Dictreader(ifile)
infields = infile.fieldnames
filenum = 1
ofile = open('outfile'+str(filenum), 'wb')
outfields = infields # This allows you to change the header field
outfile = csv.DictWriter(ofile, fieldnames=outfields, extrasaction='ignore')
# Write the header row by mapping every field name to itself.
outfile.writerow(dict((fn, fn) for fn in outfields))
for row in infile:
if row['NAME'] != 'NAME':
# NOTE(review): this branch holds only a comment; real code needs a statement
# here (for example writing the row to outfile).
#process this row here and do whatever is needed
else:
# NOTE(review): close() is not a builtin; file objects are closed with
# ofile.close().
close(ofile)
# build infields again from this row
# The `...` below is a placeholder for the remaining header names.
infields = [row["NAME"], ...] # This assumes you know the names & order
# Dict cannot be pulled as a list and keep the order that you want.
filenum += 1
ofile = open('outfile'+str(filenum), 'wb')
outfields = infields # This allows you to change the header field
outfile = csv.DictWriter(ofile, fieldnames=outfields, extrasaction='ignore')
outfile.writerow(dict((fn, fn) for fn in outfields))
# This is the end of the loop. All data has been read and processed
close(ofile)
close(ifile)
If the exact order of the new header does not matter except for the name in the first entry, then you can transfer the new list as follows:
# Rebuild the header list from a row that is itself a header:
# NAME goes first, followed by the values stored under every other key.
infields = [row['NAME']]
for k in row.keys():
    if k != 'NAME':
        infields.append(row[k])
This will create the new header with NAME in entry 0 but the others will not be in any particular order.

Categories