I try to find a way to add a function in my script to ignore or delete the first line of my CSV files. I know we can do that with pandas but it is possible without?
Many thanks for your help.
Here is my code -
from os import mkdir
from os.path import join, splitext, isdir
from glob import iglob
from csv import DictReader
from collections import defaultdict
from urllib.request import urlopen
from shutil import copyfileobj
csv_folder = r"/Users/folder/PycharmProjects/pythonProject/CSVfiles/"
glob_pattern = "*.csv"
for file in iglob(join(csv_folder, glob_pattern)):
with open(file) as csv_file:
reader = DictReader(csv_file)
save_folder, _ = splitext(file)
if not isdir(save_folder):
mkdir(save_folder)
title_counter = defaultdict(int)
for row in reader:
url = row["link"]
title = row["title"]
title_counter[title] += 1
_, ext = splitext(url)
save_filename = join(save_folder, f"{title}_{title_counter[title]}{ext}".replace('/', '-'))
print(f"'{save_filename}'")
with urlopen(url) as req, open(save_filename, "wb") as save_file:
copyfileobj(req, save_file)
Use the next() function to skip the first row of your CSV.
with open(file) as csv_file:
reader = DictReader(csv_file)
# skip first row
next(reader)
You could just read the raw text from the file as normal and then split the text by new line and delete the first line:
file = open(filename, 'r') # Open the file
content = file.read() # Read the file
lines = content.split("\n") # Split the text by the newline character
del lines[0] # Delete the first index from the resulting list, ie delete the first line.
Although this may take a long time for larger CSV files, so this may not be the best solution.
Or you could simply skip the first row in your for loop.
Instead of:
...
for row in reader:
...
Could you use:
...
for row_num, row in enumerate(list(reader)):
if row_num == 0:
continue
...
instead? I think that should skip the first row.
Related
import csv
import operator
import os
import sys
with open('test.csv', 'r') as cvsfile:
f = csv.reader(csvfile, delimiter=',')
for line in f:
line = [line:-1]
print (line)
=====
Tried the above
I am trying to strip the last line from a csv file before sorting. I am able to strip the header with a next() command but am unable to strip the footer.
To strip first/last line, you can convert the csv.reader to list and then use [1:-1] on it. For example:
import csv
with open('test.csv', 'r') as csvfile:
f = csv.reader(csvfile, delimiter=',')
for line in list(f)[1:-1]: # [1:-1] to strip first/last line from csv
print (line)
If you are dealing a with a big dataset, list(f)[1:-1] will load the entire set into memory at once and then make of copy of that list without the first and last element of the original list. To keep using the generator so that only one line is loaded into memory at a time,
import csv
import operator
import os
import sys
curr = None
next_line = None
with open('test.csv', 'r') as cvsfile:
f = csv.reader(csvfile, delimiter=',')
next(f) # remove first line
try:
curr = next(f)
next_line = next(f)
while True:
print(curr)
curr = next_line
next_line = next(f)
except StopIteration:
pass
I obtain multiple CSV files from API, in which I need to remove New Lines present in the CSV and join the record, consider the data provided below;
My Code to remove the New Line:
## Loading necessary libraries
import glob
import os
import shutil
import csv
## Assigning necessary path
source_path = "/home/Desktop/Space/"
dest_path = "/home/Desktop/Output/"
# Assigning file_read path to modify the copied CSV files
file_read_path = "/home/Desktop/Output/*.csv"
## Code to copy .csv files from one folder to another
for csv_file in glob.iglob(os.path.join(source_path, "*.csv"), recursive = True):
shutil.copy(csv_file, dest_path)
## Code to delete the second row in all .CSV files
for filename in glob.glob(file_read_path):
with open(filename, "r", encoding = 'ISO-8859-1') as file:
reader = list(csv.reader(file , delimiter = ","))
for i in range(0,len(reader)):
reader[i] = [row_space.replace("\n", "") for row_space in reader[i]]
with open(filename, "w") as output:
writer = csv.writer(output, delimiter = ",", dialect = 'unix')
for row in reader:
writer.writerow(row)
I actually copy the CSV files into a new folder and then use the above code to remove any new line present in the file.
You are fixing the csv File, because they have wrong \n the problem here is how
to know if the line is a part of the previous line or not. if all lines starts
with specifics words like in your example SV_a5d15EwfI8Zk1Zr or just SV_ You can do something like this:
import glob
# this is the FIX PART
# I have file ./data.csv(contains your example) Fixed version is in data.csv.FIXED
file_read_path = "./*.csv"
for filename in glob.glob(file_read_path):
with open(filename, "r", encoding='ISO-8859-1') as file, open(filename + '.FIXED', "w", encoding='ISO-8859-1') as target:
previous_line = ''
for line in file:
# check if it's a new line or a part of the previous line
if line.startswith('SV_'):
if previous_line:
target.write( previous_line + '\n')
previous_line = line[:-1] # remove \n
else:
# concatenate the broken part with previous_line
previous_line += line[:-1] # remove \n
# add last line
target.write(previous_line + '\n')
Ouput:
SV_a5d15EwfI8Zk1Zr;QID4;"<span style=""font-size:16px;""><strong>HOUR</strong> Interview completed at:</span>";HOUR;TE;SL;;;true;ValidNumber;0;23.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID6;"<span style=""font-size:16px;""><strong>MINUTE</strong> Interview completed:</span>";MIN;TE;SL;;;true;ValidNumber;0;59.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID8;Number of Refusals - no language<br />For <strong>Zero Refusals - no language</strong> use 0;REFUSAL1;TE;SL;;;true;ValidNumber;0;99.0;0.0;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID10;<strong>DAY OF WEEK:</strong>;WEEKDAY;MC;SACOL;TX;;true;;0;;;882;-873;0
SV_a5d15EwfI8Zk1Zr;QID45;"<span style=""font-size:16px;"">Using points from 0 to 10, how likely would you be recommend Gatwick Airport to a friend or colleague?</span><div> </div>";NPSCORE;MC;NPS;;;true;;0;;;882;-873;
EDITS:
Can Be Simpler using split too, this will fix the file it self:
import glob
# this is the FIX PART
# I have file //data.csv the fixed version in the same file
file_read_path = "./*.csv"
# assuming that all lines starts with SV_
STARTING_KEYWORD = 'SV_'
for filename in glob.glob(file_read_path):
with open(filename, "r", encoding='ISO-8859-1') as file:
lines = file.read().split(STARTING_KEYWORD)
with open(filename, 'w', encoding='ISO-8859-1') as file:
file.write('\n'.join(STARTING_KEYWORD + l.replace('\n', '') for l in lines if l))
Well I'm not sure on the restrictions you have. But if you can use the pandas library , this is simple.
import pandas as pd
data_set = pd.read_csv(data_file,skip_blank_lines=True)
data_set.to_csv(target_file,index=False)
This will create a CSV File will all new lines removed. You can save a lot of time with available libraries.
I have a folder that has over 15,000 csv files. They all have different number of column names.
Most files have its first row as a column name (attribute of data) like this :
Name Date Contact Email
a b c d
a2 b2 c2 d2
What I want to do is read first row of all files, store them as a list, and write that list as new csv file.
Here is what I have done so far :
import csv
import glob
list=[]
files=glob.glob('C:/example/*.csv')
for file in files :
f = open(file)
a=[file,f.readline()]
list.append(a)
with open('test.csv', 'w') as testfile:
csv_writer = csv.writer(testfile)
for i in list:
csv_writer.writerow(i)
When I try this code, result comes out like this :
[('C:/example\\example.csv', 'Name,Date,Contact,Email\n'), ('C:/example\\example2.csv', 'Address,Date,Name\n')]
Therefore in a made csv, all attributes of each file go into second column making it look like this (for some reason, there's a empty row between) :
New CSV file made
Moreover when going through files, I have encoutered another error :
UnicodeDecodeError: 'cp949' codec can't decode byte 0xed in position 6: illegal multibyte sequence
So I included this code in first line but it didn't work saying files are invalid.
import codecs
files=glob.glob('C:/example/*.csv')
fileObj = codecs.open( files, "r", "utf-8" )
I read answers on stackflow but I couldn't find one related to my problem. I appreciate your answers.
Ok, so
import csv
import glob
list=[]
files=glob.glob('C:/example/*.csv')
for file in files :
f = open(file)
a=[file,f.readline()]
list.append(a)
here you're opening the file and then creating a list with the column headers as a string(note that means they'll look like "Column1,Column2") and the file name. So [("Filename", "Column1, Column2")]
so you're going to need to split that on the ',' like:
for file in files :
f = open(file)
a=[file] + f.readline().split(',')
Now we have:
["filename", ("Column1", "Column2")]
So it's still going to print to the file wrong. We need to concatenate the lists.
a=[file] + f.readline().split(',')
So we get:
["filename", "Column1", "Column2"]
And you should be closing each file after you open it with f.close() or use a context manager inside your loop like:
for file in files :
with open(file) as f:
a=[file] + f.readline()
list.append(a)
Better solution and how I would write it:
import csv
import glob
files = glob.glob('mydir/*.csv')
lst = list()
for file in files:
with open(file) as f:
reader = csv.reader(f)
lst.append(next(reader))
try:
with open(files,'r'.encoding='utf8') as f:
# do things
except UnicodeError:
with open(files,'r'.encoding='utf8') as f:
# do things
a little bit of tidying, proper context managing, and using csv.reader:
import csv
import glob
list=[]
files=glob.glob('C:/example/*.csv')
with open('test.csv', 'w') as testfile:
csv_writer = csv.writer(testfile)
for file in files:
with open(file, 'r') as infile:
reader = csv.reader(infile)
headers = next(reader)
lst = [file] + headers
writer.writerow(lst)
this will write a new csv with one row per infile, each row being filename, column1, column2, ...
This is my code i am able to print each line but when blank line appears it prints ; because of CSV file format, so i want to skip when blank line appears
import csv
import time
ifile = open ("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
for line in csv.reader(ifile):
if not line:
empty_lines += 1
continue
print line
If you want to skip all whitespace lines, you should use this test: ' '.isspace().
Since you may want to do something more complicated than just printing the non-blank lines to the console(no need to use CSV module for that), here is an example that involves a DictReader:
#!/usr/bin/env python
# Tested with Python 2.7
# I prefer this style of importing - hides the csv module
# in case you do from this_file.py import * inside of __init__.py
import csv as _csv
# Real comments are more complicated ...
def is_comment(line):
return line.startswith('#')
# Kind of sily wrapper
def is_whitespace(line):
return line.isspace()
def iter_filtered(in_file, *filters):
for line in in_file:
if not any(fltr(line) for fltr in filters):
yield line
# A dis-advantage of this approach is that it requires storing rows in RAM
# However, the largest CSV files I worked with were all under 100 Mb
def read_and_filter_csv(csv_path, *filters):
with open(csv_path, 'rb') as fin:
iter_clean_lines = iter_filtered(fin, *filters)
reader = _csv.DictReader(iter_clean_lines, delimiter=';')
return [row for row in reader]
# Stores all processed lines in RAM
def main_v1(csv_path):
for row in read_and_filter_csv(csv_path, is_comment, is_whitespace):
print(row) # Or do something else with it
# Simpler, less refactored version, does not use with
def main_v2(csv_path):
try:
fin = open(csv_path, 'rb')
reader = _csv.DictReader((line for line in fin if not
line.startswith('#') and not line.isspace()),
delimiter=';')
for row in reader:
print(row) # Or do something else with it
finally:
fin.close()
if __name__ == '__main__':
csv_path = "C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
main_v1(csv_path)
print('\n'*3)
main_v2(csv_path)
Instead of
if not line:
This should work:
if not ''.join(line).strip():
my suggestion would be to just use the csv reader who can delimite the file into rows. Like this you can just check whether the row is empty and if so just continue.
import csv
with open('some.csv', 'r') as csvfile:
# the delimiter depends on how your CSV seperates values
csvReader = csv.reader(csvfile, delimiter = '\t')
for row in csvReader:
# check if row is empty
if not (row):
continue
You can always check for the number of comma separated values. It seems to be much more productive and efficient.
When reading the lines iteratively, as these are a list of comma separated values you would be getting a list object. So if there is no element (blank link), then we can make it skip.
with open(filename) as csv_file:
csv_reader = csv.reader(csv_file, delimiter=",")
for row in csv_reader:
if len(row) == 0:
continue
You can strip leading and trailing whitespace, and if the length is zero after that the line is empty.
import csv
with open('userlist.csv') as f:
reader = csv.reader(f)
user_header = next(reader) # Add this line if there the header is
user_list = [] # Create a new user list for input
for row in reader:
if any(row): # Pick up the non-blank row of list
print (row) # Just for verification
user_list.append(row) # Compose all the rest data into the list
This example just prints the data in array form while skipping the empty lines:
import csv
file = open("data.csv", "r")
data = csv.reader(file)
for line in data:
if line: print line
file.close()
I find it much clearer than the other provided examples.
import csv
ifile=csv.reader(open('C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv', 'rb'),delimiter=';')
for line in ifile:
if set(line).pop()=='':
pass
else:
for cell_value in line:
print cell_value
This is my code i am able to print each line but when blank line appears it prints ; because of CSV file format, so i want to skip when blank line appears
import csv
import time
ifile = open ("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
for line in csv.reader(ifile):
if not line:
empty_lines += 1
continue
print line
If you want to skip all whitespace lines, you should use this test: ' '.isspace().
Since you may want to do something more complicated than just printing the non-blank lines to the console(no need to use CSV module for that), here is an example that involves a DictReader:
#!/usr/bin/env python
# Tested with Python 2.7
# I prefer this style of importing - hides the csv module
# in case you do from this_file.py import * inside of __init__.py
import csv as _csv
# Real comments are more complicated ...
def is_comment(line):
return line.startswith('#')
# Kind of sily wrapper
def is_whitespace(line):
return line.isspace()
def iter_filtered(in_file, *filters):
for line in in_file:
if not any(fltr(line) for fltr in filters):
yield line
# A dis-advantage of this approach is that it requires storing rows in RAM
# However, the largest CSV files I worked with were all under 100 Mb
def read_and_filter_csv(csv_path, *filters):
with open(csv_path, 'rb') as fin:
iter_clean_lines = iter_filtered(fin, *filters)
reader = _csv.DictReader(iter_clean_lines, delimiter=';')
return [row for row in reader]
# Stores all processed lines in RAM
def main_v1(csv_path):
for row in read_and_filter_csv(csv_path, is_comment, is_whitespace):
print(row) # Or do something else with it
# Simpler, less refactored version, does not use with
def main_v2(csv_path):
try:
fin = open(csv_path, 'rb')
reader = _csv.DictReader((line for line in fin if not
line.startswith('#') and not line.isspace()),
delimiter=';')
for row in reader:
print(row) # Or do something else with it
finally:
fin.close()
if __name__ == '__main__':
csv_path = "C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
main_v1(csv_path)
print('\n'*3)
main_v2(csv_path)
Instead of
if not line:
This should work:
if not ''.join(line).strip():
my suggestion would be to just use the csv reader who can delimite the file into rows. Like this you can just check whether the row is empty and if so just continue.
import csv
with open('some.csv', 'r') as csvfile:
# the delimiter depends on how your CSV seperates values
csvReader = csv.reader(csvfile, delimiter = '\t')
for row in csvReader:
# check if row is empty
if not (row):
continue
You can always check for the number of comma separated values. It seems to be much more productive and efficient.
When reading the lines iteratively, as these are a list of comma separated values you would be getting a list object. So if there is no element (blank link), then we can make it skip.
with open(filename) as csv_file:
csv_reader = csv.reader(csv_file, delimiter=",")
for row in csv_reader:
if len(row) == 0:
continue
You can strip leading and trailing whitespace, and if the length is zero after that the line is empty.
import csv
with open('userlist.csv') as f:
reader = csv.reader(f)
user_header = next(reader) # Add this line if there the header is
user_list = [] # Create a new user list for input
for row in reader:
if any(row): # Pick up the non-blank row of list
print (row) # Just for verification
user_list.append(row) # Compose all the rest data into the list
This example just prints the data in array form while skipping the empty lines:
import csv
file = open("data.csv", "r")
data = csv.reader(file)
for line in data:
if line: print line
file.close()
I find it much clearer than the other provided examples.
import csv
ifile=csv.reader(open('C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv', 'rb'),delimiter=';')
for line in ifile:
if set(line).pop()=='':
pass
else:
for cell_value in line:
print cell_value