How to use elements in list by order - python

My goal is to change multiple csv files in a folder into JSON.
First, I needed to list my csv files:
for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        # check that the csv files are listed correctly
        print(os.path.join("C:/Users/folder_to_csv", filename))
With this, I was able to list the csv files in that folder.
Result:
C:/Users/folder_to_csv\file_1.csv
C:/Users/folder_to_csv\file_2.csv
C:/Users/folder_to_csv\file_3.csv
Then I wanted to convert all of the csv files in 'csvlist' to jsonObj; however, for some reason, my code only uses the first file (C:/Users/folder_to_csv\file_1.csv).
This is what I have tried so far:
import json
import csv
import requests
import threading
import os

for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        csvlist = os.path.join("C:/Users/folder_to_csv", filename)

data = {}

def main():
    # loop over the csv list so the code can read all csv files
    length = len(csvlist)
    for i in range(length):
        i += 1
        path = csvlist
        # convert csv to json
        with open(path, mode='r') as f:
            reader = csv.DictReader(f)
            processdata = [row for row in reader]
        dlist = processdata
        jsonObj = json.dumps(dlist)
        print(jsonObj)

main()

In the initial loop you keep redefining the csvlist variable. I suppose you want it to be a list? Then create an initial empty list and append to it instead of redefining it:
csvlist = []
...
csvlist.append(os.path.join("C:/Users/folder_to_csv", filename))
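Putting the pieces together, a minimal sketch of the corrected flow (reusing the folder path from the question) could look like this:
import csv
import json
import os

src = "C:/Users/folder_to_csv"

# build the full list of csv paths first
csvlist = []
for filename in os.listdir(src):
    if filename.endswith(".csv"):
        csvlist.append(os.path.join(src, filename))

# then convert each csv file to a JSON string
for path in csvlist:
    with open(path, mode='r', newline='') as f:
        dlist = [row for row in csv.DictReader(f)]
    print(json.dumps(dlist))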


Write a CSV from JSON, importing only given keys

I have JSONs reporting different values, and I want to import only some keys into a csv.
I have tried 2 approaches, but both give me some problems.
At first, I tried this:
import os, json
import glob
import csv

# Place your JSON data in a directory named 'data/'
src = "MYPATH"
data = []
json_pattern = os.path.join(src, '*.json')
# only json
files = glob.glob(json_pattern, recursive=True)

# Loop through files
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        try:
            data.append([
                json_file['name1'],
                json_file['name2'],
                json_file['name3'],
                json_file['name4'],
            ])
        except KeyError:
            continue

# Add headers
data.insert(0, ['title_1', 'title_2', 'title_3'])

# Export to CSV.
# Add the date to the file name to avoid overwriting it each time.
csv_filename = 'name.csv'
with open((src + csv_filename), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data)
In this way, unfortunately, if a key is not included, the code skips the file altogether, while I want it to skip only the key.
So I tried this instead:
import os, json
import glob
import csv

# Place your JSON data in a directory named 'data/'
src = "MY_PATH"
data = []
json_pattern = os.path.join(src, '*.json')
# Change the glob if you want to only look through files with specific names
files = glob.glob(json_pattern, recursive=True)

# Loop through files
col_name = ['name1', 'name2', 'name4']
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        for key in col_name:
            try:
                data.append([json_file[key]])
            except KeyError:
                continue

# Add headers
data.insert(0, ['title_1', 'title_2', 'title_3'])

# Export to CSV.
# Add the date to the file name to avoid overwriting it each time.
csv_filename = 'name.csv'
with open((src + csv_filename), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data)
But in this case, each value becomes a new row in the csv, while I want the values from each json in a single row.
I am not an expert and I really don't know how to combine these two.
Can someone help me out?
Thanks!
If I understand what you're trying to do correctly, why not just do:
# Loop through files
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        data.append([
            json_file.get('name1', ''),
            json_file.get('name2', ''),
            json_file.get('name3', ''),
            json_file.get('name4', '')
        ])
By using .get() you can specify the default value in case a key isn't found.
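Combined with the export step from the question, a sketch might look like this (a fourth header is added here so the header row matches the four columns):
data = []
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
    # one row per json file; missing keys fall back to ''
    data.append([json_file.get(key, '') for key in ['name1', 'name2', 'name3', 'name4']])

data.insert(0, ['title_1', 'title_2', 'title_3', 'title_4'])
with open(src + 'name.csv', 'w', newline='') as f:
    csv.writer(f).writerows(data)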

How to read n number of files in n variables and then add those variables in a list?

This is my code:
file_input1 = open('Amazon_Indi_Seller.py', 'r')
f1 = file_input1.read().lower()
file_input2 = open('Amazon_Prices.py', 'r')
f2 = file_input2.read().lower()
documents = [f1, f2]

import nltk, string, numpy
stemmer = nltk.stem.porter.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
Instead of reading 2 files, I want to read all the files in a directory, and read them individually so that later I can add those variables to a list named documents.
You can use the code below:
import os

def read_files(file):
    with open(file, 'r') as file_input:
        return file_input.read()

files = ['sample.py', 'Amazon_Indi_Seller.py']
data = list()
for file in files:
    data.append(read_files(file))
print(data)
The above code reads the files named in the list.
import os

def read_files(file):
    with open(file, 'r') as file_input:
        return file_input.read()

src = r'DIRECTORY PATH'
data = list()
for file in os.listdir(src):
    # join with src so the path is valid regardless of the working directory
    data.append(read_files(os.path.join(src, file)))
print(data)
And the above code will read all the files from the directory mentioned
You could collect all the file contents in a list, for example:
lst = []
for file in os.listdir():
    with open(file, "r") as file_input:
        lst.append(file_input.read())
One extra recommendation: in general it might be wise to store the contents of a file as a collection of its lines, for example by using file_input.readlines(), which returns a list of lines.
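For instance, a sketch of that readlines() variant:
lst = []
for file in os.listdir():
    with open(file, "r") as file_input:
        lst.append(file_input.readlines())  # one list of lines per file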
Create a list of all filenames, then iterate over the filename list and add each file's content to a dictionary.
from collections import defaultdict  # imported default dictionary

result = defaultdict()  # created empty default dictionary
filenames = ['name1.py', 'name2.py', 'name3.py']  # added filenames to a list

for name in filenames:  # iterate over filename list
    with open(name, 'r') as stream:  # open each file
        data = stream.readlines()  # read contents line by line (readlines returns a list of lines)
    result[name] = data  # set name as key and content as value in dictionary

print(result)
In this way you will have a dictionary with filenames as keys and their contents as values.
If the directory may include other directories with files which you want to read too, use os.walk.
Here is sample code from the official documentation:
import os
from os.path import join, getsize

for root, dirs, files in os.walk('python/Lib/email'):
    print(root, "consumes", end=" ")
    print(sum(getsize(join(root, name)) for name in files), end=" ")
    print("bytes in", len(files), "non-directory files")
    if 'CVS' in dirs:
        dirs.remove('CVS')  # don't visit CVS directories
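Applied to the original task, a sketch that reads every file under a directory tree into the documents list might look like this (the root path is illustrative):
import os

documents = []
for root, dirs, files in os.walk('python/Lib/email'):
    for name in files:
        with open(os.path.join(root, name), 'r') as f:
            documents.append(f.read().lower())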

Python CSV writer - writing columns in new csv file up to maximum number of fields in csv files

I have 200 CSV files in my folder.
What I am trying to do is read the first row of each file and write it into a new csv.
On top, I want to write [file, field1, field2, ... fieldn],
where n is the maximum number of fields.
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')
with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            max(hel)
            lst = [file] + headers
            csv_writer.writerow(lst)
It came out that the maximum number of fields across the 200 files is 255.
So at the top of the new csv file, I want to write file, field1, field2 ... field255.
How can I do this?
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')
with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            b = ['field{}'.format(i) for i in range(1, max(hel) + 1)]
            lst = [file] + headers
            csv_writer.writerow(lst)
Now b is a list that looks like this: ['field1', 'field2', ... 'field255'].
I need to insert 'file' before 'field1' and write that row at the top of the new csv file. Writing it after csv_writer.writerow(lst) gives me a csv file with 'field1', 'field2', ... on every other line. How can I fix this problem?
You first need to read all your input files to determine the maximum number of fields is 255. Then you need to construct a list of field names to write into the output file (just once, not in a loop):
['field{}'.format(i) for i in range(1, 256)]
You can pass that list to the csv module to write it.
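A minimal sketch of that two-pass approach, using the paths from the question:
import csv
import glob
import os

files = glob.glob('C:/dataset/*.csv')

# first pass: read every header row and find the maximum field count
headers_per_file = []
for path in files:
    with open(path, 'r', newline='') as infile:
        headers_per_file.append((path, next(csv.reader(infile))))
max_fields = max(len(headers) for _, headers in headers_per_file)

# second pass: write the field-name row once, then one row per input file
with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    csv_writer.writerow(['file'] + ['field{}'.format(i) for i in range(1, max_fields + 1)])
    for path, headers in headers_per_file:
        name = os.path.splitext(os.path.basename(path))[0]
        csv_writer.writerow([name] + headers)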
Read the field count and first line from each file before writing the output file.
import glob
from itertools import chain
import os
from os.path import splitext, basename

def first_line(filepath):
    with open(filepath) as f:
        return next(f)

def write_test_file(dest_file_path, source_path_name):
    source_paths = glob.glob(source_path_name)
    first_lines = list(map(first_line, source_paths))
    max_count = max(l.count(",") for l in first_lines)
    field_names = map("field{}".format, range(1, max_count + 2))
    header = ",".join(chain(["file"], field_names)) + os.linesep
    file_names = (splitext(basename(p))[0] for p in source_paths)
    content = chain([header], map(",".join, zip(file_names, first_lines)))
    with open(dest_file_path, 'w') as testfile:
        testfile.write("".join(content))

write_test_file('test.csv', 'C:/dataset/*.csv')

Move files listed in csv file?

I have been trying to use the following code to move files that are listed in a csv list, but at most it will copy the last file in the list, not the rest.
I keep hitting this wall with every example I have seen listed. What am I doing wrong?
My CSV list has lines like:
12355,12355.jpg
Here's my code:
import os
import shutil
import csv

keys = {}
with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for rowDict in reader:
        keys[rowDict[0]] = rowDict[1]
        print(rowDict)

dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'
for file in os.listdir(dir_src):
    src_file = os.path.join(dir_src, file)
    dst_file = os.path.join(dir_dst, file)
    if file in rowDict[1]:
        shutil.move(src_file, dst_file)
I think doing something like this will work (untested):
import os
import shutil
import csv

keys = {}
with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for rowDict in reader:
        keys[rowDict[0]] = rowDict[1]
        print(rowDict)  # if desired

valid_files = set(keys.values())  # file names found in csv

dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'
for file in os.listdir(dir_src):
    if file in valid_files:
        src_file = os.path.join(dir_src, file)
        dst_file = os.path.join(dir_dst, file)
        shutil.move(src_file, dst_file)
As an optimization, unless you need the keys dictionary for other processing, you could change the first part so it just creates the valid_files set variable used in the second for loop:
valid_files = set()  # empty set
with open('shuttle_image.csv', 'r') as f:
    for rowDict in csv.reader(f, delimiter=','):
        valid_files |= {rowDict[1]}  # add file name to set
        print(rowDict)  # if desired
The reason why only the last file could be copied (if it was copied at all) is that in this line:
if file in rowDict[1]:
you are referencing rowDict outside of the first for loop, so at that moment of execution it contains only the last value from that loop.
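A tiny illustration of that behavior:
for rowDict in [['1', 'a.jpg'], ['2', 'b.jpg']]:
    pass
print(rowDict)  # prints ['2', 'b.jpg']: the loop variable keeps its last value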
If I understand correctly what you are trying to do, you could try something like this (untested code):
import os
import shutil
import csv

dir_src = 'C:\\Users\\Willie\\Desktop\\Suppliers Dropship\\hunting\\'
dir_dst = 'C:\\image\\'

with open('shuttle_image.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',')
    for rowDict in reader:
        id, filename = rowDict
        src_file = os.path.join(dir_src, filename)
        if os.path.exists(src_file):
            shutil.move(src_file, dir_dst)
So instead of:
- constructing a dictionary with all the values in your CSV file,
- checking, for every file in your source directory, whether it is included in that dictionary (which is what I interpreted you were trying to do),
- and moving it if it is,
you could:
- for every file extracted from your CSV, check that it exists in your source directory;
- if it does, move it to the destination directory.
Is that what you were trying to do?
[And if the filename stays the same, you only need to specify the destination directory as the second argument of shutil.move(), as the example below shows.]
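For instance, with the paths and file name from the question:
# moving into a directory keeps the original file name,
# so this ends up at C:\image\12355.jpg
shutil.move(os.path.join(dir_src, '12355.jpg'), dir_dst)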

grab headers from multiple tsv/csv files

I have a list of tsv files from which I am looking to grab the column headers.
with open(os.path.abspath('reference/file.tsv'), 'rU') as file:
    reader = csv.reader(file)
    row1 = next(reader)
Currently, this snippet only reads 1 file, whereas I have a list of files that need to be parsed.
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
The names of the files are listed in files. How do I loop through the list of files and grab only the column headers for each file?
I tried this and it works:
import os
import csv

dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
for f in files:
    with open(dir_path + '/' + f, 'rU') as file:
        reader = csv.reader(file)
        row1 = next(reader)
        print(row1)
The files variable in your code holds the contents of the reference folder, meaning all files and subfolders of that folder. They are returned as a list of strings containing only the file or subfolder names, which means you'll have to prefix the path yourself.
Example:
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
for file in files:
    # Skip non-files (join with dir_path so the check works outside the cwd)
    if not os.path.isfile(os.path.join(dir_path, file)):
        continue
    with open(os.path.join(dir_path, file), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
An alternative using the pathlib module:
from pathlib import Path

for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
Wouldn't you be better off reading the first line of each of those files, appending them to a list, and then passing that list to csv.reader?
Example:
lines = []
for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        lines.append(f.readline())

reader = csv.reader(lines)
for row in reader:
    # whatever you want to do with the parsed lines
    print(row)
