Merge txt files in a folder and replacing characters in python - python

I have a doubt about how to do to continue the code, I need to take all files from a folder and merge them in 1 file with another text format.
Example:
The Input files are of text format like this:
"{'nr': '3173391045', 'data': '27/12/2017'}"
"{'nr': '2173391295', 'data': '05/01/2017'}"
"{'nr': '5173351035', 'data': '07/03/2017'}"
The Output files must be lines like this:
"3173391045","27/09/2017"
"2173391295","05/01/2017"
"5173351035","07/03/2017"
This is my working code, it's working for merge and taking out the blank lines
import glob2
import datetime
filenames=glob2.glob("*.txt")
with open(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w') as file:
for filename in filenames:
with open(filename,"r") as f:
file.write(f.read())
I'm trying something with .replace but is not working, I get syntax errors or blank files
filedata = filedata.replace("{", "") for line in filedata

If your input files had contained valid JSON strings, the correct way would have been to parse the lines as JSON and write them back in csv. As strings are enclosed in single quotes (') they are rejected by the json module of the Python library, and my advice is to use a regex to parse them. Code could become:
import glob2
import datetime
import csv
import re
# the regex to parse the line
rx = re.compile(r".*'nr'\s*:\s*'(\d+)'.*'data'\s*:\s*'([/\d]+)'")
filenames=glob2.glob("*.txt")
with open(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w') as file:
wr = csv.writer(file, quoting = csv.QUOTE_ALL)
for filename in filenames:
with open(filename,"r") as f:
for line in f: # process line by line
m = rx.match(line)
wr.writerow(m.groups())

With a few tweaks, the input data can be coerced into a form suitable for JSON parsing:
from datetime import datetime
import json
import glob2
import csv
with open(datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w', newline='') as f_output:
csv_output = csv.writer(f_output, quoting=csv.QUOTE_ALL)
for filename in glob2.glob('*.txt'):
with open(filename) as f_input:
for row in f_input:
row_dict = json.loads(row.strip('"\n').replace("'", '"'))
csv_output.writerow([row_dict['nr'], row_dict['data']])
Giving you:
"3173391045","27/12/2017"
"2173391295","05/01/2017"
"5173351035","07/03/2017"
Note, in Python 3.x the output file should be opened with newline=''. Without this, extra blank lines can appear in the output file.

using regex/replaces to parse those strings is dangerous. You could always stumble on a data containing the delimiter, the comma, etc..
And in this case, even if json cannot read those lines,ast.literal_eval can without any modification whatsoever:
import ast
with open("output.csv",newline="") as fw:
cw = csv.writer(fw)
for filename in filenames:
with open(filename) as f:
for line in f:
d = ast.literal_eval(line)
cw.writerow([d['nr'],d['data'])

Related

Python converts multiple JSON files in a folder directory to CSV

I have a lot of JSON files, I put them in my folder, I want to convert them to CSV format,
Should I use import glob? ? I am a novice, how can I modify my codeļ¼Œ
#-*-coding:utf-8-*-
import csv
import json
import sys
import codecs
def trans(path):
jsonData = codecs.open('C:/Users/jeri/Desktop/1', '*.json', 'r', 'utf-8')
# csvfile = open(path+'.csv', 'w')
# csvfile = open(path+'.csv', 'wb')
csvfile = open('C:/Users/jeri/Desktop/1.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csvfile, delimiter=',')
flag = True
for line in jsonData:
dic = json.loads(line)
if flag:
keys = list(dic.keys())
print(keys)
flag = False
writer.writerow(list(dic.values()))
jsonData.close()
csvfile.close()
if __name__ == '__main__':
path=str(sys.argv[0])
print(path)
trans(path)
Yes using glob would be a good way to iterate through the .json files in your folder! But glob doesn't have anything to do with the reading/writing of files. After importing glob, you can use it like this:
for curr_file in glob.glob("*.json"):
# Process each file here
I see that you've used the json module to read in your code snippet. I'd say the better way to go about it is to use pandas.
df = pd.read_json()
I say this because with the pandas library, you can simply convert from .json to .csv using
df.to_csv('file_name.csv')
Combining the three together, it would look like this:
for curr_file in glob.glob("*.json"):
# Process each file here
df = pd.read_json(curr_file)
df.to_csv('file_name.csv')
Also, note that if your json has nested objects, it can't be directly converted to csv, you'll have to settle the organization of data prior to the conversion.

Read a file from a folder and extract a specific key from the file and save as in CSV file

I'm new to Python and the task I am performing is to extract a specific key value from a list of .iris ( which contains the list of nested dictionary format) files in a specific directory.
I wanted to extract the specific value and save it as a new .csv file and repeat it for all other files.
Below is my sample of .iris file from which I should extract only for the these keys ('uid','enabled','login','name').
{"streamType":"user",
"uid":17182,
"enabled":true,
"login":"xyz",
"name":"abcdef",
"comment":"",
"authSms":"",
"email":"",
"phone":"",
"location":"",
"extraLdapOu":"",
"mand":997,
"global":{
"userAccount":"View",
"uid":"",
"retention":"No",
"enabled":"",
"messages":"Change"},
"grants":[{"mand":997,"role":1051,"passOnToSubMand":true}],
I am trying to convert the .iris file to .json and reading the files one by, but unfortunately, I am not getting the exact output as desired.
Please, could anyone help me?
My code (added from comments):
import os
import csv
path = ''
os.chdir(path)
# Read iris File
def read_iris_file(file_path):
with open(file_path, 'r') as f:
print(f.read())
# iterate through all files
for file in os.listdir():
# Check whether file is in iris format or not
if file.endswith(".iris"):
file_path = f"{path}\{file}"
# call read iris file function
print(read_iris_file(file_path))
Your files contain data in JSON format, so we can use built-in json module to parse it. To iterate over files with certain extension you can use pathlib.glob() with next pattern "*.iris". Then we can use csv.DictWriter() and pass "ignore" to extrasaction argument which will make DictWriter ignore keys which we don't need and write only those which we passed to fieldnames argument.
Code:
import csv
import json
from pathlib import Path
path = Path(r"path/to/folder")
keys = "uid", "enabled", "login", "name"
with open(path / "result.csv", "w", newline="") as out_f:
writer = csv.DictWriter(out_f, fieldnames=keys, extrasaction='ignore')
writer.writeheader()
for file in path.glob("*.iris"):
with open(file) as inp_f:
data = json.load(inp_f)
writer.writerow(data)
Try the below (the key point here is loading the iris file using ast)
import ast
fields = ('uid','enabled','login','name')
with open('my.iris') as f1:
data = ast.literal_eval(f1.read())
with open('my.csv','w') as f2:
f2.write(','.join(fields) + '\n')
f2.write(','.join(data[f] for f in fields) + '\n')
my.csv
uid,enabled,login,name
17182,true,xyz,abcdef

os.write() appends file instead of overwriting, but O_APPEND isn't used [duplicate]

I have the following code:
import re
#open the xml file for reading:
file = open('path/test.xml','r+')
#convert to string:
data = file.read()
file.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>",data))
file.close()
where I'd like to replace the old content that's in the file with the new content. However, when I execute my code, the file "test.xml" is appended, i.e. I have the old content follwed by the new "replaced" content. What can I do in order to delete the old stuff and only keep the new?
You need seek to the beginning of the file before writing and then use file.truncate() if you want to do inplace replace:
import re
myfile = "path/test.xml"
with open(myfile, "r+") as f:
data = f.read()
f.seek(0)
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
f.truncate()
The other way is to read the file then open it again with open(myfile, 'w'):
with open(myfile, "r") as f:
data = f.read()
with open(myfile, "w") as f:
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>", r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", data))
Neither truncate nor open(..., 'w') will change the inode number of the file (I tested twice, once with Ubuntu 12.04 NFS and once with ext4).
By the way, this is not really related to Python. The interpreter calls the corresponding low level API. The method truncate() works the same in the C programming language: See http://man7.org/linux/man-pages/man2/truncate.2.html
file='path/test.xml'
with open(file, 'w') as filetowrite:
filetowrite.write('new content')
Open the file in 'w' mode, you will be able to replace its current text save the file with new contents.
Using truncate(), the solution could be
import re
#open the xml file for reading:
with open('path/test.xml','r+') as f:
#convert to string:
data = f.read()
f.seek(0)
f.write(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>",data))
f.truncate()
import os#must import this library
if os.path.exists('TwitterDB.csv'):
os.remove('TwitterDB.csv') #this deletes the file
else:
print("The file does not exist")#add this to prevent errors
I had a similar problem, and instead of overwriting my existing file using the different 'modes', I just deleted the file before using it again, so that it would be as if I was appending to a new file on each run of my code.
See from How to Replace String in File works in a simple way and is an answer that works with replace
fin = open("data.txt", "rt")
fout = open("out.txt", "wt")
for line in fin:
fout.write(line.replace('pyton', 'python'))
fin.close()
fout.close()
in my case the following code did the trick
with open("output.json", "w+") as outfile: #using w+ mode to create file if it not exists. and overwrite the existing content
json.dump(result_plot, outfile)
Using python3 pathlib library:
import re
from pathlib import Path
import shutil
shutil.copy2("/tmp/test.xml", "/tmp/test.xml.bak") # create backup
filepath = Path("/tmp/test.xml")
content = filepath.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))
Similar method using different approach to backups:
from pathlib import Path
filepath = Path("/tmp/test.xml")
filepath.rename(filepath.with_suffix('.bak')) # different approach to backups
content = filepath.read_text()
filepath.write_text(re.sub(r"<string>ABC</string>(\s+)<string>(.*)</string>",r"<xyz>ABC</xyz>\1<xyz>\2</xyz>", content))

Merge columns of numbers of different lengths from separate text files into a single csv file

Hi I am fairly new to Python programming and cannot seem to overcome this issue.
I have a directory with 100 subfolders with a single text file in each (with no file extension) all named exactly the same. Each file contains a single column of numbers of different lengths.
I want to merge all the numbers of each file into a single csv file with the numbers from each file in separate columns.
So I should end up with a matrix of 100 columns of differing lengths of numbers where each column corresponds to a single file.
Example of files:
file1
1
15
23
22
10
file 2
3
55
22
I have this script:
# import modules
import glob
import csv
import sys
import itertools
inf = glob.glob("*/*-ambig")
for f in inf:
with open(f) as fin:
with open(sys.argv[1], 'w') as fout:
writer = csv.writer(fout, delimiter=',', quotechar='', quoting=csv.QUOTE_NONE)
headers = ('coverage', )
writer.writerow(headers)
for line in fin:
columns = line.split("\n") # split each column on new line
writer.writerow(itertools.izip_longest(*columns, fillvalue=['']))
However I am getting this error:
Traceback (most recent call last):
File "coverage_per_strain.py", line 21, in <module>
writer.writerow(itertools.izip_longest(*columns, fillvalue=['']))
_csv.Error: sequence expected
Does anyone have any idea what is wrong with my code? And can you see any other errors?
Thanks!
csv.writerow expects a sequence as an argument. itertools.izip_longest is returning an iterator. Hence the error message.
You should be able to fix the problem with:
writer.writerow(list(itertools.izip_longest(*columns, fillvalue=[''])))
Here's a solution that I wrote up before I realized you were using Python 2.7. This will only work as written in Python 3.3+, as it uses the very nifty contextlib.ExitStack context manager, which was only added in that version (I also use Python 3's map):
import glob
import csv
import sys
import contextlib
from itertools import zip_longest
in_filenames = glob.glob("*/*-ambig")
with contextlib.ExitStack() as stack:
in_files = [stack.enter_context(open(filename)) for filename in in_filenames]
out_file = stack.enter_context(open(sys.argv[1], "w", newlines=""))
writer = csv.writer(out_file, delimiter=',', quoting=csv.QUOTE_NONE)
writer.writerow(('coverage',)) # do you want the header repeated for each column?
writer.writerows(zip_longest(*(map(str.strip, f) for f in in_files), fillvalue=""))
Here's my attempt to port this back to Python 2. I've not tested this version. I use a try/finally pair to handle the closing of the files (and imap to handle the stripping without reading all of each file into memory up front):
import glob
import csv
import sys
from itertools import izip_longest, imap
in_filenames = glob.glob("*/*-ambig")
with open(sys.argv[1], "wb") as out_file:
in_files = []
try:
for filename in in_filenames:
in_files.append(open(filename))
writer = csv.writer(out_file, delimiter=',', quoting=csv.QUOTE_NONE)
writer.writerow(('coverage',))
writer.writerows(izip_longest(*[imap(str.strip, f) for f in in_files],
fillvalue=""))
finally:
for f in in_files:
try:
f.close()
except: # ignore exceptions here, or the later files might not get closed!
pass

Add file name as last column of CSV file

I have a Python script which modifies a CSV file to add the filename as the last column:
import sys
import glob
for filename in glob.glob(sys.argv[1]):
file = open(filename)
data = [line.rstrip() + "," + filename for line in file]
file.close()
file = open(filename, "w")
file.write("\n".join(data))
file.close()
Unfortunately, it also adds the filename to the header (first) row of the file. I would like the string "ID" added to the header instead. Can anybody suggest how I could do this?
Have a look at the official csv module.
Here are a few minor notes on your current code:
It's a bad idea to use file as a variable name, since that shadows the built-in type.
You can close the file objects automatically by using the with syntax.
Don't you want to add an extra column in the header line, called something like Filename, rather than just omitting a column in the first row?
If your filenames have commas (or, less probably, newlines) in them, you'll need to make sure that the filename is quoted - just appending it won't do.
That last consideration would incline me to use the csv module instead, which will deal with the quoting and unquoting for you. For example, you could try something like the following code:
import glob
import csv
import sys
for filename in glob.glob(sys.argv[1]):
data = []
with open(filename) as finput:
for i, row in enumerate(csv.reader(finput)):
to_append = "Filename" if i == 0 else filename
data.append(row+[to_append])
with open(filename,'wb') as foutput:
writer = csv.writer(foutput)
for row in data:
writer.writerow(row)
That may quote the data slightly differently from your input file, so you might want to play with the quoting options for csv.reader and csv.writer described in the documentation for the csv module.
As a further point, you might have good reasons for taking a glob as a parameter rather than just the files on the command line, but it's a bit surprising - you'll have to call your script as ./whatever.py '*.csv' rather than just ./whatever.py *.csv. Instead, you could just do:
for filename in sys.argv[1:]:
... and let the shell expand your glob before the script knows anything about it.
One last thing - the current approach you're taking is slightly dangerous, in that if anything fails when writing back to the same filename, you'll lose data. The standard way of avoiding this is to instead write to a temporary file, and, if that was successful, rename the temporary file over the original. So, you might rewrite the whole thing as:
import csv
import sys
import tempfile
import shutil
for filename in sys.argv[1:]:
tmp = tempfile.NamedTemporaryFile(delete=False)
with open(filename) as finput:
with open(tmp.name,'wb') as ftmp:
writer = csv.writer(ftmp)
for i, row in enumerate(csv.reader(finput)):
to_append = "Filename" if i == 0 else filename
writer.writerow(row+[to_append])
shutil.move(tmp.name,filename)
You can try:
data = [file.readline().rstrip() + ",id"]
data += [line.rstrip() + "," + filename for line in file]
You can try changing your code, but using the csv module is recommended. This should give you the result you want:
import sys
import glob
import csv
filename = glob.glob(sys.argv[1])[0]
yourfile = csv.reader(open(filename, 'rw'))
csv_output=[]
for row in yourfile:
if len(csv_output) != 0: # skip the header
row.append(filename)
csv_output.append(row)
yourfile = csv.writer(open(filename,'w'),delimiter=',')
yourfile.writerows(csv_output)
Use the CSV module that comes with Python.
import csv
import sys
def process_file(filename):
# Read the contents of the file into a list of lines.
f = open(filename, 'r')
contents = f.readlines()
f.close()
# Use a CSV reader to parse the contents.
reader = csv.reader(contents)
# Open the output and create a CSV writer for it.
f = open(filename, 'wb')
writer = csv.writer(f)
# Process the header.
header = reader.next()
header.append('ID')
writer.writerow(header)
# Process each row of the body.
for row in reader:
row.append(filename)
writer.writerow(row)
# Close the file and we're done.
f.close()
# Run the function on all command-line arguments. Note that this does no
# checking for things such as file existence or permissions.
map(process_file, sys.argv[1:])
You can run this as follows:
blair#blair-eeepc:~$ python csv_add_filename.py file1.csv file2.csv
you can use fileinput to do in place editing
import sys
import glob
import fileinput
for filename in glob.glob(sys.argv[1]):
for line in fileinput.FileInput(filename,inplace=1) :
if fileinput.lineno()==1:
print line.rstrip() + " ID"
else
print line.rstrip() + "," + filename

Categories