Parse all the XML files and convert them to one CSV file - python

I am trying to write code that searches a directory for all XML files, parses each one, and saves some data to a CSV file. I have 50-plus XML files in that directory. Whenever I run my code a CSV file is created, but it only contains the data of the last XML file. How can I write all of the XML files' data to one CSV file? Please help.
Here is my code:
from xml.dom.minidom import parse
import csv
import os

def writeToCSV(frelation):
    csvfile = open('data.csv', 'w')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})

for root, dirs, files in os.walk('data/frames'):
    for file in files:
        if file.endswith('.xml'):
            xmldoc = parse(os.path.join(root, file))
            frelation = xmldoc.getElementsByTagName("frameset")[0]
            relation = frelation.getElementsByTagName("predicate")
            writeToCSV(frelation)

You are overwriting the same file again and again in writeToCSV; maybe a little change as below:
def writeToCSV(frelation, file_id):
    csvfile = open('data' + str(file_id) + '.csv', 'w')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})

file_id = 1
for root, dirs, files in os.walk('data/frames'):
    for file in files:
        if file.endswith('.xml'):
            xmldoc = parse(os.path.join(root, file))
            frelation = xmldoc.getElementsByTagName("frameset")[0]
            relation = frelation.getElementsByTagName("predicate")
            writeToCSV(frelation, file_id)
            file_id += 1
If you want only one CSV file, you need to open the file in append mode; the a+ mode creates the file if it does not exist:
def writeToCSV(frelation):
    csvfile = open('data.csv', 'a+')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})
No changes are required in the rest of the code.
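One caveat with the append-mode version: writer.writeheader() runs once per XML file, so the header row is repeated in the output. A minimal sketch of an alternative (assuming the same frameset/predicate/sup layout as above) is to open the CSV once in the caller and pass the writer into the helper:

from xml.dom.minidom import parse
import csv
import os

def write_predicates(frelation, writer):
    # Write one row per <predicate>/<sup> pair using an already-open writer.
    for elem in frelation.getElementsByTagName("predicate"):
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            writer.writerow({'sub': sub, 'sup': elem1.attributes['name'].value})

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['sub', 'sup'])
    writer.writeheader()  # header is written exactly once
    for root, dirs, files in os.walk('data/frames'):
        for file in files:
            if file.endswith('.xml'):
                xmldoc = parse(os.path.join(root, file))
                frelation = xmldoc.getElementsByTagName("frameset")[0]
                write_predicates(frelation, writer)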

Write a CSV from JSON, importing only given keys

I have JSON files reporting different values, and I want to import only some keys into a CSV.
I have tried two approaches, but both give me some problems.
At first, I tried this:
import os, json
import glob
import csv

# Place your JSON data in a directory named 'data/'
src = "MYPATH"
data = []
json_pattern = os.path.join(src, '*.json')
# only json
files = glob.glob(json_pattern, recursive=True)

# Loop through files
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        try:
            data.append([
                json_file['name1'],
                json_file['name2'],
                json_file['name3'],
                json_file['name4'],
            ])
        except KeyError:
            continue

# Add headers
data.insert(0, ['title_1', 'title_2', 'title_3'])

# Export to CSV.
# Add the date to the file name to avoid overwriting it each time.
csv_filename = 'name.csv'
with open((src + csv_filename), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data)
This way, unfortunately, if a key is missing the code skips the file altogether, while I want it to skip only that key.
So I tried this instead:
import os, json
import glob
import csv

# Place your JSON data in a directory named 'data/'
src = "MY_PATH"
data = []
json_pattern = os.path.join(src, '*.json')
# Change the glob if you want to only look through files with specific names
files = glob.glob(json_pattern, recursive=True)

# Loop through files
col_name = ['name1', 'name2', 'name4']
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        for key in col_name:
            try:
                data.append([json_file[key]])
            except KeyError:
                continue

# Add headers
data.insert(0, ['title_1', 'title_2', 'title_3'])

# Export to CSV.
# Add the date to the file name to avoid overwriting it each time.
csv_filename = 'name.csv'
with open((src + csv_filename), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(data)
But in this case, each value becomes a new row in the CSV, while I want the values from each JSON in a single row.
I am not an expert and I really don't know how to combine these two.
Can someone help me out?
Thanks!
If I understand what you're trying to do correctly, why not just do
# Loop through files
for single_file in files:
    with open(single_file, 'r') as f:
        json_file = json.load(f)
        data.append([
            json_file.get('name1', ''),
            json_file.get('name2', ''),
            json_file.get('name3', ''),
            json_file.get('name4', '')
        ])
By using .get() you can specify the default value in case a key isn't found.
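Put together with the rest of the first script, that loop could look like the sketch below (the key names and the four matching header titles are placeholders; adjust them to your actual JSON):

import csv
import glob
import json
import os

src = "MYPATH"  # directory containing the JSON files
keys = ['name1', 'name2', 'name3', 'name4']  # placeholder key names

data = []
for single_file in glob.glob(os.path.join(src, '*.json')):
    with open(single_file, 'r') as f:
        json_file = json.load(f)
    # One row per file; missing keys become empty strings instead of skipping the file.
    data.append([json_file.get(key, '') for key in keys])

# One header title per key, so the header and the rows stay aligned.
data.insert(0, ['title_1', 'title_2', 'title_3', 'title_4'])

with open(os.path.join(src, 'name.csv'), 'w', newline='') as f:
    csv.writer(f).writerows(data)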

Looping through list of xml-files?

I'm trying to create a program that loops through a list of xml-files and extracts certain elements from the files:
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = 'C:\myfolder'
files = [f for f in listdir(mypath) if f.endswith('.xml')]

for file in files:
    tree = ET.parse(file)
    root = tree.getroot()
    ns = {namespaces}

def myfunction():
    if 'something' in root.tag:
        filename = path.splitext(file)[0]
        var1 = root.find('./element1', ns)
        var2 = root.find('./element2', ns)
        row = [
            var1.text,
            var2.text
        ]
        return row
The above code returns a list with var1, var2 (from the last file) if I call the function. The reason I have defined this function is that there are different types of xml-files with different element names, so I'm going to create a function for each file type.
Now I want to create a table where the output from each file is a row, i.e.:
filename1, var1, var2
filename2, var1, var2
etc.
And ideally export the table to a CSV file. How do I go about that?
The easiest way to write a CSV file is with the standard csv module.
Writing a CSV file is as simple as opening the file and using the default writer:
import csv
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = 'C:\myfolder'
files = [f for f in listdir(mypath) if f.endswith('.xml')]

for file in files:
    tree = ET.parse(file)
    root = tree.getroot()
    ns = {namespaces}

def myfunction():
    if 'something' in root.tag:
        filename = path.splitext(file)[0]
        var1 = root.find('./element1', ns)
        var2 = root.find('./element2', ns)
        row = [
            var1.text,
            var2.text
        ]
        # Open the file and store the data
        with open('outfile.csv', 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(row)
        return row
Note that csv_writer.writerow receives a list as its parameter.
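To get the filename, var1, var2 table the question asks for, a minimal sketch (assuming the same element names and namespace mapping as above; extract_row is a hypothetical per-file-type helper) is to pass each parsed file into the function, collect the returned rows, and write them all at once:

import csv
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = 'C:\\myfolder'
ns = {}  # fill in your namespace mapping

def extract_row(file, root):
    # One function per file type; this one handles the 'something' files.
    if 'something' in root.tag:
        var1 = root.find('./element1', ns)
        var2 = root.find('./element2', ns)
        return [path.splitext(file)[0], var1.text, var2.text]
    return None

rows = []
for file in [f for f in listdir(mypath) if f.endswith('.xml')]:
    root = ET.parse(path.join(mypath, file)).getroot()
    row = extract_row(file, root)
    if row is not None:
        rows.append(row)

with open('outfile.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(rows)  # one filename, var1, var2 row per file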

Python CSV writer - writing columns in new csv file up to maximum number of fields in csv files

I have 200 CSV files in my folder.
What I am trying to do is read the first row of each file and write it to a new CSV.
On top, I want to write [file, field1, field2, ... fieldn],
where n is the maximum number of fields.
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')

with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            max(hel)
            lst = [file] + headers
            csv_writer.writerow(lst)
It turned out that the maximum number of fields across the 200 files is 255.
So at the top of the new CSV file, I want to write file, field1, field2 ... field255.
How can I do this?
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')

with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            b = ['field{}'.format(i) for i in range(1, max(hel) + 1)]
            lst = [file] + headers
            csv_writer.writerow(lst)
Now b is a list that looks like this: ['field1', 'field2', ... 'field255'].
I need to insert 'file' before 'field1' and write that row at the top of the new CSV file. Writing code after csv_writer.writerow(lst) gives me a CSV file with 'field1', 'field2', ... on every other line. How can I fix this problem?
You first need to read all your input files to determine that the maximum number of fields is 255. Then you need to construct a list of field names to write into the output file (just once, not in a loop):
['field{}'.format(i) for i in range(1, 256)]
You can pass that list to the csv module to write it.
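A minimal two-pass sketch of that idea (the paths and field-name pattern follow the question; splitext/basename replace the strip('.csv') trick, which can also strip matching leading characters):

import csv
import glob
import os

files = glob.glob('C:/dataset/*.csv')

# First pass: find the maximum number of fields across all first rows.
max_fields = 0
for file in files:
    with open(file, 'r', newline='') as infile:
        headers = next(csv.reader(infile))
        max_fields = max(max_fields, len(headers))

with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    # Header row written exactly once: file, field1 ... fieldN
    csv_writer.writerow(['file'] + ['field{}'.format(i) for i in range(1, max_fields + 1)])
    # Second pass: one row per input file.
    for file in files:
        with open(file, 'r', newline='') as infile:
            headers = next(csv.reader(infile))
        name = os.path.splitext(os.path.basename(file))[0]
        csv_writer.writerow([name] + headers)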
Read the field count and first line from each file before writing the output file.
import glob
from itertools import chain
import os
from os.path import splitext, basename

def first_line(filepath):
    with open(filepath) as f:
        return next(f)

def write_test_file(dest_file_path, source_path_name):
    source_paths = glob.glob(source_path_name)
    first_lines = list(map(first_line, source_paths))
    max_count = max(l.count(",") for l in first_lines)
    field_names = map("field{}".format, range(1, max_count + 2))
    header = ",".join(chain(["file"], field_names)) + os.linesep
    file_names = (splitext(basename(p))[0] for p in source_paths)
    content = chain([header], map(",".join, zip(file_names, first_lines)))
    with open(dest_file_path, 'w') as testfile:
        testfile.write("".join(content))

write_test_file('test.csv', 'C:/dataset/*.csv')

Python CSV: Can I do this with one 'with open' instead of two?

I am a noobie.
I have written a couple of scripts to modify CSV files I work with.
The scripts:
1.) Change the headers of a CSV file, then save that to a new CSV file.
2.) Load that CSV file, and change the order of select columns using DictWriter.
from tkinter import *
from tkinter import filedialog
import os
import csv

root = Tk()
fileName = filedialog.askopenfilename(filetypes=(("Nimble CSV files", "*.csv"), ("All files", "*.*")))
outputFileName = os.path.splitext(fileName)[0] + "_deleteme.csv"  # my temp file
forUpload = os.path.splitext(fileName)[0] + "_forupload.csv"

# Open the file - change the header then save the file
with open(fileName, 'r', newline='') as infile, open(outputFileName, 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, delimiter=',', lineterminator='\n')
    row1 = next(reader)
    # new header names
    row1[0] = 'firstname'
    row1[1] = 'lastname'
    row1[4] = 'phone'
    row1[5] = 'email'
    row1[11] = 'address'
    row1[21] = 'website'
    # write the temporary CSV file
    writer.writerow(row1)
    for row in reader:
        writer.writerow(row)

# Open the temporary CSV file - rearrange some columns
with open(outputFileName, 'r', newline='') as dInFile, open(forUpload, 'w', newline='') as dOutFile:
    fieldnames = ['email', 'title', 'firstname', 'lastname', 'company', 'phone', 'website', 'address', 'twitter']
    dWriter = csv.DictWriter(dOutFile, restval='', extrasaction='ignore', fieldnames=fieldnames, lineterminator='\n')
    dWriter.writeheader()
    for row in csv.DictReader(dInFile):
        dWriter.writerow(row)
My question is: Is there a more efficient way to do this?
It seems like I shouldn't have to make a temporary CSV file ("_deleteme.csv") I then delete.
I assume making the temporary CSV file is a rookie move -- is there a way to do this all with one 'with open' statement?
Thanks for any help, it is greatly appreciated.
--Luke
csvfile can be any object with a write() method. You could craft a custom object, or use io.StringIO. You'd have to verify efficiency yourself.
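A minimal sketch of that idea (fileName, forUpload, the column indices, and the fieldnames are taken from the script above; io.StringIO stands in for the "_deleteme.csv" temp file):

import csv
import io

fieldnames = ['email', 'title', 'firstname', 'lastname', 'company', 'phone', 'website', 'address', 'twitter']

with open(fileName, 'r', newline='') as infile:
    reader = csv.reader(infile)
    row1 = next(reader)
    # new header names, as in the original first pass
    row1[0] = 'firstname'
    row1[1] = 'lastname'
    row1[4] = 'phone'
    row1[5] = 'email'
    row1[11] = 'address'
    row1[21] = 'website'

    # Write the renamed header plus the remaining rows into an in-memory buffer
    # instead of a temporary file on disk.
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    writer.writerow(row1)
    writer.writerows(reader)

buffer.seek(0)  # rewind so DictReader starts at the renamed header
with open(forUpload, 'w', newline='') as dOutFile:
    dWriter = csv.DictWriter(dOutFile, restval='', extrasaction='ignore', fieldnames=fieldnames, lineterminator='\n')
    dWriter.writeheader()
    for row in csv.DictReader(buffer):
        dWriter.writerow(row)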

grab headers from multiple tsv/csv files

I have a list of tsv files where I am looking to grab column headers for all the files.
with open(os.path.abspath('reference/file.tsv'), 'rU') as file:
    reader = csv.reader(file)
    row1 = next(reader)
Currently, this snippet only reads one file, while I have a whole list of files that need to be parsed.
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
The name of the files are listed in files. How do I loop through the list of files and grab only the column headers for each file?
I tried this and it works.
import os
import csv

dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)

for f in files:
    with open(dir_path + '/' + f, 'rU') as file:
        reader = csv.reader(file)
        row1 = next(reader)
        print row1
The files variable in your code is the content of the reference folder, meaning all files and subfolders of the folder. They are returned in a list of strings, containing only the file or subfolder name. This means that you'll have to prefix the path yourself.
Example:
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)

for file in files:
    # Skip non-files (join with dir_path, since listdir returns bare names)
    if not os.path.isfile(os.path.join(dir_path, file)):
        continue
    with open(os.path.join(dir_path, file), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
An alternative using the pathlib module:
from pathlib import Path

for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
Wouldn't you be better off reading the first line of each of those files, appending them to a list, and then passing that list to csv.reader?
Example:
lines = []
for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        lines.append(f.readline())

reader = csv.reader(lines)
for row in reader:
    pass  # whatever you want to do with the parsed header rows
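For the original goal (just the column headers of each file), a small Python 3 sketch that collects one header row per file into a dict keyed by filename (same reference/ directory as above; the tab delimiter is an assumption for TSV input):

import csv
import os

dir_path = os.path.abspath('reference/')
headers_by_file = {}

for name in os.listdir(dir_path):
    full_path = os.path.join(dir_path, name)
    if not os.path.isfile(full_path):
        continue
    with open(full_path, newline='') as f:
        # TSV files need an explicit tab delimiter; plain CSVs can drop it.
        reader = csv.reader(f, delimiter='\t')
        headers_by_file[name] = next(reader, [])

for name, header in headers_by_file.items():
    print(name, header)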
