Looping through list of xml-files? - python

I'm trying to create a program that loops through a list of xml-files and extracts certain elements from the files:
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = r'C:\myfolder'
files = [f for f in listdir(mypath) if f.endswith('.xml')]

for file in files:
    tree = ET.parse(path.join(mypath, file))  # listdir returns bare names, so join the folder path
    root = tree.getroot()
    ns = {namespaces}  # placeholder: your namespace mapping

    def myfunction():
        if 'something' in root.tag:
            filename = path.splitext(file)[0]
            var1 = root.find('./element1', ns)
            var2 = root.find('./element2', ns)
            row = [
                var1.text,
                var2.text
            ]
            return row
The above code returns a list with var1, var2 (from the last file) if I call the function. The reason I have defined this function is that there are different types of xml-files with different element names, so I'm going to create a function for each file type.
Now I want to create a table where the output from each file is a row i.e.:
filename1, var1, var2
filename2, var1, var2
etc.
And ideally export the table to a csv-file. How do I go about that?

The easiest way to write a CSV file is with the standard library's csv module.
Writing a CSV file is as simple as opening the file and using the default writer:
import csv
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = r'C:\myfolder'
files = [f for f in listdir(mypath) if f.endswith('.xml')]

for file in files:
    tree = ET.parse(path.join(mypath, file))
    root = tree.getroot()
    ns = {namespaces}  # placeholder: your namespace mapping

    def myfunction():
        if 'something' in root.tag:
            filename = path.splitext(file)[0]
            var1 = root.find('./element1', ns)
            var2 = root.find('./element2', ns)
            row = [
                filename,
                var1.text,
                var2.text
            ]
            # Open the file in append mode and store the data
            with open('outfile.csv', 'a', newline='') as csvfile:
                csv_writer = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow(row)
            return row

    myfunction()  # call the function for each file
Note that csv_writer.writerow() receives a list as its parameter. (With the settings above the columns are space-separated; use delimiter=',' if you want a conventional comma-separated file.)
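A cleaner variant is to collect one row per file and write everything in a single pass, so the output file is opened only once and each row carries the filename the question asks for. A minimal sketch, reusing the question's placeholder condition and element paths ('something', './element1', './element2'):

import csv
from os import listdir, path
import xml.etree.ElementTree as ET

mypath = r'C:\myfolder'
rows = []

for file in [f for f in listdir(mypath) if f.endswith('.xml')]:
    tree = ET.parse(path.join(mypath, file))
    root = tree.getroot()
    ns = {}  # assumption: fill in your namespace mapping here
    if 'something' in root.tag:  # placeholder condition from the question
        var1 = root.find('./element1', ns)
        var2 = root.find('./element2', ns)
        rows.append([path.splitext(file)[0], var1.text, var2.text])

# Write every collected row in one pass
with open('outfile.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(rows)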

Related

How to use elements in list by order

My goal is to change multiple csv files in a folder into JSON.
First, I needed to list my csv files
for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        # check if csv files are listed correctly
        print(os.path.join("C:/Users/folder_to_csv", filename))
With this, I was able to call csv files in that folder.
Result:
C:/Users/folder_to_csv\file_1.csv
C:/Users/folder_to_csv\file_2.csv
C:/Users/folder_to_csv\file_3.csv
Then, I wanted to convert all of the csv files in 'csvlist' to jsonObj; however, for some reason, my code only uses the first file (C:/Users/folder_to_csv\file_1.csv).
This is what I have tried so far:
import json
import csv
import requests
import threading
import os
for file in os.listdir("C:/Users/folder_to_csv"):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        csvlist = os.path.join("C:/Users/folder_to_csv", filename)

data = {}

def main():
    # loop csv list so my code can read all csv files
    length = len(csvlist)
    for i in range(length):
        i += 1
        path = csvlist
        # switch csv to json
        with open(path, mode='r') as f:
            reader = csv.DictReader(f)
            processdata = [row for row in reader]
            dlist = processdata
            jsonObj = json.dumps(dlist)
            print(jsonObj)

main()
In the initial loop, you keep redefining the csvlist variable. I suppose you want it to be a list? Then just create an initial empty list and append to it instead of redefining:
csvlist = []
...
csvlist.append(os.path.join("C:/Users/folder_to_csv", filename))
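Putting it together, a minimal sketch of the corrected script (keeping the question's directory path and printing each file's JSON, as the original does):

import os
import csv
import json

src = "C:/Users/folder_to_csv"

# Build the full list of csv paths instead of overwriting one variable
csvlist = []
for file in os.listdir(src):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        csvlist.append(os.path.join(src, filename))

def main():
    # Convert each csv file's rows to a JSON string
    for path in csvlist:
        with open(path, mode='r') as f:
            reader = csv.DictReader(f)
            jsonObj = json.dumps([row for row in reader])
            print(jsonObj)

main()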

How to pass a file as an argument in python

I am having some issues passing an argument to a Python script so it can take a specific file like a csv, txt, or xml.
I am reviewing Python and would like some feedback on why I don't see any output after running the following command: ./my_script some3455.csv
#!/usr/bin/python
import sys
import csv
import xml.etree.ElementTree as ET

FILE = str(sys.argv[1])

def run_files():
    if FILE == '*.csv':
        run_csv()
    elif FILE == '*.txt':
        run_txt()
    else:
        run_xml()

def run_csv():
    csv.register_dialect('dialect', delimiter='|')
    with open(FILE, 'r') as file:
        reader = csv.reader(file, dialect='dialect')
        for row in reader:
            print(row)

def run_txt():
    with open(FILE, 'r') as file:
        txt_contents = file.read()
        print(txt_contents)

def run_xml():
    tree = ET.parse(FILE)
    root = tree.getroot()
    for child in root.findall('Attributes'):
        car = child.find('Car').text
        color = child.find('Color').text
        print(car, color)
I have tried passing the filename directly instead of using FILE, but it only works for one file type; the other file types don't get identified.
You need to use fnmatch and not == to compare a string with a glob pattern:
import fnmatch

def run_files():
    if fnmatch.fnmatch(FILE, '*.csv'):
        run_csv()
    elif fnmatch.fnmatch(FILE, '*.txt'):
        run_txt()
    else:
        run_xml()
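Note also that, as posted, the script defines run_files() but never calls it, which by itself would explain the missing output. And if glob patterns aren't needed, a plain suffix check works too; a sketch of that alternative:

def run_files():
    if FILE.endswith('.csv'):
        run_csv()
    elif FILE.endswith('.txt'):
        run_txt()
    else:
        run_xml()

run_files()  # without this call the script produces no output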

How to read n number of files in n variables and then add those variables in a list?

This is my code:
file_input1 = open('Amazon_Indi_Seller.py', 'r')
f1 = file_input1.read().lower()
file_input2 = open('Amazon_Prices.py', 'r')
f2 = file_input2.read().lower()
documents = [f1, f2]

import nltk, string, numpy
stemmer = nltk.stem.porter.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
Instead of reading 2 files I want to read all the files in a directory, and read them individually so that later I can add those variables to a list named documents.
You can use the code mentioned below:

def read_files(file):
    file_input1 = open(file, 'r')
    f1 = file_input1.read()
    file_input1.close()
    return f1

files = ['sample.py', 'Amazon_Indi_Seller.py']
data = list()
for file in files:
    data.append(read_files(file))
print(data)
The above code reads the files mentioned in the list.
import os

def read_files(file):
    file_input1 = open(file, 'r')
    f1 = file_input1.read()
    file_input1.close()
    return f1

src = r'DIRECTORY PATH'
data = list()
for file in os.listdir(src):
    data.append(read_files(os.path.join(src, file)))
print(data)
And the above code reads all the files from the directory mentioned; note that os.path.join is needed because os.listdir returns bare file names.
You could collect all the file contents in a list, for example:

import os

lst = []
for file in os.listdir():
    with open(file, "r") as file_input:
        lst.append(file_input.read())
One extra recommendation: in general it might be wise to store the contents of a file as a collection of its lines, for example by using file_input.readlines(), which returns a list of lines.
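For illustration, a small sketch of that recommendation (the filename is hypothetical):

# readlines() keeps the file's line structure instead of one big string
with open('sample.py', 'r') as file_input:
    lines = file_input.readlines()  # e.g. ['import os\n', 'print(1)\n']
print(len(lines), 'lines read')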
Create a list of all filenames, then iterate over the list and add each file's content to a dictionary.
from collections import defaultdict  # imported default dictionary

result = defaultdict()  # created empty default dictionary
filenames = ['name1.py', 'name2.py', 'name3.py']  # added filenames to a list

for name in filenames:  # iterate over filename list
    with open(name, 'r') as stream:  # open each file
        data = stream.readlines()  # readlines returns a list of lines
        result[name] = data  # set name as key and content as value in dictionary
print(result)
In this way you will have a dictionary with filenames as keys and their contents as values.
If the directory may include subdirectories whose files you also want to read, use os.walk.
Here is sample code from the official documentation:
import os
from os.path import join, getsize

for root, dirs, files in os.walk('python/Lib/email'):
    print(root, "consumes", end=" ")
    print(sum(getsize(join(root, name)) for name in files), end=" ")
    print("bytes in", len(files), "non-directory files")
    if 'CVS' in dirs:
        dirs.remove('CVS')  # don't visit CVS directories
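Applied to this question, a hedged sketch that walks a directory tree and collects every file's text into documents (the directory path is a placeholder):

import os

documents = []
for root, dirs, files in os.walk('DIRECTORY PATH'):
    for name in files:
        # join is needed because os.walk yields bare file names
        with open(os.path.join(root, name), 'r') as f:
            documents.append(f.read().lower())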

Parse all the XML files and convert them to one CSV file

I am trying to write code that searches a directory for all XML files, then parses those files and saves some data to a CSV file. I have 50-plus XML files in that directory. Whenever I run my code a CSV file is created, but it only contains data from the last XML file. How can I write all the XML files' data to one CSV file? Please help.
Here is my code :
from xml.dom.minidom import parse
import csv
import os

def writeToCSV(frelation):
    csvfile = open('data.csv', 'w')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})

for root, dirs, files in os.walk('data/frames'):
    for file in files:
        if file.endswith('.xml'):
            xmldoc = parse(os.path.join(root, file))
            frelation = xmldoc.getElementsByTagName("frameset")[0]
            writeToCSV(frelation)
You are overwriting the same file again and again in writeToCSV; maybe a little change as below:
def writeToCSV(frelation, file_id):
    csvfile = open('data' + str(file_id) + '.csv', 'w')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})

file_id = 1
for root, dirs, files in os.walk('data/frames'):
    for file in files:
        if file.endswith('.xml'):
            xmldoc = parse(os.path.join(root, file))
            frelation = xmldoc.getElementsByTagName("frameset")[0]
            writeToCSV(frelation, file_id)
            file_id += 1
If you want only one CSV file, you need to open the file in append mode; a+ mode creates the file if it does not exist:
def writeToCSV(frelation):
    csvfile = open('data.csv', 'a+')
    fieldnames = ['sub', 'sup']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    relation = frelation.getElementsByTagName("predicate")
    for elem in relation:
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            sup = elem1.attributes['name'].value
            writer.writerow({'sub': sub, 'sup': sup})
No changes required in other code.
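One caveat the answer above doesn't mention: in append mode, writeheader() runs on every call, so the header row is repeated once per XML file. A minimal sketch that avoids this by creating the writer once and passing it in:

import csv
import os
from xml.dom.minidom import parse

def writeToCSV(frelation, writer):
    for elem in frelation.getElementsByTagName("predicate"):
        sub = elem.attributes['sub'].value
        for elem1 in elem.getElementsByTagName("sup"):
            writer.writerow({'sub': sub, 'sup': elem1.attributes['name'].value})

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['sub', 'sup'])
    writer.writeheader()  # header written exactly once
    for root, dirs, files in os.walk('data/frames'):
        for file in files:
            if file.endswith('.xml'):
                xmldoc = parse(os.path.join(root, file))
                writeToCSV(xmldoc.getElementsByTagName("frameset")[0], writer)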

grab headers from multiple tsv/csv files

I have a list of tsv files where I am looking to grab column headers for all the files.
with open(os.path.abspath('reference/file.tsv'), 'rU') as file:
    reader = csv.reader(file)
    row1 = next(reader)
Currently, this snippet only reads one file, but I have a list of files that need to be parsed.
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
The names of the files are listed in files. How do I loop through the list of files and grab only the column headers for each file?
I tried this and it works:
import os
import csv

dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
for f in files:
    with open(dir_path + '/' + f, 'rU') as file:
        reader = csv.reader(file)
        row1 = next(reader)
        print(row1)
The files variable in your code contains the contents of the reference folder, meaning all files and subfolders of that folder. They are returned as a list of strings containing only the file or subfolder names, so you'll have to prefix the path yourself.
Example:
dir_path = os.path.abspath('reference/')
files = os.listdir(dir_path)
for file in files:
    # Skip non-files (the full path is needed for the check too)
    if not os.path.isfile(os.path.join(dir_path, file)):
        continue
    with open(os.path.join(dir_path, file), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
An alternative using the pathlib module:
from pathlib import Path

for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        reader = csv.reader(f)
        row1 = next(reader)
Wouldn't you be better off reading the first line of each of those files, appending them to a list, and then passing the list to csv.reader?
Example:
from pathlib import Path

lines = []
for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(str(file.resolve()), 'rU') as f:
        lines.append(f.readline())

reader = csv.reader(lines)
for row in reader:
    # whatever you want to do with the parsed lines
    print(row)
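If you also need to know which header came from which file, a small variation (sketched under the same assumptions) keeps them paired in a dict; for tsv files, pass delimiter='\t' so the tabs are actually split:

from pathlib import Path
import csv

headers = {}
for file in Path('reference/').glob('*'):
    if not file.is_file():
        continue
    with open(file, newline='') as f:
        headers[file.name] = next(csv.reader(f, delimiter='\t'))
print(headers)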
