TypeError when parsing XML - python

I have an XML file of metadata on dissertations and I'm trying to get the author name as a single string. Names in the XML look like this:
<DISS_name>
    <DISS_surname>Clark</DISS_surname>
    <DISS_fname>Brian</DISS_fname>
    <DISS_middle/>
    <DISS_suffix/>
</DISS_name>
All names have first and last names, but only some have middle names and/or suffixes. Here is my code:
author_surname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_surname').text.strip().title()
author_fname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_fname').text.strip().title()
author_mname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_middle')
author_suffix = record.find('DISS_authorship/DISS_author/DISS_name/DISS_suffix')
if author_mname is not None and author_suffix is not None:
    author_name = author_surname + ', ' + author_fname + author_mname.text + ', ' + author_suffix.text
if author_mname is not None and author_suffix is None:
    author_name = author_surname + ', ' + author_fname + author_mname.text
if author_mname is None and author_suffix is None:
    author_name = author_surname + ', ' + author_fname
Why am I getting this output and how can I fix it?
Traceback (most recent call last):
File "C:\Users\bpclark2\pythonProject3\prqXML-to-dcCSV.py", line 185, in <module>
author_name = author_surname + ', ' + author_fname + author_mname.text + author_suffix.text
TypeError: can only concatenate str (not "NoneType") to str
Revised code:
author_surname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_surname').text.strip().title()
author_fname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_fname').text.strip().title()
author_mname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_middle').text or ''
author_suffix = record.find('DISS_authorship/DISS_author/DISS_name/DISS_suffix').text or ''
author_name = author_surname + ', ' + author_fname + ' ' + str(author_mname.strip().title()) + str(', ' + author_suffix.strip().title())
row.append(author_name)
This gets the output I was looking for:
author_surname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_surname').text.strip().title()
author_fname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_fname').text.strip().title()
author_mname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_middle').text or ''
author_suffix = record.find('DISS_authorship/DISS_author/DISS_name/DISS_suffix').text or ''
author_name = author_surname + ', ' + author_fname + ' ' + author_mname.strip().title() + ', ' + author_suffix.strip().title()
if author_mname != '' and author_suffix != '':
    author_name = author_surname + ', ' + author_fname + ' ' + author_mname.strip().title() + ', ' + author_suffix.strip().title()
    row.append(author_name)
if author_mname != '' and author_suffix == '':
    author_name = author_surname + ', ' + author_fname + ' ' + author_mname.strip().title()
    row.append(author_name)
if author_mname == '' and author_suffix != '':
    author_name = author_surname + ', ' + author_fname + ', ' + author_suffix.strip().title()
    row.append(author_name)
if author_mname == '' and author_suffix == '':
    author_name = author_surname + ', ' + author_fname
    row.append(author_name)

What about changing your code to something like this:
author_mname = record.find('DISS_authorship/DISS_author/DISS_name/DISS_middle') or ''
author_suffix = record.find('DISS_authorship/DISS_author/DISS_name/DISS_suffix') or ''
You could also add str casts, like:
... + str(author_suffix.text)
And if you are on a recent Python, please use f-strings! Life is much easier with them.
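For instance, a minimal f-string sketch based on the question's original find() calls (assuming author_mname and author_suffix are the Element objects returned by find, whose .text may be None):

# Minimal f-string sketch; author_mname and author_suffix are the Elements
# from the question's find() calls, so their .text may be None.
middle = f' {author_mname.text.strip().title()}' if author_mname is not None and author_mname.text else ''
suffix = f', {author_suffix.text.strip().title()}' if author_suffix is not None and author_suffix.text else ''
author_name = f'{author_surname}, {author_fname}{middle}{suffix}'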

I'd keep everything simple with just minor edits to the code. You can use the XPath .//DISS_name to find all <DISS_name> nodes and then unpack each one into separate variables with corresponding names. Code:
import xml.etree.ElementTree as ET
data = """\
<DISS_authorship>
<DISS_author>
<DISS_name>
<DISS_surname>Clark</DISS_surname>
<DISS_fname>Brian</DISS_fname>
<DISS_middle/>
<DISS_suffix/>
</DISS_name>
</DISS_author>
</DISS_authorship>"""
root = ET.fromstring(data)
row = []
for name_node in root.iterfind(".//DISS_name"):
    surname, fname, middle, suffix = name_node  # 4 child nodes in this order
    name_str = surname.text + ", " + fname.text
    if middle.text:
        name_str += " " + middle.text
    if suffix.text:
        name_str += ", " + suffix.text
    row.append(name_str)
Or even shorter:
import xml.etree.ElementTree as ET
data = ...
root = ET.fromstring(data)
row = []
for (surname, fname, middle, suffix) in root.iterfind(".//DISS_name"):
    name_str = surname.text + ", " + fname.text
    if middle.text:
        name_str += " " + middle.text
    if suffix.text:
        name_str += ", " + suffix.text
    row.append(name_str)

A shorter version of the same idea is below:
import xml.etree.ElementTree as ET
xml = '''<r><DISS_name>
<DISS_surname>Clark</DISS_surname>
<DISS_fname>Brian</DISS_fname>
<DISS_middle/>
<DISS_suffix/>
</DISS_name>
<DISS_name>
<DISS_surname>Jack</DISS_surname>
<DISS_fname>Brian</DISS_fname>
<DISS_middle>Smith</DISS_middle>
<DISS_suffix/>
</DISS_name>
</r>'''
root = ET.fromstring(xml)
for name in root.findall('.//DISS_name'):
    parts = [name.find(f'DISS_{f}').text for f in ['surname', 'fname', 'middle', 'suffix'] if name.find(f'DISS_{f}').text is not None]
    print(", ".join(parts))
Output:
Clark, Brian
Jack, Brian, Smith
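One more option worth noting, building on the question's record variable: Element.findtext returns the element's text, an empty string when the element exists but is empty, and the default you pass when it is missing entirely, which avoids the None checks. A minimal sketch:

base = 'DISS_authorship/DISS_author/DISS_name/'
author_surname = record.findtext(base + 'DISS_surname', default='').strip().title()
author_fname = record.findtext(base + 'DISS_fname', default='').strip().title()
author_mname = record.findtext(base + 'DISS_middle', default='').strip().title()
author_suffix = record.findtext(base + 'DISS_suffix', default='').strip().title()

author_name = author_surname + ', ' + author_fname
if author_mname:
    author_name += ' ' + author_mname
if author_suffix:
    author_name += ', ' + author_suffix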

Related

Tweepy error with exporting array content

I am looking to extract tweets and write them to a CSV file; however, I cannot figure out how to get it to generate a file. I am using Tweepy to extract the tweets. I would like the CSV file to contain the following cells: User, date, tweet, likes, retweets, total, eng rate, rating, tweet id.
import tweepy
import csv
auth = tweepy.OAuthHandler("", "")
auth.set_access_token("", "")
api = tweepy.API(auth)
try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

def timeline(username):
    tweets = api.user_timeline(screen_name=username, count='100', tweet_mode="extended")
    for status in tweets:
        eng = round(((status.favorite_count + status.retweet_count)/status.user.followers_count)*100, 2)
        if (not status.retweeted) and ('RT #' not in status.full_text) and (eng <= 0.02):
            print(status.user.screen_name + ',' + str(status.created_at) + ',' + status.full_text + ",Likes: " + str(status.favorite_count) + ",Retweets: " + str(status.retweet_count) + ',Total: ' + str(status.favorite_count + status.retweet_count) + ',Engagement rate: ' + str(eng) + '%' + 'Rating: Low' + ',Tweet ID: ' + str(status.id))
        elif (not status.retweeted) and ('RT #' not in status.full_text) and (0.02 < eng <= 0.09):
            print(status.user.screen_name + ',' + str(status.created_at) + ',' + status.full_text + ",Likes: " + str(status.favorite_count) + ",Retweets: " + str(status.retweet_count) + ',Total: ' + str(status.favorite_count + status.retweet_count) + ',Engagement rate: ' + str(eng) + '%' + 'Rating: Good' + ',Tweet ID: ' + str(status.id))
        elif (not status.retweeted) and ('RT #' not in status.full_text) and (0.09 < eng <= 0.33):
            print(status.user.screen_name + ',' + str(status.created_at) + ',' + status.full_text + ",Likes: " + str(status.favorite_count) + ",Retweets: " + str(status.retweet_count) + ',Total: ' + str(status.favorite_count + status.retweet_count) + ',Engagement rate: ' + str(eng) + '%' + 'Rating: High' + ',Tweet ID: ' + str(status.id))
        elif (not status.retweeted) and ('RT #' not in status.full_text) and (0.33 < eng):
            print(status.user.screen_name + ',' + str(status.created_at) + ',' + status.full_text + ",Likes: " + str(status.favorite_count) + ",Retweets: " + str(status.retweet_count) + ',Total: ' + str(status.favorite_count + status.retweet_count) + ',Engagement rate: ' + str(eng) + '%' + 'Rating: Very High' + ',Tweet ID: ' + str(status.id))

tweet = timeline("twitter")

with open('tweet.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow([tweet])
You can look at https://docs.python.org/3/library/csv.html for info on how to generate a CSV file in Python. Quick example:
import csv
with open('some_output.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["field1", "field2", "field3"])
Your timeline function does not return a value, but you are assigning its result to tweet, so tweet ends up as None. Also, the tweet value will be a list of strings, and the writerow method of csv.writer should get a list of items, not a list of lists. I have modified your code to address those issues. Let me know if it works.
def get_tweets(username):
    tweets = api.user_timeline(screen_name=username, count=100)
    tweets_for_csv = [tweet.text for tweet in tweets]
    print(tweets_for_csv)
    return tweets_for_csv

tweet = get_tweets("fazeclan")

with open('tweet.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(tweet)
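If the goal is one CSV row per tweet with the columns listed in the question (user, date, tweet, likes, retweets, total, eng rate, rating, tweet id), a sketch along these lines may help. It reuses the api object from the question, copies the rating thresholds from the question's if/elif chain, and leaves out the retweet filtering for brevity:

def rate(eng):
    # Thresholds copied from the question's if/elif chain (assumed to be the intended cut-offs).
    if eng <= 0.02:
        return 'Low'
    if eng <= 0.09:
        return 'Good'
    if eng <= 0.33:
        return 'High'
    return 'Very High'

rows = []
for status in api.user_timeline(screen_name="twitter", count=100, tweet_mode="extended"):
    eng = round(((status.favorite_count + status.retweet_count) / status.user.followers_count) * 100, 2)
    rows.append([status.user.screen_name, status.created_at, status.full_text,
                 status.favorite_count, status.retweet_count,
                 status.favorite_count + status.retweet_count, eng, rate(eng), status.id])

with open('tweet.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['User', 'Date', 'Tweet', 'Likes', 'Retweets', 'Total', 'Eng rate', 'Rating', 'Tweet ID'])
    writer.writerows(rows)  # one row per tweet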

Python adds a double entry at the end of output file

This is my code to create a hashtag file. The issue is that it does not put the # on the first hashtag, and at the end it puts a duplicate hashtag, like below.
passiveincome, #onlinemarketing, #wahmlife, #cash, #entrepreneurlifestyle, #makemoneyonline, #makemoneyfast, #entrepreneurlifestyle, #mlm, #mlm
How do I get the code to remove the double output and put the # at the beginning?
import random, os, sys

basepath = os.path.dirname(sys.argv[0]) + "/"
outputpath = "C:/Users/matth/OneDrive/Desktop/Create hashtags/"
paragraphsmin = 9
paragraphsmax = 9
sentencemin = 1
sentencemax = 1
keywords = []
for line in open(basepath + "/base.txt", "r"):
    keywords.append(line.replace("\n", ""))
keywordlist = []
keyword = open(basepath + "/text-original.txt", "r")
for line in keyword:
    keywordlist.append(line.replace("\n", "\n"))

def type(name):
    value = name[random.randint(0, len(name) - 1)]
    return value

"""
def xyz(num):
    s1 = '' + type(keywordlist).strip()
    return eval('s' + str(num))
"""

def s1():
    return '' + type(keywordlist).strip()

def randomSentence():
    sent = eval("s" + str(random.randint(1, 1)) + "()")
    return sent

for keyword in keywords:
    outputfile = open(outputpath + keyword.replace(" ", " ") + ".txt", "w")
    outputfile.write('')
    for p in range(1, random.randint(paragraphsmin, paragraphsmax) + 1):
        outputfile.write('')
        for s in range(1, random.randint(sentencemin, sentencemax) + 1):
            sentence = randomSentence()
            if str(sentence)[0] == "\"":
                outputfile.write("" + str(sentence)[0] + str(sentence)[1] + str(sentence)[2:] + " ")
            else:
                outputfile.write("" + str(sentence)[0] + str(sentence)[1:] + ", #")
        outputfile.write('')
    outputfile.write(sentence.replace("", "") + "")
    outputfile.close()
Try replacing
outputfile.write("" + str(sentence)[0] + str(sentence)[1:] + ", #")
with
outputfile.write("#" + str(sentence)[0] + str(sentence)[1:] + ", ")

How to remove elements with common characters from a list?

I would like to remove elements of the form ' + 0x^n' (except the last one, if it is of the form ' + 0x^0') from this list:
polynomial = ['-7x^5', ' + 0x^4', ' + 0x^3', ' + 4x^2', ' + 4x^1', ' + 2x^0']
i.e. the output should look like this:
['-7x^5', ' + 4x^2', ' + 4x^1', ' + 2x^0']
I tried looping over the indices with an if statement that removes list elements whose character at index 3 is '0' (see code below):
res = []
for elements in range(0, len(polynomial) - 1):
    if polynomial[elements][3] == '0':
        polynomial.remove(polynomial[elements])
        res.append(polynomial)
    else:
        res.append(polynomial)
print(res[0])
Try:
polynomial = ['-7x^5', ' + 0x^4', ' + 0x^3', ' + 4x^2', ' + 4x^1', ' + 2x^0', ' + 0x^0']
res = []
for p in polynomial:
    if p == ' + 0x^0' or p[:-1] != ' + 0x^':
        res.append(p)
print(res)  # ['-7x^5', ' + 4x^2', ' + 4x^1', ' + 2x^0', ' + 0x^0']
Without regex:
polynomial = ['-7x^5', ' + 0x^4', ' + 0x^3', ' + 4x^2', ' + 4x^1', ' + 2x^0',' + 0x^0']
res = [i for i in polynomial if "0x^" not in i or "0x^0" in i]
print(res)
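Since the answer above says "without regex", here is a hedged regex-based sketch of the same filter for comparison (the pattern is an assumption about the term format ' + 0x^n'):

import re

polynomial = ['-7x^5', ' + 0x^4', ' + 0x^3', ' + 4x^2', ' + 4x^1', ' + 2x^0']
# Drop terms that look like ' + 0x^n' unless the exponent is 0.
res = [t for t in polynomial if not re.fullmatch(r'\s*\+\s*0x\^[1-9]\d*', t)]
print(res)  # ['-7x^5', ' + 4x^2', ' + 4x^1', ' + 2x^0']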
You can try:
polynomial = ['-7x^5', ' + 0x^4', ' + 0x^3', ' + 4x^2', ' + 4x^1', ' + 2x^0', '0x^2']
res = []
s = "0x^"  # substring to delete; you could make this an input
for i in polynomial:
    if s not in i:
        res.append(i)
print(res)

python - rename files with 001,002 etc

I am working on code that renames (encodes) my files according to the filename. If a file is named DOG or CAT, it should be renamed to DEL or DBD plus the date and a number (001, 002). The problem is that if I have several CAT files, I get an error because the code does not increment the number. Also, the numbering should be separate for each file type (so CAT 001, 002 and DOG 001, 002).
I've tried to increase the counter by one every time I get the FileNameError exception, but it does not seem to work. Could you please help me out?
import os
from datetime import date

def rename_files():
    path = 'U:\get_filename'
    cur_day = str(date.today().strftime("%Y%m%d"))
    counter =+ 1
    stamp_2 = "{} 00{}.pdf".format(cur_day, str(counter))
    del01 = 'DEL ' + stamp_2
    dbd02 = 'DBD ' + stamp_2
    cgf01 = 'CGF ' + stamp_2
    cle01 = 'CLE ' + stamp_2
    dmm01 = 'DMM ' + stamp_2
    dqt01 = 'DQT ' + stamp_2
    sri01 = 'SRI ' + stamp_2
    cal01 = 'CAL ' + stamp_2
    for r, d, f in os.walk(path):
        for file in f:
            counter += 1
            try:
                if 'DOG' in file:
                    os.rename(file, del01)
                elif 'CAT' in file:
                    os.rename(file, dbd02)
                elif 'BIRD' in file:
                    os.rename(file, cgf01)
                elif 'FISH' in file:
                    os.rename(file, cle01)
                elif 'INSECT' in file:
                    os.rename(file, dmm01)
                elif 'CAR' in file:
                    os.rename(file, dqt01)
                elif 'BIKE' in file:
                    os.rename(file, sri01)
                elif 'SCOOTER' in file:
                    os.rename(file, cal01)
            except:
                counter += 1

rename_files()
import os
from datetime import date

names = {
    "DEL": 1,
    "DBD": 1,
    "CGF": 1,
    "CLE": 1,
    "DMM": 1,
    "DQT": 1,
    "SRI": 1,
    "CAL": 1,
}
cur_day = str(date.today().strftime("%Y%m%d"))

def rename(source, name):
    prefix = "{} {:03d}.pdf".format(cur_day, names[name])
    try:
        os.rename(source, "{} {}".format(name, prefix))
        names[name] += 1
    except:
        pass

def rename_files():
    path = "U:\get_filename"
    for r, d, f in os.walk(path):
        for file in f:
            if "DOG" in file:
                rename(file, "DEL")
            elif "CAT" in file:
                rename(file, "DBD")
            elif "BIRD" in file:
                rename(file, "CGF")
            elif "FISH" in file:
                rename(file, "CLE")
            elif "INSECT" in file:
                rename(file, "DMM")
            elif "CAR" in file:
                rename(file, "DQT")
            elif "BIKE" in file:
                rename(file, "SRI")
            elif "SCOOTER" in file:
                rename(file, "CAL")

rename_files()
Great, thanks for the help. Though it's not quite perfect yet: when I run the rename function, I get a TypeError: unhashable type: 'dict'.
import os
from datetime import date
import pandas as pd

cur_day = str(date.today().strftime("%Y%m%d"))

def main():
    print("Current Working Directory ", os.getcwd())
    os.chdir(r'U:\\get_filename')
    i = 1
    j = 1
    k = 1
    l = 1
    m = 1
    n = 1
    o = 1
    p = 1
    q = 1
    r = 1
    path = r'U:\get_filename'
    for filename in os.listdir(r'U:\get_filename'):
        my_dest = 'CLE ' + cur_day + ' 00' + str(i) + ".pdf"
        my_dest2 = 'DBD ' + cur_day + ' 00' + str(j) + ".pdf"
        my_dest3 = 'CGF ' + cur_day + ' 00' + str(k) + ".pdf"
        my_dest4 = 'CLE ' + cur_day + ' 00' + str(l) + ".pdf"
        my_dest5 = 'DMM ' + cur_day + ' 00' + str(m) + ".pdf"
        my_dest6 = 'DQT ' + cur_day + ' 00' + str(n) + ".pdf"
        my_dest7 = 'SRI ' + cur_day + ' 00' + str(o) + ".pdf"
        my_dest8 = 'CAL ' + cur_day + ' 00' + str(p) + ".pdf"
        my_dest9 = 'BIL ' + cur_day + ' 00' + str(q) + ".pdf"
        my_dest10 = 'DEL ' + cur_day + ' 00' + str(r) + ".pdf"
        my_source = path + filename
        if 'Jerry' in filename:
            os.rename(filename, my_dest)
            i += 1
        elif 'Bob' in filename:
            os.rename(filename, my_dest2)
            j += 1
        elif 'Sara' in filename:
            os.rename(filename, my_dest3)
            k += 1
        elif 'Greg' in filename:
            os.rename(filename, my_dest4)
            l += 1
        elif 'Annie' in filename:
            os.rename(filename, my_dest5)
            m += 1
        elif 'Beth' in filename:
            os.rename(filename, my_dest6)
            n += 1
        elif 'Claire' in filename:
            os.rename(filename, my_dest7)
            o += 1
        elif 'Johnny' in filename:
            os.rename(filename, my_dest8)
            p += 1
        elif 'Bob' in filename:
            os.rename(filename, my_dest9)
            q += 1
        elif 'Jimbo' in filename:
            os.rename(filename, my_dest10)
            r += 1

def exp_names():
    list_files = os.listdir(r'U:\\get_filename')
    list_2 = [x.strip('.pdf') for x in list_files]
    list_2 = pd.DataFrame(list_2)
    list_3 = list_2[list_2[0] != 'iles.xlsx']
    list_3.to_excel(r'U:\get_filename\\files.xlsx', header=False, index=False)

if __name__ == "__main__":
    main()
    exp_names()
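A possible alternative to the many separate counters: a hedged sketch using a keyword-to-prefix mapping and collections.Counter, so each prefix keeps its own running number. The DOG/CAT markers and the U:\get_filename path come from the question; the mapping itself is illustrative and would need to be filled out:

import os
from collections import Counter
from datetime import date

# Illustrative mapping from a marker in the filename to the coded prefix.
prefix_for = {'DOG': 'DEL', 'CAT': 'DBD', 'BIRD': 'CGF', 'FISH': 'CLE'}
counts = Counter()  # one running number per prefix
cur_day = date.today().strftime("%Y%m%d")
path = r'U:\get_filename'

for entry in os.listdir(path):
    for marker, prefix in prefix_for.items():
        if marker in entry:
            counts[prefix] += 1
            new_name = f"{prefix} {cur_day} {counts[prefix]:03d}.pdf"
            os.rename(os.path.join(path, entry), os.path.join(path, new_name))
            break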

python read all files from a folder and write the file name and other info into a txt file

I have 30911 HTML files. I need to do web scraping and then save the info into a txt file named index.txt.
It should look like
filename1, title, t1, date, p1
filename2, title, t1, date, p1
filename3, title, t1, date, p2
and so on...
I only want the filename, but the output gave me path + filename.
Your problem is that filename is really a file path. To get just the filename you can use the os module:
os.path.basename('filepath')
so in order to write to the file:
indexFile.write(os.path.basename(filename)+ ', ' + title.get_text(strip=True) + ', '+ ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
You can use:
import glob
import os
import re
import bs4

path = 'C:/Users/.../.../output/'
# read html files
for filename in glob.glob(os.path.join(path, '*.html')):
    soup = bs4.BeautifulSoup(open(filename).read(), "lxml")
    title = soup.find('h1')
    ticker = soup.find('p')
    d_date = soup.find_all('div', {"id": "a-body"})[0].find_all("p")[2]
    try:
        def find_participant(tag):
            return tag.name == 'p' and tag.find("strong", text=re.compile(r"Executives|Corporate Participants"))
        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
    except:
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(filename + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + 'No participants' + '\n')
    else:
        participants = soup.find(find_participant)
        parti_names = ""
        for parti in participants.find_next_siblings("p"):
            if parti.find("strong", text=re.compile(r"(Operator)")):
                break
            parti_names += parti.get_text(strip=True) + ","
        indexFile = open('C:/Users/.../output1/' + 'index.txt', 'a+')
        indexFile.write(os.path.basename(filename) + ', ' + title.get_text(strip=True) + ', ' + ticker.get_text(strip=True) + ', ' + d_date.get_text(strip=True) + ', ' + parti_names + '\n')
    indexFile.close()
ntpath is another module that can be used to get the base name from a path:
>>> import ntpath
>>> ntpath.basename('C:/Users/.../output1/' + 'index.txt')
'index.txt'
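On Python 3, pathlib from the standard library gives the same result; a small optional alternative (the path here is just the example from above):
>>> from pathlib import Path
>>> Path('C:/Users/.../output1/index.txt').name
'index.txt'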
