Parsing and sorting keys in Python dictionary - python

I created the following dictionary:
code dictionary = {u'News; comment; negative': u'contradictory about news', u'News; comment': u'something about news'}
I now want to write some Python code that goes through the dictionary's keys and separates out the codes and their corresponding values. So for the first element in the dictionary, I want to end up with:
News: 'contradictory about news', 'something about news'
comment: 'contradictory about news', 'something about news'
negative: 'contradictory about news'
The end result can be a dictionary, list, or tab or comma-separated text.
You can see my attempt to do this here:
from bs4 import BeautifulSoup as Soup
f = open('transcript.xml','r')
soup = Soup(f)
#print soup.prettify()
#searches text for all w:commentrangestart tags and makes a dictionary that matches ids with text
textdict = {}
for i in soup.find_all('w:commentrangestart'):
# variable 'key' is assigned to the tag id
key = i.parent.contents[1].attrs['w:id']
key = str(key)
#variable 'value' is assigned to the tag's text
value= ''.join(i.nextSibling.findAll(text=True))
# key / value pairs are added to the dictionary 'textdict'
textdict[key]=value
print "Transcript Text = " , textdict
# makes a dictionary that matches ids with codes
codedict = {}
for i in soup.find_all('w:comment'):
key = i.attrs['w:id']
key = str(key)
value= ''.join(i.findAll(text=True))
codedict[key]=value
print "Codes = ", codedict
# makes a dictionary that matches all codes with text
output = {}
for key in set(textdict.keys()).union(codedict.keys()):
print "key= ", key
txt = textdict[key]
print "txt = ", txt
ct = codedict[key]
print "ct= ", ct
output[ct] = txt
#print "output = ", output
print "All code dictionary = ", output
#codelist={}
#for key in output:
# codelist =key.split(";")
#print "codelist= " , codelist
code_negative = {}
code_news = {}
print output.keys()
for i in output:
if 'negative' in output.keys():
print 'yay'
code_negative[i]=textdict[i]
print 'text coded negative: ' , code_negative
if 'News' in i:
code_news[i]=textdict[i]
print 'text coded News: ' ,code_news
For some reason though, I keep getting a key error when I run the last function:
code_negative = {}
code_news = {}
for i in output:
if 'negative' in output.keys():
code_negative[i]=textdict[i]
print 'text coded negative: ' , code_negative
if 'News' in i:
code_news[i]=textdict[i]
print 'text coded News: ' ,code_news
Any ideas? Thanks!

The following code should work, if I understood the problem correctly:
from collections import defaultdict
out = defaultdict(list)
for k, v in code_dictionary.viewitems():
for item in k.split('; '):
out[item].append(v)

# Maps each semicolon-separated code string to the text it annotates.
output = {
    u'News; comment; negative': u'contradictory about news',
    u'News; comment': u'something about news',
}

negatives = []
comments = []
news = []

for k, v in output.items():
    # Split on ';' and strip each piece so that "News;comment" and
    # "News; comment" are handled alike; lower-case the pieces so the
    # membership tests below are case-insensitive.
    key_parts = {part.strip().lower() for part in k.split(';')}
    if 'negative' in key_parts:
        negatives.append(v)
    if 'news' in key_parts:
        news.append(v)
    if 'comment' in key_parts:
        comments.append(v)

Related

convert string which contains sub string to dictionary

I am trying to convert strings that are in a particular format to a Python dictionary.
String format is like below,
st1 = 'key1 key2=value2 key3="key3.1, key3.2=value3.2 , key3.3 = value3.3, key3.4" key4'
I want to parse it and convert to dictionary as below,
dict1 {
key1: None,
key2: value2,
key3: {
key3.1: None,
key3.2: value3.2,
key3.3: value3.3,
key3.4: None
}
key4: None,
I tried to use Python's re package and the string split function, but was not able to achieve the result. I have thousands of strings in the same format and am trying to automate the conversion. Could someone help?
If all your strings are consistent, and only have 1 layer of sub dict, this code below should do the trick, you may need to make tweaks/changes to it.
import json
import re

# One top-level entry: a key, optionally followed by "=" and either a
# double-quoted group or a bare (whitespace-free) value.
_ENTRY = re.compile(r'\s*([^\s="]+)(?:\s*=\s*(?:"([^"]*)"|([^\s"]+)))?')


def _parse_group(group):
    """Parse the comma-separated body of a quoted group into a dict."""
    parsed = {}
    for chunk in group.split(','):
        chunk = chunk.strip()
        if not chunk:
            continue
        # partition (not split) so a value may itself contain '='.
        name, sep, val = chunk.partition('=')
        parsed[name.strip()] = val.strip() if sep else None
    return parsed


def parse_kv_string(text):
    """Convert a 'key key=value key="a, b=c"' string into a dict.

    Keys without a value map to None; a double-quoted value becomes a
    nested dict built from its comma-separated entries.

    Raises:
        ValueError: if some part of *text* does not fit the format.
    """
    text = text.strip()
    result = {}
    pos = 0
    # Consume the input strictly left to right. Removing tokens with
    # str.replace would also delete later occurrences of the same
    # substring (e.g. 'key1' inside 'key10').
    while pos < len(text):
        m = _ENTRY.match(text, pos)
        if m is None:
            raise ValueError(
                f"cannot parse input at offset {pos}: {text[pos:]!r}")
        key, group, bare = m.groups()
        if group is not None:
            result[key] = _parse_group(group)
        else:
            # bare is None when the key has no "=value" part.
            result[key] = bare
        pos = m.end()
    return result


st1 = 'key1 key2=item2 key3="key3.1, key3.2=item3.2 , key3.3 = item3.3, key3.4" key4'
new_dict = parse_kv_string(st1)
print(json.dumps(new_dict, indent=4))
Consider using a parsing tool like Lark. A simple example for your case:
# Grammar for the "key[=value]" format. The input is wrapped in double quotes
# before parsing (see parser.parse below), so the top level is itself an
# `object`, just like a quoted sub-group.
# NON_SEPARATOR_STRING uses A-Z: the range A-z would also admit the
# punctuation characters that sit between 'Z' and 'a' in ASCII.
_grammar = r'''
?start: value
?value: object
| NON_SEPARATOR_STRING?
object : "\"" [pair (_SEPARATOR pair)*] "\""
pair : NON_SEPARATOR_STRING [_PAIRTOR] value
NON_SEPARATOR_STRING: /[a-zA-Z0-9\.]+/
_SEPARATOR: /[, ]+/
| ","
_PAIRTOR: " = "
| "="
'''
parser = Lark(_grammar)
st1 = 'key1 key2=value2 key3="key3.1, key3.2=value3.2 , key3.3 = value3.3, key3.4" key4'
tree = parser.parse(f'"{st1}"')
print(tree.pretty())
"""
object
pair
key1
value
pair
key2
value2
pair
key3
object
pair
key3.1
value
pair
key3.2
value3.2
pair
key3.3
value3.3
pair
key3.4
value
pair
key4
value
"""
Then you can write your own Transformer to transform this tree into your desired data type.

How to remove \r\n from language prices?

When I run the code, the result contains \r\n followed by spaces. I have tried to remove the \r\n from the result, but it didn't work. This is the code; please check it out.
def parse_subtitles(self, response):
items = FetchingItem()
Arabic_price = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
Chinese_price = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()
names_list = ['Arabic_price', 'Chinese_price']
for names in names_list:
result = [re.sub('\r\n\s+', ' ', text) for text in names]
items['Arabic_price'] = Arabic_price
items['Chinese_price'] = Chinese_price
yield items
I'm not sure exactly what you want, but this code works:
def parse_subtitles(self, response):
    """Yield one item with the whitespace-normalised price texts.

    Each price is extracted as a list of text nodes; every run of
    whitespace (including "\\r\\n" and the spaces after it) inside a node
    is collapsed to a single space.
    """
    results = {}
    results['Arabic_price'] = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
    results['Chinese_price'] = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()
    # The item has to be created before it is filled in and yielded
    # afterwards; without these two steps `items` is an undefined name.
    items = FetchingItem()
    for name in ('Arabic_price', 'Chinese_price'):
        # \s already covers \r and \n, so r'\s+' equals r'[\r\n\s]+'.
        items[name] = [re.sub(r'\s+', ' ', text) for text in results[name]]
    yield items

Aggregating values in one column by their corresponding value in another from two files

I had a question regarding summing the multiple values of duplicate keys into one key with the aggregate total. For example:
1:5
2:4
3:2
1:4
Very basic but I'm looking for an output that looks like:
1:9
2:4
3:2
In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3).
I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as:
Britney Spears (289) 2393140. Any help or input would be so appreciated.
import codecs
#from collections import defaultdict
with codecs.open("artists.dat", encoding = "utf-8") as f:
artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
users = f.readlines()
artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]
artists = {}
for a in artist_list:
artistID, name = a[0], a[1]
artists[artistID] = name
grouped_user_stats = {}
for u in user_stats_list:
userID, artistID, weight = u
grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
for artistID, weight in u:
grouped_user_stats.groupby('artistID')['weight'].sum()
print(grouped_user_stats.groupby('artistID')['weight'].sum())
#if userID not in grouped_user_stats:
#grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
#else:
#if artistID not in grouped_user_stats[userID]:
#grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
#else:
#grouped_user_stats[userID][artistID]['plays'] += 1
#print('this never happens')
#print(grouped_user_stats)
how about:
import codecs
from collections import defaultdict

# Read both tab-separated files; the first line of each is skipped below
# as a header.
with codecs.open("artists.dat", encoding="utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding="utf-8") as f:
    users = f.readlines()

# Map "artist id" -> "artist name" from the first two columns of artists.dat.
artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
user_stats_list = [x.strip().split('\t') for x in users][1:]

# Sum the weights per artist. Columns are userID, artistID, weight, so the
# grouping key is u[1]; keying on u[0] would total the plays per user.
grouped_user_stats = defaultdict(int)
for u in user_stats_list:
    grouped_user_stats[u[1]] += int(u[2])

# Re-key the totals as "<artist name> (<artist id>)" for display.
grouped_user_stats = dict(
    ("%s (%s)" % (artist_repo.get(k, "Unknown artist"), k), v)
    for k, v in grouped_user_stats.items()
)

# .items() and print() with a single pre-formatted string run unchanged on
# both Python 2 and Python 3.
for k, v in grouped_user_stats.items():
    print("%s %s" % (k, v))

How do I update a defaultdict using a loop?

I created a dictionary
TTR_of_speakers = defaultdict(int)
TTR_of_speakers = {}
and I wrote a program that does some computations that results in a value called TTR.
for record in cwrecords:
if record["speaker_name"] == "Joe Biden":
text = record["text"]
processed = nlp(text)
textw = [t.lemma_ for t in processed]
N += len(textw)
total_types |= set(textw)
V = len(total_types)
TTR = float(V)/float(N)
How do I put the TTR value into the dictionary with the key as Joe Biden?

Learning Python: Store values in dict from stdout

How can I do the following in Python:
I have a command output that outputs this:
Datexxxx
Clientxxx
Timexxx
Datexxxx
Client2xxx
Timexxx
Datexxxx
Client3xxx
Timexxx
And I want to work this in a dict like:
Client:(date,time), Client2:(date,time) ...
After reading the data into a string subject, you could do this:
import re

# `subject` is the command output, already read into one string.
d = {}
# The pattern must be a raw string: in verbose mode (?x) literal whitespace
# inside the pattern is ignored, so \r and \n have to reach the regex engine
# as escape sequences rather than as real CR/LF characters.
for match in re.finditer(
        r"""(?mx)
        ^Date([^\r\n]*)\r?\n
        Client\d*([^\r\n]*)\r?\n
        Time([^\r\n]*)""",
        subject):
    # group 1: text after "Date"; group 2: text after "Client<digits>";
    # group 3: text after "Time". [^\r\n]* keeps a trailing CR out of the
    # captured values when the input uses CRLF line endings.
    d[match.group(2)] = (match.group(1), match.group(3))
How about something like:
rows = {}
thisrow = []
for line in output.split('\n'):
if line[:4].lower() == 'date':
thisrow.append(line)
elif line[:6].lower() == 'client':
thisrow.append(line)
elif line[:4].lower() == 'time':
thisrow.append(line)
elif line.strip() == '':
rows[thisrow[1]] = (thisrow[0], thisrow[2])
thisrow = []
print rows
Assumes a trailing newline, no spaces before lines, etc.
What about using a dict with tuples?
Create a dictionary and add the entries:
dict = {}
dict['Client'] = ('date1','time1')
dict['Client2'] = ('date2','time2')
Accessing the entries:
dict['Client']
>>> ('date1','time1')

Categories