If and else statement in Google Places API script in Python

I am attempting to write a script that loops through list items and queries the Google Places API.
The problem is that some of the queries will return no results, while others will.
The query results are gathered into lists. For every query that returns no results, I would like to insert a 'no results' string into the list.
This is the script I have so far (the API key is fake):
companies = ['company A', 'company B', 'company C']

#create list items to store API search results
google_name = []
place_id = []
formatted_address = []

#function to find company id and address from company names
def places_api_id():
    api_key = 'AIzaSyAKCp1kN0cHvO7t_NlqMagergrghhehtsrht'
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json'
    #replace spaces within list items with %20
    company_replaced = company.replace(' ', '%20')
    final_url = url + '?query=' + company_replaced + '&key=' + api_key
    json_obj = urllib2.urlopen(final_url)
    data = json.loads(json_obj)
    #if no results, insert 'no results'
    if data['status'] == 'ZERO RESULTS':
        google_name.append('no results')
        place_id.append('no results')
        formatted_address('no results')
    #otherwise, insert the result into list
    else:
        for item in data['results']:
            google_name.append(item['name'])
            place_id.append(item['place_id'])
            formatted_address.append(item['formatted_address'])

#run the script
for company in companies:
    places_api_id()
Unfortunately, when I run the script, Python produces the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-159-eadf5f84e27f> in <module>()
1 for company in companies:
----> 2 places_api_id()
3
<ipython-input-153-f0e25b871a0e> in places_api_id()
6 final_url = url + '?query=' + company_replaced +'&key=' + api_key
7 json_obj = urllib2.urlopen(final_url)
----> 8 data = json.loads(json_obj)
9 if data['status'] == 'ZERO RESULTS':
10 google_name.append('no results')
/usr/lib/python2.7/json/__init__.pyc in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
336 parse_int is None and parse_float is None and
337 parse_constant is None and object_pairs_hook is None and not kw):
--> 338 return _default_decoder.decode(s)
339 if cls is None:
340 cls = JSONDecoder
/usr/lib/python2.7/json/decoder.pyc in decode(self, s, _w)
364
365 """
--> 366 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
367 end = _w(s, end).end()
368 if end != len(s):
TypeError: expected string or buffer
I would really appreciate your help and advice on how to get this script working; I've been staring at it for hours.
Thank you
Kamil
UPDATE
I am now looping the following list through the script:
companies = ['MARINE AND GENERAL MUTUAL LIFE ASSURANCE SOCIETY',
'KENTSTONE PROPERTIES LIMITED',
'ASHFORD CATTLE MARKET COMPANY LIMITED(THE)',
'ORIENTAL GAS COMPANY, LIMITED(THE)',
'BRITISH INDIA STEAM NAVIGATION COMPANY LIMITED',
'N & C BUILDING PRODUCTS LIMITED',
'UNION MARINE AND GENERAL INSURANCE COMPANY LIMITED,(THE)',
'00000258 LIMITED',
'METHODIST NEWSPAPER COMPANY LIMITED',
'LONDON AND SUBURBAN LAND AND BUILDING COMPANY LIMITED(THE)']
After I run the script, this is what the Google Places API returns in the google_name list:
[u'The Ashford Cattle Market Co Ltd',
u'Orient Express Hotels',
u'British-India Steam-Navigation Co Ltd',
u'N-Of-One, Inc.',
u'In-N-Out Burger',
u'In-N-Out Burger Distribution Center',
u"Wet 'n Wild Orlando",
u'In-N-Out Burger',
u'Alt-N Technologies (MDaemon)',
u'Model N Inc',
u"Pies 'n' Thighs",
u"Bethany Women's Center",
u"Jim 'N Nick's Bar-B-Q",
u"Steak 'n Shake",
u'New Orleans Ernest N. Morial Convention Center',
u"Jim 'N Nick's Bar-B-Q",
u"Jim 'N Nick's Bar-B-Q",
u"Jim 'N Nick's Bar-B-Q",
u'Theatre N at Nemours',
u'Model N',
u"Jim 'N Nick's Bar-B-Q",
u'Memphis Rock n Soul Museum',
u"Eat'n Park - Squirrel Hill",
u'Travelers',
u'American General Life Insurance Co',
u'258 Ltd Rd',
u'The Limited',
u'258, New IPCL Rd',
u'London Metropolitan Archives',
u'Hampstead Garden Suburb Trust Ltd']
The majority of the company names returned by Google are not even on the companies list, and there are also many more of them. I am really confused now.

The error is not at the if line, but before it.
json_obj is a file-like object, not a string, so you have to use json.load:
data = json.load(json_obj)
PS: if the status is not what you expect, you can just test whether data['results'] is empty:
import json
import urllib2
from collections import namedtuple

API_KEY = 'AIzaSyAKCp1kN0cHvO7t_NlqMagergrghhehtsrht'
URL = 'https://maps.googleapis.com/maps/api/place/textsearch/json?query={q}&key={k}'

Place = namedtuple("Place", "google_name,place_id,formatted_address")

#function to find company id and address from company names
def places_api_id(company):
    places = []
    url = URL.format(q=urllib2.quote(company), k=API_KEY)
    json_obj = urllib2.urlopen(url)
    data = json.load(json_obj)
    if not data['results']:
        places.append(Place("no results", "no results", "no results"))
    else:
        for item in data['results']:
            places.append(Place(item['name'], item['place_id'], item['formatted_address']))
    return places

companies = ['company A', 'company B', 'company C']
places = []
for company in companies:
    places.extend(places_api_id(company))
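Note also that urllib2.quote percent-encodes characters that the plain space replacement in the original script leaves alone. In particular, an unescaped '&' inside a company name ends the query parameter early, which would explain many of the unrelated matches in the UPDATE above. A quick check (using one of the company names from the UPDATE):
import urllib2
print urllib2.quote('N & C BUILDING PRODUCTS LIMITED')
# N%20%26%20C%20BUILDING%20PRODUCTS%20LIMITED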

Related

Pickling a scraped list with BS4 vs Preset Data issue (why is it not working the same?)

I am trying to save this scraped data to a file (pickle it), but I cannot figure out why I cannot pickle it with this code:
url = "https://www.imdb.com/list/ls016522954/?ref_=nv_tvv_dvd"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
html_soup = BeautifulSoup(webpage, 'html5lib')
dvdNames = html_soup.find_all("div", class_="lister-item-content")
for dvd in dvdNames:
dvdArray.append(dvd.a.string)
viewtitles = input("Finished!, do you want to view the DVD titles? (Y/N): ")
if viewtitles == "y".casefold():
num = 1
for name in dvdArray:
print(""+ str(num) + " - " + name)
num += 1
elif viewtitles == "n".casefold():
print("Not Showing TItles!")
else:
print("that is not an option!")
saveToFile = input("Do you want to save / update the data? (Y/N): ")
if saveToFile == "y".casefold():
with open("IMDBDVDNames.dat", "wb") as f:
pickle.dump(dvdArray, f)
continue
elif saveToFile == "n".casefold():
print("Data Not Saved!")
continue
else:
print("That's not one of the option!")
continue
I've tried adding sys.setrecursionlimit(1000000) and it doesn't make a difference (FYI); I am getting the error "maximum recursion depth exceeded while pickling an object". But when I run this code:
import pickle

testarray = []
if input("1 or 2?: ") == "1":
    testarray = ['1917', 'Onward', 'The Hunt', 'The Invisible Man', 'Human Capital', 'Dolittle', 'Birds of Prey: And the Fantabulous Emancipation of One Harley Quinn', 'The Gentlemen', 'Bloodshot', 'The Way Back', 'Clemency', 'The Grudge', 'I Still Believe', 'The Song of Names', 'Treadstone', 'Vivarium', 'Star Wars: Episode IX - The Rise of Skywalker', 'The Current War', 'Downhill', 'The Call of the Wild', 'Resistance', 'Banana Split', 'Bad Boys for Life', 'Sonic the Hedgehog', 'Mr. Robot', 'The Purge', 'VFW', 'The Other Lamb', 'Slay the Dragon', 'Clover', 'Lazy Susan', 'Rogue Warfare: The Hunt', 'Like a Boss', 'Little Women', 'Cats', 'Madam Secretary', 'Escape from Pretoria', 'The Cold Blue', 'The Night Clerk', 'Same Boat', 'The 420 Movie: Mary & Jane', 'Manou the Swift', 'Gold Dust', 'Sea Fever', 'Miles Davis: Birth of the Cool', 'The Lost Husband', 'Stray Dolls', 'Mortal Kombat Legends: Scorpions Revenge', 'Just Mercy', 'The Righteous Gemstones', 'Criminal Minds', 'Underwater', 'Final Kill', 'Green Rush', 'Butt Boy', 'The Quarry', 'Abe', 'Bad Therapy', 'Yip Man 4', 'The Last Full Measure', 'Looking for Alaska', 'The Turning', 'True History of the Kelly Gang', 'To the Stars', 'Robert the Bruce', 'Papa, sdokhni', 'The Rhythm Section', 'Arrow', 'The Assistant', 'Guns Akimbo', 'The Dark Red', 'Dreamkatcher', 'Fantasy Island', 'The Etruscan Smile', "A Nun's Curse", 'Allagash']
    with open("test.dat", "wb") as f:
        pickle.dump(testarray, f)
else:
    with open("test.dat", "rb") as f:
        testarray = pickle.load(f)
    print(testarray)
with what should be the exact same information (I did a print(dvdArray) and got the list that way, FYI), it WILL let me pickle it when I do it like that.
Can someone let me know why, and how I can fix it?
I know I'm scraping the data from a website and converting it into a list, but I cannot figure out what is causing the error in example 1 vs example 2.
Any help would be appreciated.
Thanks,
lttlejiver
In case anyone is curious, I added strip() when appending to dvdArray and it worked!
dvdArray.append(dvd.a.string.strip())
BeautifulSoup objects are highly recursive, and so are very difficult to pickle. When you do dvdArray.append(dvd.a.string), dvd.a.string is not a Python string but a bs4.element.NavigableString - one of these complex objects. By using strip(), you're actually converting the bs4.element.NavigableString to a Python string, which is easily pickled. The same would be true if you used dvd.a.getText().
For future reference, when pickling, always remember to convert (where possible) BeautifulSoup objects to simpler python objects.
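A minimal sketch of the difference (the markup here is made up for illustration):
from bs4 import BeautifulSoup
import pickle

soup = BeautifulSoup("<div><a> The Movie Title </a></div>", "html.parser")
nav = soup.a.string          # bs4.element.NavigableString, tied to the whole parse tree
plain = str(nav).strip()     # plain Python str, trivially picklable
pickle.dumps(plain)          # fine; pickling nav itself is what can blow the recursion limit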

Scraping HTML data from a page that adds new tables as you scroll

I'm trying to learn HTML scraping for a project; I'm using Python and lxml. I've been successful so far in getting the data I needed, but now I have another problem. On the site that I'm scraping (op.gg), when you scroll down it adds new tables with more information. When I run my script (below) it only gets the first 50 entries and nothing more. My question is how I can get at least the first 200 names on the page, if that is even possible.
from lxml import html
import requests
page = requests.get('https://na.op.gg/ranking/ladder/')
tree = html.fromstring(page.content)
names = tree.xpath('//td[@class="SummonerName Cell"]/a/text()')
print (names)
Borrowing the idea from Pedro: https://na.op.gg/ranking/ajax2/ladders/start=number will give you 50 records starting from number, for example:
https://na.op.gg/ranking/ajax2/ladders/start=0 get (1-50),
https://na.op.gg/ranking/ajax2/ladders/start=50 get (51-100),
https://na.op.gg/ranking/ajax2/ladders/start=100 get (101-150),
https://na.op.gg/ranking/ajax2/ladders/start=150 get (151-200),
etc....
After that you can change your scraping code, since this page is different from your original one. Suppose you want to get the first 200 names; here is the amended code:
from lxml import html
import requests

start_url = 'https://na.op.gg/ranking/ajax2/ladders/start='
names_200 = list()
for i in [0, 50, 100, 150]:
    dest_url = start_url + str(i)
    page = requests.get(dest_url)
    tree = html.fromstring(page.content)
    names_50 = tree.xpath('//a[not(@target) and not(@onclick)]/text()')
    names_200.extend(names_50)
print names_200
print len(names_200)
Output:
[u'am\xc3\xa9liorer', 'pireaNn', 'C9 Ray', 'P1 Pirean', 'Pobelter', 'mulgokizary', 'consensual clown', 'Jue VioIe Grace', 'Deep Learning', 'Keegun', 'Free Papa Chau', 'C9 Gun', 'Dhokla', 'Arrowlol', 'FOX Brandini', 'Jurassiq', 'Win or Learn', 'Acoldblazeolive', u'R\xc3\xa9venge', u'M\xc3\xa9ru', 'Imaqtpie', 'Rohammers', 'blaberfish2', 'qldurtms', u'd\xc3\xa0wolfsclaw', 'TheOddOrange', 'PandaTv 656826', 'stuntopolis', 'Butler Delta', 'P1 Shady', 'Entranced', u'Linsan\xc3\xadty', 'Ablazeolive', 'BukZacH', 'Anivia Kid', 'Contractz', 'Eitori', 'MistyStumpey', 'Prodedgy', 'Splitting', u'S\xc4\x99b B\xc4\x99rnal', 'N For New York', 'Naeun', '5tunt', 'C9 Winter', 'Doubtfull', 'MikeYeung', 'Rikara', u'RAH\xc3\x9cLK', ' Sudzzi', 'joong ki song', 'xWeixin VinLeous', 'rhubarbs', u'Ch\xc3\xa0se', 'XueGao', 'Erry', 'C9 EonYoung', 'Yeonbee', 'M ckg', u'Ari\xc3\xa1na Lovato', 'OmarGod', 'Wiggily', 'lmpactful', 'Str1fe', 'LL Stylish', '2017', 'FlREFLY', 'God Fist Monk', 'rWeiXin VinLeous', 'Grigne', 'fantastic ad', 'bobqinX', 'grigne 1v10', 'Sora1', 'Juuichi san ', 'duoking2', 'SandPaperX', 'Xinthus', 'TwichTv CoMMa', 'xFSN Rin', 'UBC CJ', 'PotIuck', 'DarkWingsForSale', 'Get After lt', 'old chicken', u'\xc4\x86ris', 'VK Deemo', 'Pekin Woof', 'YIlIlIlIlI', 'RiceLegend', 'Chimonaa1', 'DJNDREE5', u'CloudNguy\xc3\xa9n', 'Diamond 1 Khazix', 'dawolfsfang', 'clg imaqtpie69', 'Pyrites', 'Lava', 'Rathma', 'PieCakeLord', 'feed l0rd', 'Eygon', 'Autolycus1', 'FateFalls 20xx', 'nIsHIlEzHIlA', 'C9 Sword', 'TET Fear', 'a very bad time', u'Jur\xc3\xa1ssiq', 'Ginormous Noob', 'Saskioo', 'S D 2 NA', 'C9 Smoothie', 'dufTlalgkqtlek', 'Pants are Dragon', u'H\xc3\xb3llywood', 'Serenitty', 'Waggily ', 'never lucky help', u'insan\xc3\xadty', 'Joyul', 'TheeBrandini', 'FoTheWin', 'RyuShoryu', 'avi is me', 'iKingVex', 'PrismaI', 'An Obese Panda', 'TdollasAKATmoney', 'feud999', 'Soligo', 'Steel I', 'SNH48 Ruri', 'BillyBoss1', 'Annie Bot', 'Descraton', 'Cris', 'GrayHoves', 'RegisZZ', 'lron Pyrite', 'Zaion', 'Allorim', 't d', u'Alex \xc3\xafch', 'godrjsdnd', 'DOUBLELIFTSUCKS', 'John Mcrae', u'Lobo Solitari\xc3\xb3', 'MikeYeunglol', 'i xo u', 'NoahMost', 'Vsionz', 'GladeGleamBright', 'Tuesdayy', 'RealDarkness', 'CC Dean', 'na mid xd LFT', 'Piggy Kitten', 'Abou222', 'TG Strompest', 'MooseHater', 'Day after Day', 'bat8man', 'AxAxAxAxA', 'Boyfriend', 'EvanRL', '63FYWJMbam', 'Fiftygbl', u'Br\xc4\xb1an', 'MlST', u'S\xc3\xb8ren Bjerg', 'FOX Akaadian', '5word', 'tchikou', 'Hakuho', 'Noobkiller291', 'woxiangwanAD', 'Doublelift', 'Jlaol', u'z\xc3\xa3ts', 'Cow Goes Mooooo', u'Be Like \xc3\x91e\xc3\xb8\xc3\xb8', 'Liquid Painless', 'Zergy', 'Huge Rooster', 'Shiphtur', 'Nikkone', 'wiggily1', 'Dylaran', u'C\xc3\xa0m', 'byulbit', 'dirtybirdy82', 'FreeXpHere', u'V\xc2\xb5lcan', 'KaNKl', 'LCS Actor 4', 'bie sha wo', 'Mookiez', 'BKSMOOTH', 'FatMiku']
200
BTW, you could expand it based on your requirements.
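A minimal sketch of that expansion, reusing the variables from the code above (N = 500 is an arbitrary example):
# step the offset in pages of 50 instead of hard-coding [0, 50, 100, 150]
N = 500
for i in range(0, N, 50):    # 0, 50, 100, ..., 450
    dest_url = start_url + str(i)
    page = requests.get(dest_url)
    tree = html.fromstring(page.content)
    names_200.extend(tree.xpath('//a[not(@target) and not(@onclick)]/text()'))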

Parsing a python list converted from JSON

I am trying to use the Google Custom Search API to search through US news outlets. Using the code example provided by Google, you end up with a Python dictionary containing a multitude of other dictionaries and lists. The tags listed next to "res" in the meta function are the values I am trying to access for each article.
import os.path
import csv
from lxml import html
from googleapiclient.discovery import build

def newslist():
    '''
    Uses google custom search to search 20 US news sources for gun control articles,
    and converts info into python dictionary.
    in - none
    out - res: JSON formatted search results
    '''
    service = build("customsearch", "v1",
                    developerKey="key")
    res = service.cse().list(
        q='query',
        cx='searchid',
    ).execute()
    return res

def meta(res, doc_count):
    '''
    Finds necessary meta-data of all articles. Avoids collections, such as those found on Huffington Post and New York Times.
    in - res: defined above
    out - meta_csv: csv file with article meta-data
    '''
    row1 = ['doc_id', 'url', 'title', 'publisher', 'date']
    if res['context']['items']['pagemap']['metatags']['applicationname'] is not 'collection':
        for art in res['context']['items']:
            url = res['context']['items']['link']
            title = res['context']['items']['pagemap']['article']['newsarticle']['headline']
            publisher = res['context']['items']['displayLink'].split('www.' and '.com')
            date = res['context']['items']['pagemap']['newsarticle']['datepublished']
            row2 = [doc_count, url, title, publisher, date]
            with open('meta.csv', 'w', encoding = 'utf-8') as meta:
                csv_file = csv.writer(meta, delimiter = ',', quotechar = '|',
                                      quoting = csv.QUOTE_MINIMAL)
                if doc_count == 1:
                    csv_file.writerow(row1)
                csv_file.writerow(row2)
            doc_count += 1
Here's an example of the printed output from a search query:
{'context': {'title': 'Gun Control articles'},
'items': [{'displayLink': 'www.washingtonpost.com',
'formattedUrl': 'https://www.washingtonpost.com/.../white-resentment-is-fueling-opposition- '
'to-gun-control-researchers-say/',
'htmlFormattedUrl': 'https://www.washingtonpost.com/.../white-resentment-is-fueling-opposition- '
'to-<b>gun</b>-<b>control</b>-researchers-say/',
'htmlSnippet': 'Apr 4, 2016 <b>...</b> Racial prejudice could play '
'a significant role in white Americans' '
'opposition to <br>\n'
'<b>gun control</b>, according to new research from '
'political scientists at ...',
'htmlTitle': 'White resentment is fueling opposition to <b>gun '
'control</b>, researchers say',
'kind': 'customsearch#result',
'link': 'https://www.washingtonpost.com/news/wonk/wp/2016/04/04/white-resentment-is-fueling-opposition-to-gun-control-researchers-say/',
'pagemap': {'cse_image': [{'src': 'https://img.washingtonpost.com/rf/image_1484w/2010-2019/WashingtonPost/2015/10/03/Others/Images/2015-10-03/Botsford_gunshow1004_15_10_03_41831443897980.jpg'}],
'cse_thumbnail': [{'height': '183',
'src': 'https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSXtMnfm_GHkX3d2dOWgmto3rFjmhzxV8InoPao1tBuiBrEWsDMz4WDKcPB',
'width': '275'}],
'metatags': [{'apple-itunes-app': 'app-id=938922398, '
'app-argument=https://www.washingtonpost.com/news/wonk/wp/2016/04/04/white-resentment-is-fueling-opposition-to-gun-control-researchers-say/',
'article:author': 'https://www.facebook.com/chrisingraham',
'article:publisher': 'https://www.facebook.com/washingtonpost',
'author': 'https://www.facebook.com/chrisingraham',
'fb:admins': '1513210492',
'fb:app_id': '41245586762',
'news_keywords': 'guns, gun control, '
'racial resentment, '
'white people',
'og:description': 'Some white gun owners '
'"understand '
"'freedom' in a very "
'particular way."',
'og:image': 'https://img.washingtonpost.com/rf/image_1484w/2010-2019/WashingtonPost/2015/10/03/Others/Images/2015-10-03/Botsford_gunshow1004_15_10_03_41831443897980.jpg',
'og:site_name': 'Washington Post',
'og:title': 'White resentment is fueling '
'opposition to gun control, '
'researchers say',
'og:type': 'article',
'og:url': 'https://www.washingtonpost.com/news/wonk/wp/2016/04/04/white-resentment-is-fueling-opposition-to-gun-control-researchers-say/',
'referrer': 'unsafe-url',
'twitter:card': 'summary_large_image',
'twitter:creator': '#_cingraham',
'viewport': 'width=device-width, '
'initial-scale=1.0, '
'user-scalable=yes, '
'minimum-scale=0.5, '
'maximum-scale=2.0'}],
'newsarticle': [{'articlebody': 'People look at '
'handguns during the '
"Nation's Gun Show in "
'Chantilly, Va. in '
'October 2015. (Photo '
'by Jabin Botsford/The '
'Washington Post) '
'Racial prejudice '
'could play a '
'significant role in '
'white...',
'datepublished': '2016-04-04T11:46-500',
'description': 'Some white gun owners '
'"understand '
"'freedom' in a very "
'particular way."',
'headline': 'White resentment is '
'fueling opposition to '
'gun control, researchers '
'say',
'image': 'https://img.washingtonpost.com/rf/image_1484w/2010-2019/WashingtonPost/2015/10/03/Others/Images/2015-10-03/Botsford_gunshow1004_15_10_03_41831443897980.jpg',
'mainentityofpage': 'True',
'url': 'https://www.washingtonpost.com/news/wonk/wp/2016/04/04/white-resentment-is-fueling-opposition-to-gun-control-researchers-say/'}],
'person': [{'name': 'Christopher Ingraham'}]},
'snippet': 'Apr 4, 2016 ... Racial prejudice could play a '
"significant role in white Americans' opposition to \n"
'gun control, according to new research from political '
'scientists at\xa0...',
'title': 'White resentment is fueling opposition to gun control, '
'researchers say'},
I understand that I could basically write a for loop, but I'm wondering if there is an easier, less code-intensive way of accessing this data for each desired value: URL, title, publisher, and date.
Why don't you use the json module?
import json
s = ... # Your JSON text
result = json.loads(s)
result will be a dict or a list, depending on your JSON.
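That said, the value returned by service.cse().list(...).execute() in newslist() should already be a parsed dict, so the fields can be pulled straight out of the structure printed above. A minimal sketch (assuming res has that structure; note that pagemap entries such as 'newsarticle' are lists):
rows = []
for item in res.get('items', []):
    rows.append([item['link'],                                         # url
                 item['title'],                                        # title
                 item['displayLink'],                                  # publisher domain
                 item['pagemap']['newsarticle'][0]['datepublished']])  # date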

Python regex to capture a comma-delimited list of items

I have a list of weather forecasts that start with a similar prefix that I'd like to remove. I'd also like to capture the city names:
Some Examples:
If you have vacation or wedding plans in Phoenix, Tucson, Flagstaff,
Salt Lake City, Park City, Denver, Estes Park, Colorado Springs,
Pueblo, or Albuquerque, the week will...
If you have vacation or wedding plans for Miami, Jacksonville, Macon,
Charlotte, or Charleston, expect a couple systems...
If you have vacation or wedding plans in Pittsburgh, Philadelphia,
Atlantic City, Newark, Baltimore, D.C., Richmond, Charleston, or
Dover, expect the week...
The strings start with a common prefix "If you have vacation or wedding plans in" and the last city has "or" before it. The list of cities is of variable length.
I've tried this:
>>> text = 'If you have vacation or wedding plans in NYC, Boston, Manchester, Concord, Providence, or Portland'
>>> re.search(r'^If you have vacation or wedding plans in ((\b\w+\b), ?)+ or (\w+)', text).groups()
('Providence,', 'Providence', 'Portland')
>>>
I think I'm pretty close, but obviously it's not working. I've never tried to capture a variable number of items before; any guidance would be greatly appreciated.
Alternative solution here (probably just for sharing and educational purposes).
If you were to solve it with nltk, it would be called a Named Entity Recognition problem. Using the snippet based on nltk.chunk.ne_chunk_sents(), provided here:
import nltk

def extract_entity_names(t):
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

sample = "If you have vacation or wedding plans in Phoenix, Tucson, Flagstaff, Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, Pueblo, or Albuquerque, the week will..."
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
entity_names = []
for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))
print entity_names
It prints exactly the desired result:
['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'Albuquerque']
Here is my approach: use the csv module to parse the lines (I assume they are in a text file named data.csv; please change this to suit your situation). After parsing each line:
Discard the last cell; it is not a city name
Remove the 'If ...' prefix from the first cell
Remove the 'or ' prefix from the last cell (which used to be next-to-last)
Here is the code:
import csv

def cleanup(row):
    new_row = row[:-1]
    new_row[0] = new_row[0].replace('If you have vacation or wedding plans in ', '')
    new_row[0] = new_row[0].replace('If you have vacation or wedding plans for ', '')
    new_row[-1] = new_row[-1].replace('or ', '')
    return new_row

if __name__ == '__main__':
    with open('data.csv') as f:
        reader = csv.reader(f, skipinitialspace=True)
        for row in reader:
            row = cleanup(row)
            print row
Output:
['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'Albuquerque']
['Miami', 'Jacksonville', 'Macon', 'Charlotte', 'Charleston']
['Pittsburgh', 'Philadelphia', 'Atlantic City', 'Newark', 'Baltimore', 'D.C.', 'Richmond', 'Charleston', 'Dover']
import re
s = "If you have vacation or wedding plans for Miami, Jacksonville, Macon, Charlotte, or Charleston, expect a couple systems"
p = re.compile(r"If you have vacation or wedding plans (in|for) ((\w+, )+)or (\w+)")
m = p.match(s)
print m.group(2) # output: Miami, Jacksonville, Macon, Charlotte,
cities = m.group(2).split(", ") # cities = ['Miami', 'Jacksonville', 'Macon', 'Charlotte', '']
cities[-1] = m.group(4) # add the city after or
print cities # cities = ['Miami', 'Jacksonville', 'Macon', 'Charlotte', 'Charleston']
Each city can be matched by the pattern (\w+, ), and the city after 'or' by (\w+);
the captured cities can then be split on ', '.
By the way, since the pattern is applied to a lot of data, it is preferable to work with the compiled pattern object.
PS: the word that comes after 'plans' can be 'for' or 'in', according to the examples you provided.
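The reason the original attempt returned only 'Providence' is that a repeated capturing group remembers just its final match. A variant of the same capture-then-split idea that also keeps multi-word names like 'Salt Lake City' together (a sketch, not robust against every sentence shape):
import re

s = ("If you have vacation or wedding plans in Phoenix, Tucson, Flagstaff, "
     "Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, "
     "Pueblo, or Albuquerque, the week will...")
# capture the whole comma-separated run in ONE group, then split it
m = re.match(r"If you have vacation or wedding plans (?:in|for) ((?:[\w. ]+, )+)or ([\w. ]+)", s)
cities = m.group(1).rstrip(', ').split(', ') + [m.group(2)]
print cities
# ['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver',
#  'Estes Park', 'Colorado Springs', 'Pueblo', 'Albuquerque']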
How about this
>>> text = 'If you have vacation or wedding plans for Phoenix, Tucson, Flagstaff, Salt Lake City, Park City, Denver, Estes Park, Colorado Springs, Pueblo, or Albuquerque, the week will'
>>> match = re.search(r'^If you have vacation or wedding plans (in?|for?) ([\w+ ,]+)',text).groups()[1].split(", ")
Output
>>> match
['Phoenix', 'Tucson', 'Flagstaff', 'Salt Lake City', 'Park City', 'Denver', 'Estes Park', 'Colorado Springs', 'Pueblo', 'or Albuquerque', 'the week will']

Append items to dictionary Python

I am trying to write a function in Python that opens a file and parses it into a dictionary. I am trying to make the first item in each list block the key for the corresponding entry in the dictionary data. Each value is then supposed to be the rest of the list block, less the first item. For some reason, though, when I run the following function, it parses incorrectly. I have provided the output below. How would I be able to parse it as stated above? Any help would be greatly appreciated.
Function:
def parseData() :
    filename="testdata.txt"
    file=open(filename,"r+")
    block=[]
    for line in file:
        block.append(line)
        if line in ('\n', '\r\n'):
            album=block.pop(1)
            data[block[1]]=album
            block=[]
    print data
Input:
Bob Dylan
1966 Blonde on Blonde
-Rainy Day Women #12 & 35
-Pledging My Time
-Visions of Johanna
-One of Us Must Know (Sooner or Later)
-I Want You
-Stuck Inside of Mobile with the Memphis Blues Again
-Leopard-Skin Pill-Box Hat
-Just Like a Woman
-Most Likely You Go Your Way (And I'll Go Mine)
-Temporary Like Achilles
-Absolutely Sweet Marie
-4th Time Around
-Obviously 5 Believers
-Sad Eyed Lady of the Lowlands
Output:
{'-Rainy Day Women #12 & 35\n': '1966 Blonde on Blonde\n',
'-Whole Lotta Love\n': '1969 II\n', '-In the Evening\n': '1979 In Through the Outdoor\n'}
You can use groupby to group the data, using the empty lines as delimiters, and a defaultdict to handle repeated keys, extending each entry with the rest of the values from each val returned by groupby after extracting the key/first element.
from itertools import groupby
from collections import defaultdict

d = defaultdict(list)

with open("file.txt") as f:
    for k, val in groupby(f, lambda x: x.strip() != ""):
        # if k is True we have a section
        if k:
            # get key "k" which is the first line
            # from each section, val will be the remaining lines
            k, *v = val
            # add or add to the existing key/value pairing
            d[k].extend(map(str.rstrip, v))

from pprint import pprint as pp
pp(d)
Output:
{'Bob Dylan\n': ['1966 Blonde on Blonde',
'-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or Later)',
'-I Want You',
'-Stuck Inside of Mobile with the Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
"-Most Likely You Go Your Way (And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands'],
'Led Zeppelin\n': ['1979 In Through the Outdoor',
'-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl",
'1969 II',
'-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home']}
For Python 2 the unpacking syntax is slightly different:
with open("file.txt") as f:
for k, val in groupby(f, lambda x: x.strip() != ""):
if k:
k, v = next(val), val
d[k].extend(map(str.rstrip, v))
If you want to keep the newlines, remove the map(str.rstrip, ...).
If you want the album and songs separately for each artist:
from itertools import groupby
from collections import defaultdict

d = defaultdict(lambda: defaultdict(list))

with open("file.txt") as f:
    for k, val in groupby(f, lambda x: x.strip() != ""):
        if k:
            k, alb, songs = next(val), next(val), val
            d[k.rstrip()][alb.rstrip()] = list(map(str.rstrip, songs))

from pprint import pprint as pp
pp(d)
{'Bob Dylan': {'1966 Blonde on Blonde': ['-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or '
'Later)',
'-I Want You',
'-Stuck Inside of Mobile with the '
'Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
'-Most Likely You Go Your Way '
"(And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands']},
'Led Zeppelin': {'1969 II': ['-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home'],
'1979 In Through the Outdoor': ['-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl"]}}
I guess this is what you want?
Even if this is not the format you wanted, there are a few things you might learn from the answer:
use with for file handling
nice to have:
PEP8 compliant code, see http://pep8online.com/
a shebang
numpydoc
if __name__ == '__main__'
And SE does not like a list being continued by code...
#!/usr/bin/env python
"""Parse text files with songs, grouped by album and artist."""


def add_to_data(data, block):
    """
    Parameters
    ----------
    data : dict
    block : list

    Returns
    -------
    dict
    """
    artist = block[0]
    album = block[1]
    songs = block[2:]
    if artist in data:
        data[artist][album] = songs
    else:
        data[artist] = {album: songs}
    return data


def parseData(filename='testdata.txt'):
    """
    Parameters
    ----------
    filename : string
        Path to a text file.

    Returns
    -------
    dict
    """
    data = {}
    with open(filename) as f:
        block = []
        for line in f:
            line = line.strip()
            if line == '':
                data = add_to_data(data, block)
                block = []
            else:
                block.append(line)
        data = add_to_data(data, block)
    return data


if __name__ == '__main__':
    data = parseData()
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(data)
which gives:
{ 'Bob Dylan': { '1966 Blonde on Blonde': [ '-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or Later)',
'-I Want You',
'-Stuck Inside of Mobile with the Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
"-Most Likely You Go Your Way (And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands']},
'Led Zeppelin': { '1969 II': [ '-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home'],
'1979 In Through the Outdoor': [ '-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl"]}}
