When I execute this code...
from bs4 import BeautifulSoup
with open("games.html", "r") as page:
doc = BeautifulSoup(page, "html.parser")
titles = doc.select("a.title")
prices = doc.select("span.price-inner")
for game_soup in doc.find_all("div", {"class": "game-options-wrapper"}):
game_ids = (game_soup.button.get("data-game-id"))
for title, price_official, price_lowest in zip(titles, prices[::2], prices[1::2]):
print(title.text + ',' + str(price_official.text.replace('$', '').replace('~', '')) + ',' + str(
price_lowest.text.replace('$', '').replace('~', '')))
The output is...
153356
80011
130187
119003
73502
156474
96592
154207
155123
152790
165013
110837
Call of Duty: Modern Warfare II (2022),69.99,77.05
Red Dead Redemption 2,14.85,13.79
God of War,28.12,22.03
ELDEN RING,50.36,48.10
Cyberpunk 2077,29.99,28.63
EA SPORTS FIFA 23,41.99,39.04
Warhammer 40,000: Darktide,39.99,45.86
Marvels Spider-Man Remastered,30.71,27.07
Persona 5 Royal,37.79,43.32
The Callisto Protocol,59.99,69.41
Need for Speed Unbound,69.99,42.29
Days Gone,15.00,9.01
But I'm trying to get the value next to the other ones on the same line
Expected output:
Call of Duty: Modern Warfare II (2022),69.99,77.05,153356
Red Dead Redemption 2,14.85,13.79,80011
...
Even when adding game_ids to print(), it spams the same game id for each line.
How can I go about resolving this issue?
HTML file: https://jsfiddle.net/m3hqy54x/
I feel like all 3 details (title, price_official, price_lowest) are probably all in a shared container. It would be better to loop through these containers and select the details as sets from each container to make sure the wight prices and titles are being paired up, but I can't tell you how to do that without seeing at least a snippet from (or all of) "games.html"....
Anyway, assuming that '110837\nCall of Duty: Modern Warfare II (2022)' is from the first title here, you can rewrite your last loop as something like:
for z in zip(titles, prices[::2], prices[1::2]):
z, lw = list(z), ''
for i in len(z):
if i == 0: # title
z[0] = ' '.join(w for w in z[0].text.split('\n', 1)[-1] if w)
if '\n' in z[0].text: lw = z[0].text.split('\n', 1)[0]
continue
z[i] = z[i].text.replace('$', '').replace('~', '')
print(','.join(z+[lw]))
Added EDIT: After seeing the html, this is my suggested solution:
for g in doc.select('div[data-container-game-id]'):
gameId = g.get('data-container-game-id')
title = g.select_one('a.title')
if title: title = ' '.join(w for w in title.text.split() if w)
price_official = g.select_one('.price-wrap > div:first-child span.price')
price_lowest = g.select_one('.price-wrap > div:first-child+div span.price')
if price_official:
price_official = price_official.text.replace('$', '').replace('~', '')
if price_lowest:
price_lowest = price_lowest.text.replace('$', '').replace('~', '')
print(', '.join([title, price_official, price_lowest, gameId]))
prints
Call of Duty: Modern Warfare II (2022), 69.99, 77.05, 153356
Red Dead Redemption 2, 14.85, 13.79, 80011
God of War, 28.12, 22.03, 130187
ELDEN RING, 50.36, 48.10, 119003
Cyberpunk 2077, 29.99, 28.63, 73502
EA SPORTS FIFA 23, 41.99, 39.04, 156474
Warhammer 40,000: Darktide, 39.99, 45.86, 96592
Marvel's Spider-Man Remastered, 30.71, 27.07, 154207
Persona 5 Royal, 37.79, 43.32, 155123
The Callisto Protocol, 59.99, 69.41, 152790
Need for Speed Unbound, 69.99, 42.29, 165013
Days Gone, 15.00, 9.01, 110837
Btw, this might look ok for just four values, but if you have a large amount of details that you want to extract, you might want to consider using a function like this.
Related
I am trying to scrape data from:-
https://www.canadapharmacy.com/
below are a few pages that I need to scrape:-
https://www.canadapharmacy.com/products/abilify-tablet
https://www.canadapharmacy.com/products/accolate
https://www.canadapharmacy.com/products/abilify-mt
I need all the information from the page. I wrote the below code:-
base_url = 'https://www.canadapharmacy.com'
data = []
for i in tqdm(range(len(medicine_url))):
r = requests.get(base_url+medicine_url[i])
soup = BeautifulSoup(r.text,'lxml')
# Scraping medicine Name
try:
main_name = (soup.find('h1',{"class":"mn"}).text.lstrip()).rstrip()
except:
main_name = None
try:
sec_name = (soup.find('div',{"class":"product-name"}).find('h3').text.lstrip()).rstrip()
except:
sec_name = None
try:
generic_name = (soup.find('div',{"class":"card product generic strength equal"}).find('div').find('h3').text.lstrip()).rstrip()
except:
generic_name = None
# Description
try:
des1 = soup.find('div',{"class":"answer expanded"}).find_all('p')[1].text
except:
des1 = ''
try:
des2 = soup.find('div',{"class":"answer expanded"}).find('ul').text
except:
des2 = ''
try:
des3 = soup.find('div',{"class":"answer expanded"}).find_all('p')[2].text
except:
des3 = ''
desc = (des1+des2+des3).replace('\n',' ')
#Directions
try:
dir1 = soup.find('div',{"class":"answer expanded"}).find_all('h4')[1].text
except:
dir1 = ''
try:
dir2 = soup.find('div',{"class":"answer expanded"}).find_all('p')[5].text
except:
dir2 = ''
try:
dir3 = soup.find('div',{"class":"answer expanded"}).find_all('p')[6].text
except:
dir3 = ''
try:
dir4 = soup.find('div',{"class":"answer expanded"}).find_all('p')[7].text
except:
dir4 = ''
directions = dir1+dir2+dir3+dir4
#Ingredients
try:
ing = soup.find('div',{"class":"answer expanded"}).find_all('p')[9].text
except:
ing = None
#Cautions
try:
c1 = soup.find('div',{"class":"answer expanded"}).find_all('h4')[3].text
except:
c1 = None
try:
c2 = soup.find('div',{"class":"answer expanded"}).find_all('p')[11].text
except:
c2 = ''
try:
c3 = soup.find('div',{"class":"answer expanded"}).find_all('p')[12].text #//div[#class='answer expanded']//p[2]
except:
c3 = ''
try:
c4 = soup.find('div',{"class":"answer expanded"}).find_all('p')[13].text
except:
c4 = ''
try:
c5 = soup.find('div',{"class":"answer expanded"}).find_all('p')[14].text
except:
c5 = ''
try:
c6 = soup.find('div',{"class":"answer expanded"}).find_all('p')[15].text
except:
c6 = ''
caution = (c1+c2+c3+c4+c5+c6).replace('\xa0','')
#Side Effects
try:
se1 = soup.find('div',{"class":"answer expanded"}).find_all('h4')[4].text
except:
se1 = ''
try:
se2 = soup.find('div',{"class":"answer expanded"}).find_all('p')[18].text
except:
se2 = ''
try:
se3 = soup.find('div',{"class":"answer expanded"}).find_all('ul')[1].text
except:
se3 = ''
try:
se4 = soup.find('div',{"class":"answer expanded"}).find_all('p')[19].text
except:
se4 = ''
try:
se5 = soup.find('div',{"class":"post-author-bio"}).text
except:
se5 = ''
se = (se1 + se2 + se3 + se4 + se5).replace('\n',' ')
for j in soup.find('div',{"class":"answer expanded"}).find_all('h4'):
if 'Product Code' in j.text:
prod_code = j.text
#prod_code = soup.find('div',{"class":"answer expanded"}).find_all('h4')[5].text #//div[#class='answer expanded']//h4
pharma = {"primary_name":main_name,
"secondary_name":sec_name,
"Generic_Name":generic_name,
"Description":desc,
"Directions":directions,
"Ingredients":ing,
"Caution":caution,
"Side_Effects":se,
"Product_Code":prod_code}
data.append(pharma)
But, each page is having different positions for the tags hence not giving correct data. So, I tried:-
soup.find('div',{"class":"answer expanded"}).find_all('h4')
which gives me the output:-
[<h4>Description </h4>,
<h4>Directions</h4>,
<h4>Ingredients</h4>,
<h4>Cautions</h4>,
<h4>Side Effects</h4>,
<h4>Product Code : 5513 </h4>]
I want to create a data frame where the description contains all the information given in the description, directions contain all the information of directions given on the web page.
for i in soup.find('div',{"class":"answer expanded"}).find_all('h4'):
if 'Description' in i.text:
print(soup.find('div',{"class":"answer expanded"}).findAllNext('p'))
but it prints all the after the soup.find('div',{"class":"answer expanded"}).find_all('h4'). but I want only the tags are giving me the description of the medicine and no others.
Can anyone suggest how to do this? Also, how to scrape the rate table from the page as it gives me values in unappropriate fashion?
You can try the next working example:
import requests
from bs4 import BeautifulSoup
import pandas as pd
data = []
r = requests.get('https://www.canadapharmacy.com/products/abilify-tablet')
soup = BeautifulSoup(r.text,"lxml")
try:
card = ''.join([x.get_text(' ',strip=True) for x in soup.select('div.answer.expanded')])
des = card.split('Directions')[0].replace('Description','')
#print(des)
drc = card.split('Directions')[1].split('Ingredients')[0]
#print(drc)
ingre= card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[0]
#print(ingre)
cau=card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[1].split('Side Effects')[0]
#print(cau)
se= card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[1].split('Side Effects')[1]
#print(se)
except:
pass
data.append({
'Description':des,
'Directions':drc,
'Ingredients':ingre,
'Cautions':cau,
'Side Effects':se
})
print(data)
# df = pd.DataFrame(data)
# print(df)
Output:
[{'Description': " Abilify Tablet (Aripiprazole) Abilify (Aripiprazole) is a medication prescribed to treat or manage different conditions, including: Agitation associated with schizophrenia or bipolar mania (injection formulation only) Irritability associated with autistic disorder Major depressive disorder , adjunctive treatment Mania and mixed episodes associated with Bipolar I disorder Tourette's disorder Schizophrenia Abilify works by activating different neurotransmitter receptors located in brain cells. Abilify activates D2 (dopamine) and 5-HT1A (serotonin) receptors and blocks 5-HT2A (serotonin) receptors. This combination of receptor activity is responsible for the treatment effects of Abilify. Conditions like schizophrenia, major depressive disorder, and bipolar disorder are caused by neurotransmitter imbalances in the brain. Abilify helps to correct these imbalances and return the normal functioning of neurons. ", 'Directions': ' Once you are prescribed and buy Abilify, then take Abilify exactly as prescribed by your
doctor. The dose will vary based on the condition that you are treating. The starting dose of Abilify ranges from 2-15 mg once daily, and the recommended dose for most conditions is between 5-15 mg once daily. The maximum dose is 30 mg once daily. Take Abilify with or without food. ', 'Ingredients': ' The active ingredient in Abilify medication is aripiprazole . ', 'Cautions': ' Abilify and other antipsychotic medications have been associated with an increased risk of death in elderly patients with dementia-related psychosis. When combined with other dopaminergic agents, Abilify can increase the risk of neuroleptic malignant syndrome. Abilify can cause metabolic changes and in some cases can induce high blood sugar in people with and without diabetes . Abilify can also weight gain and increased risk of dyslipidemia. Blood glucose should be monitored while taking Abilify. Monitor for low blood pressure and heart rate while taking Abilify; it can cause orthostatic hypertension which may lead to dizziness or fainting. Use with caution in patients with a history of seizures. ', 'Side Effects': ' The side effects of Abilify vary greatly depending
on what condition is being treated, what other medications are being used concurrently, and what dose is being taken. Speak with your doctor or pharmacist for a full list of side effects that apply to you. Some of the most common side effects include: Akathisia Blurred vision Constipation Dizziness Drooling Extrapyramidal disorder Fatigue Headache Insomnia Nausea Restlessness Sedation Somnolence Tremor Vomiting Buy Abilify online from Canada Pharmacy . Abilify can be purchased online with a valid prescription from a doctor. About Dr. Conor Sheehy (Page Author) Dr. Sheehy (BSc Molecular Biology, PharmD) works a clinical pharmacist specializing in cardiology, oncology, and ambulatory care. He’s a board-certified pharmacotherapy specialist (BCPS), and his experience working one-on-one with patients to fine tune their medication and therapy plans for optimal results makes him a valuable subject matter expert for our pharmacy. Read More.... IMPORTANT NOTE: The above information is intended to increase awareness of health information
and does not suggest treatment or diagnosis. This information is not a substitute for individual medical attention and should not be construed to indicate that use of the drug is safe, appropriate, or effective for you. See your health care professional for medical advice and treatment. Product Code : 5513'}]
I'm making a program that counts how many times a band has played a song from a webpage of all their setlists. I have grabbed the webpage and converted all the songs played into one big list so all I wanted to do was see if the song name was in the list and add to a counter but it isn't working and I can't seem to figure out why.
I've tried using the count function instead and that didn't work
sugaree_counter = 0
link = 'https://www.cs.cmu.edu/~mleone/gdead/dead-sets/' + year + '/' + month+ '-' + day + '-' + year + '.txt'
page = requests.get(link)
page_text = page.text
page_list = [page_text.split('\n')]
print(page_list)
This code returns the list:
[['Winterland Arena, San Francisco, CA (1/2/72)', '', "Truckin'", 'Sugaree',
'Mr. Charlie', 'Beat it on Down the Line', 'Loser', 'Jack Straw',
'Chinatown Shuffle', 'Your Love At Home', 'Tennessee Jed', 'El Paso',
'You Win Again', 'Big Railroad Blues', 'Mexicali Blues',
'Playing in the Band', 'Next Time You See Me', 'Brown Eyed Women',
'Casey Jones', '', "Good Lovin'", 'China Cat Sunflower', 'I Know You Rider',
"Good Lovin'", 'Ramble On Rose', 'Sugar Magnolia', 'Not Fade Away',
"Goin' Down the Road Feeling Bad", 'Not Fade Away', '',
'One More Saturday Night', '', '']]
But when I do:
sugaree_counter = int(sugaree_counter)
if 'Sugaree' in page_list:
sugaree_counter += 1
print(str(sugaree_counter))
It will always be zero.
It should add 1 to that because 'Sugaree' is in that list
Your page_list is a list of lists, so you need two for loops to get the pages, you need to do
for page in page_list:
for item in page:
sugaree_counter += 1
Use sum() and list expressions:
sugaree_counter = sum([page.count('Sugaree') for page in page_list])
I've been working on a function which will update two dictionaries (similar authors, and awards they've won) from an open text file. The text file looks something like this:
Brabudy, Ray
Hugo Award
Nebula Award
Saturn Award
Ellison, Harlan
Heinlein, Robert
Asimov, Isaac
Clarke, Arthur
Ellison, Harlan
Nebula Award
Hugo Award
Locus Award
Stephenson, Neil
Vonnegut, Kurt
Morgan, Richard
Adams, Douglas
And so on. The first name is an authors name (last name first, first name last), followed by awards they may have won, and then authors who are similar to them. This is what I've got so far:
def load_author_dicts(text_file, similar_authors, awards_authors):
name_of_author = True
awards = False
similar = False
for line in text_file:
if name_of_author:
author = line.split(', ')
nameA = author[1].strip() + ' ' + author[0].strip()
name_of_author = False
awards = True
continue
if awards:
if ',' in line:
awards = False
similar = True
else:
if nameA in awards_authors:
listawards = awards_authors[nameA]
listawards.append(line.strip())
else:
listawards = []
listawards.append(line.strip()
awards_authors[nameA] = listawards
if similar:
if line == '\n':
similar = False
name_of_author = True
else:
sim_author = line.split(', ')
nameS = sim_author[1].strip() + ' ' + sim_author[0].strip()
if nameA in similar_authors:
similar_list = similar_authors[nameA]
similar_list.append(nameS)
else:
similar_list = []
similar_list.append(nameS)
similar_authors[nameA] = similar_list
continue
This works great! However, if the text file contains an entry with just a name (i.e. no awards, and no similar authors), it screws the whole thing up, generating an IndexError: list index out of range at this part Zname = sim_author[1].strip()+" "+sim_author[0].strip() )
How can I fix this? Maybe with a 'try, except function' in that area?
Also, I wouldn't mind getting rid of those continue functions, I wasn't sure how else to keep it going. I'm still pretty new to this, so any help would be much appreciated! I keep trying stuff and it changes another section I didn't want changed, so I figured I'd ask the experts.
How about doing it this way, just to get the data in, then manipulate the dictionary any ways you want.
test.txt contains your data
Brabudy, Ray
Hugo Award
Nebula Award
Saturn Award
Ellison, Harlan
Heinlein, Robert
Asimov, Isaac
Clarke, Arthur
Ellison, Harlan
Nebula Award
Hugo Award
Locus Award
Stephenson, Neil
Vonnegut, Kurt
Morgan, Richard
Adams, Douglas
And my code to parse it.
award_parse.py
data = {}
name = ""
awards = []
f = open("test.txt")
for l in f:
# make sure the line is not blank don't process blank lines
if not l.strip() == "":
# if this is a name and we're not already working on an author then set the author
# otherwise treat this as a new author and set the existing author to a key in the dictionary
if "," in l and len(name) == 0:
name = l.strip()
elif "," in l and len(name) > 0:
# check to see if recipient is already in list, add to end of existing list if he/she already
# exists.
if not name.strip() in data:
data[name] = awards
else:
data[name].extend(awards)
name = l.strip()
awards = []
# process any lines that are not blank, and do not have a ,
else:
awards.append(l.strip())
f.close()
for k, v in data.items():
print("%s got the following awards: %s" % (k,v))
I was trying to clean up wikitext. Specifically I was trying to remove all the {{.....}} and <..>...</..> in the wikitext. For example, for this wikitext:
"{{Infobox UK place\n|country = England\n|official_name =
Morcombelake\n|static_image_name = Morecombelake from Golden Cap -
geograph.org.uk - 1184424.jpg\n|static_image_caption = Morcombelake as
seen from Golden Cap\n|coordinates =
{{coord|50.74361|-2.85153|display=inline,title}}\n|map_type =
Dorset\n|population = \n|population_ref = \n|shire_district = [[West
Dorset]]\n|shire_county = [[Dorset]]\n|region = South West
England\n|constituency_westminster = West Dorset\n|post_town =
\n|postcode_district = \n|postcode_area = DT\n|os_grid_reference =
SY405938\n|website = \n}}\n'''Morcombelake''' (also spelled
'''Morecombelake''') is a small village near [[Bridport]] in
[[Dorset]], [[England]], within the ancient parish of [[Whitchurch
Canonicorum]]. [[Golden Cap]], part of the [[Jurassic Coast]] World
Heritage Site, is nearby.{{cite
web|url=http://www.nationaltrust.org.uk/golden-cap/|title=Golden
Cap|publisher=National Trust|accessdate=2014-05-04}}\n\n==
References ==\n{{reflist}}\n\n{{West
Dorset}}\n\n\n{{Dorset-geo-stub}}\n[[Category:Villages in
Dorset]]\n\n== External Links
==\n\n*[http://www.goldencapteamofchurches.org.uk/morcombelakechurch.html
Parish Church of St Gabriel]\n\n"
How can I use regular expressions in python to produce output like this:
\n'''Morcombelake''' (also spelled '''Morecombelake''') is a small
village near [[Bridport]] in [[Dorset]], [[England]], within the
ancient parish of [[Whitchurch Canonicorum]]. [[Golden Cap]], part of
the [[Jurassic Coast]] World Heritage Site, is nearby.\n\n==
References ==\n\n\n\n\n\n\n[[Category:Villages in Dorset]]\n\n==
External Links
==\n\n*[http://www.goldencapteamofchurches.org.uk/morcombelakechurch.html
Parish Church of St Gabriel]\n\n
As the tags are nested into each other, you can find and remove them in a loop:
n = 1
while n > 0:
s, n = re.subn('{{(?!{)(?:(?!{{).)*?}}|<[^<]*?>', '', s, flags=re.DOTALL)
s is a string containing the wikitext.
There are no the <...> tags in your example, but they should be removed as well.
I am trying to write a function in python that opens a file and parses it into a dictionary. I am trying to make the first item in the list block the key for each item in the dictionary data. Then each item is supposed to be the rest of the list block less the first item. For some reason though, when I run the following function, it parses it incorrectly. I have provided the output below. How would I be able to parse it like I stated above? Any help would be greatly appreciated.
Function:
def parseData() :
filename="testdata.txt"
file=open(filename,"r+")
block=[]
for line in file:
block.append(line)
if line in ('\n', '\r\n'):
album=block.pop(1)
data[block[1]]=album
block=[]
print data
Input:
Bob Dylan
1966 Blonde on Blonde
-Rainy Day Women #12 & 35
-Pledging My Time
-Visions of Johanna
-One of Us Must Know (Sooner or Later)
-I Want You
-Stuck Inside of Mobile with the Memphis Blues Again
-Leopard-Skin Pill-Box Hat
-Just Like a Woman
-Most Likely You Go Your Way (And I'll Go Mine)
-Temporary Like Achilles
-Absolutely Sweet Marie
-4th Time Around
-Obviously 5 Believers
-Sad Eyed Lady of the Lowlands
Output:
{'-Rainy Day Women #12 & 35\n': '1966 Blonde on Blonde\n',
'-Whole Lotta Love\n': '1969 II\n', '-In the Evening\n': '1979 In Through the Outdoor\n'}
You can use groupby to group the data using the empty lines as delimiters, use a defaultdict for repeated keys extending the rest of the values from each val returned from groupby after extracting the key/first element.
from itertools import groupby
from collections import defaultdict
d = defaultdict(list)
with open("file.txt") as f:
for k, val in groupby(f, lambda x: x.strip() != ""):
# if k is True we have a section
if k:
# get key "k" which is the first line
# from each section, val will be the remaining lines
k,*v = val
# add or add to the existing key/value pairing
d[k].extend(map(str.rstrip,v))
from pprint import pprint as pp
pp(d)
Output:
{'Bob Dylan\n': ['1966 Blonde on Blonde',
'-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or Later)',
'-I Want You',
'-Stuck Inside of Mobile with the Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
"-Most Likely You Go Your Way (And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands'],
'Led Zeppelin\n': ['1979 In Through the Outdoor',
'-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl",
'1969 II',
'-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home']}
For python2 the unpack syntax is slightly different:
with open("file.txt") as f:
for k, val in groupby(f, lambda x: x.strip() != ""):
if k:
k, v = next(val), val
d[k].extend(map(str.rstrip, v))
If you want to keep the newlines remove the map(str.rstrip..
If you want the album and songs separately for each artist:
from itertools import groupby
from collections import defaultdict
d = defaultdict(lambda: defaultdict(list))
with open("file.txt") as f:
for k, val in groupby(f, lambda x: x.strip() != ""):
if k:
k, alb, songs = next(val),next(val), val
d[k.rstrip()][alb.rstrip()] = list(map(str.rstrip, songs))
from pprint import pprint as pp
pp(d)
{'Bob Dylan': {'1966 Blonde on Blonde': ['-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or '
'Later)',
'-I Want You',
'-Stuck Inside of Mobile with the '
'Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
'-Most Likely You Go Your Way '
"(And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands']},
'Led Zeppelin': {'1969 II': ['-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home'],
'1979 In Through the Outdoor': ['-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl"]}}
I guess this is what you want?
Even if this is not the format you wanted, there are a few things you might learn from the answer:
use with for file handling
nice to have:
PEP8 compilant code, see http://pep8online.com/
a shebang
numpydoc
if __name__ == '__main__'
And SE does not like a list being continued by code...
#!/usr/bin/env python
""""Parse text files with songs, grouped by album and artist."""
def add_to_data(data, block):
"""
Parameters
----------
data : dict
block : list
Returns
-------
dict
"""
artist = block[0]
album = block[1]
songs = block[2:]
if artist in data:
data[artist][album] = songs
else:
data[artist] = {album: songs}
return data
def parseData(filename='testdata.txt'):
"""
Parameters
----------
filename : string
Path to a text file.
Returns
-------
dict
"""
data = {}
with open(filename) as f:
block = []
for line in f:
line = line.strip()
if line == '':
data = add_to_data(data, block)
block = []
else:
block.append(line)
data = add_to_data(data, block)
return data
if __name__ == '__main__':
data = parseData()
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(data)
which gives:
{ 'Bob Dylan': { '1966 Blonde on Blonde': [ '-Rainy Day Women #12 & 35',
'-Pledging My Time',
'-Visions of Johanna',
'-One of Us Must Know (Sooner or Later)',
'-I Want You',
'-Stuck Inside of Mobile with the Memphis Blues Again',
'-Leopard-Skin Pill-Box Hat',
'-Just Like a Woman',
"-Most Likely You Go Your Way (And I'll Go Mine)",
'-Temporary Like Achilles',
'-Absolutely Sweet Marie',
'-4th Time Around',
'-Obviously 5 Believers',
'-Sad Eyed Lady of the Lowlands']},
'Led Zeppelin': { '1969 II': [ '-Whole Lotta Love',
'-What Is and What Should Never Be',
'-The Lemon Song',
'-Thank You',
'-Heartbreaker',
"-Living Loving Maid (She's Just a Woman)",
'-Ramble On',
'-Moby Dick',
'-Bring It on Home'],
'1979 In Through the Outdoor': [ '-In the Evening',
'-South Bound Saurez',
'-Fool in the Rain',
'-Hot Dog',
'-Carouselambra',
'-All My Love',
"-I'm Gonna Crawl"]}}