I'm working on a Python Crash Course exercise and tried experimenting with BeautifulSoup.
I tried a few things but couldn't find a solution, so I'm asking here.
How do I get rid of the first output, which isn't a burger name? And how do I get rid of the \n characters and the stray periods?
Here is the code:
from bs4 import BeautifulSoup as bs
import requests
source = requests.get('https://www.mcdonalds.com/us/en-us/full-menu/burgers.html').text
soup = bs(source, 'html.parser')
sandwiches_ordered = []
finished_sandwiches = []

for menu in soup.find_all('div', id='maincatcontent'):
    for burger in soup.find_all('div', class_='categories-item-details'):
        sandwiches_ordered.append(burger.text)

print("We are making all the sandwiches!\n")

while sandwiches_ordered:
    sandwich = sandwiches_ordered.pop()
    finished_sandwiches.append(sandwich)

for sandwich in finished_sandwiches:
    print(f"Here is your {sandwich}.")
And my output is:
We are making all the sandwiches!
Here is your
#{itemName}
#{nutrientValue}
.
Here is your
Hamburger
.
Here is your
Double Cheeseburger
.
Here is your
Cheeseburger
.
Here is your
Quarter Pounder®* with Cheese Bacon
.
Here is your
McDouble®
.
Here is your
Quarter Pounder®* with Cheese Deluxe
.
Here is your
Double Quarter Pounder®* with Cheese
.
Here is your
Quarter Pounder®* with Cheese
.
Here is your
Little Mac™
.
Here is your
Double Big Mac®
.
Here is your
Big Mac®
.
We can use the replace() method to get rid of the newline characters and slice the list (finished_sandwiches[1:]) to get rid of the first output, which is not a burger name. The code below incorporates both fixes:
from bs4 import BeautifulSoup as bs
import requests
source = requests.get('https://www.mcdonalds.com/us/en-us/full-menu/burgers.html').text
soup = bs(source, 'html.parser')
sandwiches_ordered = []
finished_sandwiches = []

for menu in soup.find_all('div', id='maincatcontent'):
    for burger in soup.find_all('div', class_='categories-item-details'):
        sandwiches_ordered.append(burger.text)

print("We are making all the sandwiches!\n")

while sandwiches_ordered:
    sandwich = sandwiches_ordered.pop()
    finished_sandwiches.append(sandwich.replace("\n", ""))

for sandwich in finished_sandwiches[1:]:
    print(f"Here is your {sandwich}.")
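The two fixes can also be seen in isolation on made-up strings (the entries below are stand-ins, not live data from the McDonald's page):

```python
# Stand-ins for the scraped entries: newlines on both sides, and the
# first entry is the template junk that is not a burger name.
scraped = ["\n#{itemName}\n#{nutrientValue}\n", "\nHamburger\n", "\nBig Mac\n"]

# Fix 1: remove the newline characters with str.replace()
cleaned = [s.replace("\n", "") for s in scraped]

# Fix 2: slice off the first entry, which is not a burger name
burgers = cleaned[1:]

for burger in burgers:
    print(f"Here is your {burger}.")
```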
For example (here's the code I'm working on):
from bs4 import BeautifulSoup
from string import digits
import requests
joke_of_the_day = []
a = []
url_joke_of_the_day = "https://www.womansday.com/life/entertainment/a38635408/corny-jokes/"
page_joke_of_the_day = requests.get(url_joke_of_the_day)
soup_joke_of_the_day = BeautifulSoup(page_joke_of_the_day.content, "html.parser")
content_joke_of_the_day = soup_joke_of_the_day.find("div", class_="article-body-content article-body standard-body-content css-z6i669 ewisyje5")
goodcontents_joke_of_the_day = content_joke_of_the_day.find_all("li")
a.append(goodcontents_joke_of_the_day)
#print(a)
for goodcontent_joke_of_the_day in goodcontents_joke_of_the_day:
    joke_of_the_day1 = goodcontent_joke_of_the_day.find("strong")
    joke_of_the_day2 = str(joke_of_the_day1).replace("<strong>", "")
    joke_of_the_day3 = joke_of_the_day2.replace("</strong>", "")
    joke_of_the_day4 = joke_of_the_day3.replace("<br>", "")
    joke_of_the_day5 = joke_of_the_day4.replace("<br/>", "")
    joke_of_the_day.append(joke_of_the_day5)
I'm trying to web scrape jokes for a project I'm working on; however, the responses to the jokes are outside of the <strong> tags. An example:
<li>
<strong> Why did the bay strawberry cry?</strong>
<br>
"His parents were in a jam."
</li>
I was thinking of creating two lists and removing duplicates, but that didn't work. Here's the code to remove duplicates:
for i in a[:]:
    if i in joke_of_the_day:
        a.remove(i)
I'm open to any suggestions; I just need the bold part of the code.
To get the questions and responses, you can use the following example:
import requests
from bs4 import BeautifulSoup
url = "https://www.womansday.com/life/entertainment/a38635408/corny-jokes/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for n, joke in enumerate(soup.select("li:has(strong)"), 1):
    question = joke.strong.text
    response = joke.br.find_next(text=True)
    print(f"Joke nr.{n}")
    print(question)
    print(response)
    print("-" * 80)
Prints:
...
--------------------------------------------------------------------------------
Joke nr.99
What did the tomato say to the other tomato during a race?
"Ketchup."
--------------------------------------------------------------------------------
Joke nr.100
What has four wheels and flies?
A garbage truck.
--------------------------------------------------------------------------------
Joke nr.101
Why didn't the skeleton get a prom date?
He didn't have the guts to ask anyone.
--------------------------------------------------------------------------------
I am new to Python and I'm looking to extract the title from a link. So far I have the following but have hit a dead end:
import requests
from bs4 import BeautifulSoup
page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')
books = soup.find("section")
book_list = books.find_all(class_="product_pod")
tonight = book_list[0]
for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.find('a')
    print(price)
    print(title.contents[0])
To extract the title from a link, you can use the title attribute.
For example:
import requests
from bs4 import BeautifulSoup
page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')
for a in soup.select('h3 > a'):
    print(a['title'])
Prints:
A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red
The Dirty Little Secrets of Getting Your Dream Job
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
The Black Maria
Starving Hearts (Triangular Trade Trilogy, #1)
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Rip it Up and Start Again
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Olio
Mesaerion: The Best Science Fiction Stories 1800-1849
Libertarianism for Beginners
It's Only the Himalayas
You can use this:
import requests
from bs4 import BeautifulSoup
page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')
books = soup.find("section")
book_list = books.find_all(class_="product_pod")
tonight = book_list[0]
for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.select_one('a img')['alt']
    print(title)
Output:
A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red...
By just modifying your existing code, you can use the alt text, which contains the book titles in your example.
print(title.contents[0].attrs["alt"])
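To see why that works, here is a sketch on a tiny made-up snippet mirroring the structure of a book link (not fetched from the site): title.contents[0] is the <img> tag inside the link, and its alt attribute holds the title text.

```python
from bs4 import BeautifulSoup

# Made-up markup mirroring a product_pod link with its cover image
html = '<a href="book.html"><img src="cover.jpg" alt="A Light in the Attic"></a>'
title = BeautifulSoup(html, "html.parser").find('a')

# contents[0] is the <img> element; attrs["alt"] reads its alt attribute
print(title.contents[0].attrs["alt"])  # A Light in the Attic
```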
I'm new to web scraping and am currently trying out this block of code
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
page = requests.get("https://leeweebrothers.com/our-food/lunch-boxes/#")
soup = BeautifulSoup(page.text, "html.parser")
names = soup.find_all('h2') #name of food
rest = soup.find_all('span', {'class' : 'amount'}) # price of food
for div, a in zip(names, rest):
    print(div.text, a.text) # print name / price in same line
It works great except for one problem that I will show in the link below
printing result of 2 for loops in same line
Beside the string "HONEY GLAZED CHICKEN WING" there is a $0.00, which is an outlier returned by the shopping-cart app on the website (it shares the span class='amount').
How would I remove this string and "move up" the other prices so that they line up with and correspond to the names of the food?
Edit: Sample output below
Line1: HONEY GLAZED CHICKEN WING $0.00
Line2: CRISPY CHICKEN LUNCH BOX
Line3: $5.00
Line4: BREADED FISH LUNCH BOX
Line5: $5.00
My desired output would be something like:
Line1: HONEY GLAZED CHICKEN WING $5.00
Line2: CRISPY CHICKEN LUNCH BOX $5.00
I'm looking for a solution that removes the outlying $0.00 and moves the rest of the prices up
I think you might have asked the wrong question. You can eliminate the $0.00 outlier, but your results for the prices still won't match up with the names.
To be sure that your lists of prices and names are in the same order, so they match up, it might be easier to first search for the divs that contain both:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
page = requests.get("https://leeweebrothers.com/our-food/lunch-boxes/#")
soup = BeautifulSoup(page.text, "html.parser")
# all the divs that held the foods had this same style
divs = soup.find_all('div', {'style': 'max-height:580px;'})
names_and_prices = {
    # name: price
    div.find('h2').text: div.find('span', {'class': 'amount'}).text
    for div in divs
}

for name, price in names_and_prices.items():
    print(name, price)
To get the output in the format you mentioned above, you can try the following:
import requests
from bs4 import BeautifulSoup
page = requests.get("https://leeweebrothers.com/our-food/lunch-boxes/#")
soup = BeautifulSoup(page.text, "html.parser")
for items in soup.find_all(class_='product-cat-lunch-boxes'):
    name = items.find("h2").get_text(strip=True)
    price = items.find(class_="amount").get_text(strip=True)
    print(name, price)
Results are like:
HONEY GLAZED CHICKEN WING LUNCH BOX $5.00
CRISPY CHICKEN LUNCH BOX $4.50
BREADED FISH LUNCH BOX $4.50
EGG OMELETTE LUNCH BOX $4.50
FRIED TWO-JOINT WING LUNCH BOX $4.50
Try this:
for div, a in zip(names, rest):
    if a.text.strip() and '$0.00' not in a.text:  # empty strings are False
        print(div.text, a.text)  # print name / price in same line
    else:  # optional
        print('Outlier')  # optional
Keep in mind this will ONLY work for outliers that contain '$0.00' in a.text.
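A slightly more robust sketch (assuming prices always look like "$5.00", which the live page may not guarantee) parses the amount as a number and skips anything that is zero or unparsable, instead of matching the literal string '$0.00':

```python
# Hypothetical scraped price strings; "$0.00" is the shopping-cart outlier
prices = ["$0.00", "$5.00", "$4.50"]

def is_real_price(text):
    """Return True if text parses as a dollar amount greater than zero."""
    try:
        return float(text.strip().lstrip("$")) > 0
    except ValueError:
        return False

real = [p for p in prices if is_real_price(p)]
print(real)  # ['$5.00', '$4.50']
```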
I have this code:
import requests
from bs4 import BeautifulSoup
def posts_spider():
    url = 'http://www.reddit.com/r/nosleep/new/'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for link in soup.findAll('a', {'class': 'title'}):
        href = "http://www.reddit.com" + link.get('href')
        title = link.string
        print(title)
        print(href)
        print("\n")

def get_single_item_data():
    item_url = 'http://www.reddit.com/r/nosleep/new/'
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for rating in soup.findAll('div', {'class': 'score unvoted'}):
        print(rating.string)

posts_spider()
get_single_item_data()
The output is:
My light.. I'm seeing and feeling things.. what's happening?
http://www.reddit.com/r/nosleep/comments/2kw0nu/my_light_im_seeing_and_feeling_things_whats/
Why being the first to move in a new Subdivision is not the most brilliant idea...
http://www.reddit.com/r/nosleep/comments/2kw010/why_being_the_first_to_move_in_a_new_subdivision/
I Am Falling.
http://www.reddit.com/r/nosleep/comments/2kvxvt/i_am_falling/
Heidi
http://www.reddit.com/r/nosleep/comments/2kvrnf/heidi/
I remember everything
http://www.reddit.com/r/nosleep/comments/2kvrjs/i_remember_everything/
To Lieutenant Griffin Stone
http://www.reddit.com/r/nosleep/comments/2kvm9p/to_lieutenant_griffin_stone/
The woman in my room
http://www.reddit.com/r/nosleep/comments/2kvir0/the_woman_in_my_room/
Dr. Margin's Guide to New Monsters: The Guest, or, An Update
http://www.reddit.com/r/nosleep/comments/2kvhe5/dr_margins_guide_to_new_monsters_the_guest_or_an/
The Evil Woman (part 5)
http://www.reddit.com/r/nosleep/comments/2kva73/the_evil_woman_part_5/
Blood for the blood god, The first of many.
http://www.reddit.com/r/nosleep/comments/2kv9gx/blood_for_the_blood_god_the_first_of_many/
An introduction to the beginning of my journey
http://www.reddit.com/r/nosleep/comments/2kv8s0/an_introduction_to_the_beginning_of_my_journey/
A hunter..of sorts.
http://www.reddit.com/r/nosleep/comments/2kv8oz/a_hunterof_sorts/
Void Trigger
http://www.reddit.com/r/nosleep/comments/2kv84s/void_trigger/
What really happened to Amelia Earhart
http://www.reddit.com/r/nosleep/comments/2kv80r/what_really_happened_to_amelia_earhart/
I Used To Be Fine Being Alone
http://www.reddit.com/r/nosleep/comments/2kv2ks/i_used_to_be_fine_being_alone/
The Green One
http://www.reddit.com/r/nosleep/comments/2kuzre/the_green_one/
Elevator
http://www.reddit.com/r/nosleep/comments/2kuwxu/elevator/
Scary story told by my 4 year old niece- The Guy With Really Big Scary Claws
http://www.reddit.com/r/nosleep/comments/2kuwjz/scary_story_told_by_my_4_year_old_niece_the_guy/
Cranial Nerve Zero
http://www.reddit.com/r/nosleep/comments/2kuw7c/cranial_nerve_zero/
Mom's Story About a Ghost Uncle
http://www.reddit.com/r/nosleep/comments/2kuvhs/moms_story_about_a_ghost_uncle/
It snowed.
http://www.reddit.com/r/nosleep/comments/2kutp6/it_snowed/
The pocket watch I found at a store
http://www.reddit.com/r/nosleep/comments/2kusru/the_pocket_watch_i_found_at_a_store/
You’re Going To Die When You Are 23
http://www.reddit.com/r/nosleep/comments/2kur3m/youre_going_to_die_when_you_are_23/
The Customer: Part Two
http://www.reddit.com/r/nosleep/comments/2kumac/the_customer_part_two/
Dimenhydrinate
http://www.reddit.com/r/nosleep/comments/2kul8e/dimenhydrinate/
•
•
•
•
•
12
12
76
4
2
4
6
4
18
2
6
13
5
16
2
2
14
48
1
13
What I want to do is place the matching rating right next to each post, so I can tell instantly how many points each post has, instead of printing the titles and links in one "block" and the rating numbers in another "block".
Thanks in advance for the help!
You can do it in one go by iterating over the div elements with class="thing" (think of it as iterating over posts). For each div, get the link and the rating:
from urllib.parse import urljoin  # urlparse in Python 2
from bs4 import BeautifulSoup
import requests

def posts_spider():
    url = 'http://www.reddit.com/r/nosleep/new/'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    for thing in soup.select('div.thing'):
        link = thing.find('a', {'class': 'title'})
        rating = thing.find('div', {'class': 'score'})
        href = urljoin("http://www.reddit.com", link.get('href'))
        print(link.string, href, rating.string)

posts_spider()
FYI, div.thing is a CSS Selector that matches all divs with class="thing".
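The selector can be tried without hitting reddit at all; here is a minimal sketch on an inline HTML string (made-up markup, not the real page):

```python
from bs4 import BeautifulSoup

# Made-up HTML: two divs carry class="thing", one does not
html = ('<div class="thing">first</div>'
        '<div class="other">skip</div>'
        '<div class="thing">second</div>')
soup = BeautifulSoup(html, "html.parser")

# div.thing matches every div whose class list contains "thing"
matches = [div.text for div in soup.select("div.thing")]
print(matches)  # ['first', 'second']
```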
I am parsing specific information from a website out to a file. Right now the program looks at a webpage, finds the right HTML tag, and parses out its contents. Now I want to further filter these "results".
For example, on the site: http://allrecipes.com/Recipe/Slow-Cooker-Pork-Chops-II/Detail.aspx
I am parsing out the ingredients, which are located in the <div class="ingredients" ...> tag. The parser does the job nicely, but I want to process these results further.
When I run this parser, it removes numbers, symbols, commas, and slashes (\ or /) but leaves all the text. Running it on the website gives results like:
cup olive oil
cup chicken broth
cloves garlic minced
tablespoon paprika
Now I want to process this further by removing stop words like "cup", "cloves", "minced", and "tablespoon", among others. How exactly do I do this? This code is written in Python and I am not very good at it; I am just using this parser to get information that I could enter manually, but I would rather not.
Any help on how to do this in detail would be appreciated! My code is below:
Code:
import urllib2
import BeautifulSoup
def main():
    url = "http://allrecipes.com/Recipe/Slow-Cooker-Pork-Chops-II/Detail.aspx"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup.BeautifulSoup(data)
    ingreds = bs.find('div', {'class': 'ingredients'})
    ingreds = [s.getText().strip('123456789.,/\ ') for s in ingreds.findAll('li')]
    fname = 'PorkRecipe.txt'
    with open(fname, 'w') as outf:
        outf.write('\n'.join(ingreds))

if __name__ == "__main__":
    main()
import urllib2
import BeautifulSoup
import string

badwords = set([
    'cup', 'cups',
    'clove', 'cloves',
    'tsp', 'teaspoon', 'teaspoons',
    'tbsp', 'tablespoon', 'tablespoons',
    'minced'
])

def cleanIngred(s):
    # remove leading and trailing whitespace
    s = s.strip()
    # remove numbers and punctuation from the ends of the string
    s = s.strip(string.digits + string.punctuation)
    # remove unwanted words
    return ' '.join(word for word in s.split() if word not in badwords)

def main():
    url = "http://allrecipes.com/Recipe/Slow-Cooker-Pork-Chops-II/Detail.aspx"
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup.BeautifulSoup(data)
    ingreds = bs.find('div', {'class': 'ingredients'})
    ingreds = [cleanIngred(s.getText()) for s in ingreds.findAll('li')]
    fname = 'PorkRecipe.txt'
    with open(fname, 'w') as outf:
        outf.write('\n'.join(ingreds))

if __name__ == "__main__":
    main()
results in
olive oil
chicken broth
garlic,
paprika
garlic powder
poultry seasoning
dried oregano
dried basil
thick cut boneless pork chops
salt and pepper to taste
I don't know why it left the comma in there; s.strip(string.punctuation) should have taken care of that.
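For what it's worth, the leftover comma can be reproduced without any scraping: str.strip() only removes characters from the ends of a string, so the comma stuck to "garlic," in the middle of the line is never touched. A small sketch on a made-up ingredient line (not the real scraped data) shows the behavior and a per-word fix:

```python
import string

# Subset of the unwanted words from the answer above
badwords = {'cloves', 'minced'}

line = "3 cloves garlic, minced"  # made-up ingredient line
line = line.strip(string.digits + string.punctuation + " ")
# strip() only touched the ends, so line is now "cloves garlic, minced"
# and the mid-string comma survives.

# Original approach: "garlic," != "garlic", so the comma slips through
original = ' '.join(w for w in line.split() if w not in badwords)

# Per-word fix: strip punctuation from each word before comparing
fixed = ' '.join(w.strip(string.punctuation) for w in line.split()
                 if w.strip(string.punctuation) not in badwords)

print(original)  # garlic,
print(fixed)     # garlic
```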