How can I extract a specific portion of an HTML file, for example https://patents.google.com/patent/EP1208209A1/en?oq=medicinal+chemistry?
So far I have used BeautifulSoup to get the text version of the HTML without all the tags, but I would like my code to read only, say, the claims section of the above-mentioned file.
Here you go. I found out that on this site the claims section is an HTML element with its own identifier (an itemprop attribute), which makes things easier. I just collected the section and converted it to a string so you can play with it.
import requests
from bs4 import BeautifulSoup

page = requests.get("https://patents.google.com/patent/EP1208209A1/en?oq=medicinal+chemistry")
soup = BeautifulSoup(page.content, 'html.parser')

# the claims live in a <section itemprop="claims"> element
claim_sect = soup.find_all('section', attrs={"itemprop": "claims"})
print('This is the raw content: \n')
print(str(claim_sect))
print('This is the variable type: \n')
print(str(type(claim_sect)))
str_sect = claim_sect[0]
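If you only want the readable claim text rather than the raw markup, calling get_text() on that section should do it. A minimal sketch building on str_sect from above:

# get_text() strips the tags; separator and strip are optional tidying
claim_text = str_sect.get_text(separator='\n', strip=True)
print(claim_text)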
As far as I can see, there are two divs with class="flex flex-width style-scope patent-result":
soup = BeautifulSoup(sdata, 'html.parser')
mydivs = soup.find_all("div", {"class": "flex flex-width style-scope patent-result"})
div_with_claims = mydivs[1]  # the second of the two divs holds the claims
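Here sdata is the page's HTML source; one way to obtain it, mirroring the first answer (a sketch):

import requests
# fetch the raw HTML so BeautifulSoup has something to parse
sdata = requests.get("https://patents.google.com/patent/EP1208209A1/en?oq=medicinal+chemistry").text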
from bs4 import BeautifulSoup

filename = 'C:/Users/xyz/.ipynb_checkpoints/EP1208209A1.html'
html_file = open(filename, 'r', encoding='utf-8')
source_code = html_file.read()
#print(source_code)
soup = BeautifulSoup(source_code, 'html.parser')
print(soup.get_text())
#mydivs = soup.find_all("div", {"class": "flex flex-width style-scope patent-result"})
#div_with_claims = mydivs[1]
#print(div_with_claims)
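To pull only the claims out of the saved file, the same itemprop selector from the first answer should also work on this local soup (a sketch, assuming the saved page keeps Google Patents' markup):

# the claims section carries itemprop="claims" in the Google Patents markup
claims = soup.find('section', attrs={'itemprop': 'claims'})
if claims is not None:
    print(claims.get_text(separator='\n', strip=True))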
I'm new to Python and need some help. I am trying to scrape the image URLs from this site but can't seem to do so; I only pull up all the HTML. Here is my code.
import requests
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup
import csv

baseurl = 'https://www.thewhiskyexchange.com/'

productlinks = []
for x in range(1, 4):
    r = requests.get(f'https://www.thewhiskyexchange.com/c/316/campbeltown-single-malt-scotch-whisky?pg={x}')
    soup = BeautifulSoup(r.content, 'html.parser')
    tag = soup.find_all('ul', {'class': 'product-grid__list'})
    for items in tag:
        for link in items.find_all('a', href=True):
            productlinks.append(baseurl + link['href'])
#print(len(productlinks))

for items in productlinks:
    r = requests.get(items)
    soup = BeautifulSoup(r.content, 'html.parser')
    name = soup.find('h1', class_='product-main__name').text.strip()
    price = soup.find('p', class_='product-action__price').text.strip()
    imgurl = soup.find('div', class_='product-main__image-container')
    print(imgurl)
And here is the piece of HTML I am trying to scrape from.
<div class="product-card__image-container"><img src="https://img.thewhiskyexchange.com/480/gstob.non1.jpg" alt="Glen Scotia Double Cask Sherry Finish" class="product-card__image" loading="lazy" width="3" height="4">
I would appreciate any help. Thanks.
You need to select the image first, then get its src attribute.
Try this:
imgurl = soup.find('div', class_='product-main__image-container').find('img')['src']
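If some product pages lack that container, the chained find calls will raise an AttributeError on None; a slightly more defensive sketch:

container = soup.find('div', class_='product-main__image-container')
if container is not None:
    img = container.find('img')
    # has_attr guards against an <img> without a src attribute
    if img is not None and img.has_attr('src'):
        print(img['src'])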
I'm not sure if I fully understand what output you are looking for. But if you just want the img source URLs, this might work:
# imgurl = soup.find('div', class_='product-main__image-container')
imgurl = soup.find('img', class_='product-main__image')
imgurl_attribute = imgurl['src']
print(imgurl_attribute)
#https://img.thewhiskyexchange.com/900/gstob.non1.jpg
#https://img.thewhiskyexchange.com/900/gstob.15yov1.jpg
#https://img.thewhiskyexchange.com/900/gstob.18yov1.jpg
#https://img.thewhiskyexchange.com/900/gstob.25yo.jpg
#https://img.thewhiskyexchange.com/900/sets_gst1.jpg
import requests
from bs4 import BeautifulSoup

url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

urls = []
for link in soup.find_all('a'):
    print(link.get('href'))
def chart_spider(max_pages):
    page = 1
    while page >= max_pages:
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {"class": "title"}):
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1

chart_spider(1)
I'm wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 chart and print all the titles for now. Thanks.
Here is a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

URL = 'https://www.officialcharts.com/charts/singles-chart'

def chart_spider():
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)

chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song on the top 100,
this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
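For example, to print them one per line afterwards (using the titles list above):

print('\n'.join(titles))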
You can do it like this.
The song title is present inside a <div> tag whose class name is title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests

url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.
<div id="wordlist" style="display:none;">leave|river|what|try|just|because|cut|now|made|
I just want to get the words. Here's the code I used, but it just gives me wordlist as the output:
import requests
from bs4 import BeautifulSoup
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
text = soup.find("div", attrs={'id': 'wordlist'}).get('id')
print(text)
The code below should output just the text.
text = soup.find("div", attrs={'id': 'wordlist'})
print(text.text)
The problem is your use of get('id'), which returns the id attribute itself. Instead, just print the text from the soup.find result.
result = soup.find("div", attrs={'id': 'wordlist'})
print(result.text)
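Since the words in that div are separated by | characters, you can also split the text into an actual list (a small sketch building on result above):

# turn 'leave|river|what|...' into ['leave', 'river', 'what', ...]
words = result.text.split('|')
print(words)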
from bs4 import BeautifulSoup
import requests

def imdb_spider():
    url = 'http://www.imdb.com/chart/top'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for link in soup.findAll('a', {'class': 'secondaryInfo'}):
        href = link.get('href')
        print(href)

imdb_spider()
I'm trying to get the links of all top-rated movies from IMDb. I'm using PyCharm. The code runs for more than 30 minutes, but I'm not getting any output in my console.
You're correct that there's an element with class secondaryInfo for every movie title, but that's not the a element. If you want to find the links, you have to use a different selector. For example, the following selector will do the trick instead of soup.findAll():
soup.select('td.titleColumn a')
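Put together, a minimal version of your spider using that selector might look like this (a sketch; select() grabs the anchor inside each title column):

from bs4 import BeautifulSoup
import requests

def imdb_spider():
    source_code = requests.get('http://www.imdb.com/chart/top')
    soup = BeautifulSoup(source_code.text, 'html.parser')
    # one <td class="titleColumn"> per movie; the <a> inside holds the link
    for link in soup.select('td.titleColumn a'):
        print(link.get('href'))

imdb_spider()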
The problem is that {'class': 'secondaryInfo'} matches a <span> element, not an <a> element.
So try this:
from bs4 import BeautifulSoup
import requests

def imdb_spider():
    url = 'http://www.imdb.com/chart/top'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    for td in soup.findAll('td', {'class': 'titleColumn'}):
        href = td.find('a').get('href')
        print(href)

imdb_spider()
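Note that these hrefs are relative paths, so if you need absolute URLs you can join them with the site root (a small sketch):

from urllib.parse import urljoin
# href is a relative link taken from the loop above
full_url = urljoin('http://www.imdb.com/', href)
print(full_url)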
Please bear with me, I am quite new to Python but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page, and now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am stuck. I hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup

# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

my_list = []
all_links = soup.find_all("a")
for link in all_links:
    link2 = link["href"]
    my_list.append(link2)

for i in my_list[1:93]:
    print i

# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup:
import lxml
from lxml import html
import requests

page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[@class="LetterGroup"]//a/@href')  # grab all links
print 'Length of all links = ', len(my_list)
my_list is a list consisting of all the links. Now you can use a for loop to scrape information from each page, extracting what you need as in the example below. This is only for the top table.
table_information = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    table_key = tree.xpath('//td[@class="statusHeader"]/text()')
    table_value = tree.xpath('//td[@class="statusText"]/text()') + tree.xpath('//td[@class="statusText"]/a/text()')
    table_information.append(zip([t]*len(table_key), table_key, table_value))
For the table lower down on the page:
table_information_below = []
for t in my_list:
    page_detail = requests.get(t)
    tree = html.fromstring(page_detail.content)
    l1 = tree.xpath('//tr[@class="tableRowPrimary"]/td[@class="StemmerNu"]/text()')
    l2 = tree.xpath('//tr[@class="tableRowSecondary"]/td[@class="StemmerNu"]/text()')
    table_information_below.append([t]+l1+l2)
Hope this helps!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    # then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession

def my_parse_function(html):
    """Use this function to parse each page"""
    soup = BeautifulSoup(html)
    all_paragraphs = soup.find_all('p')
    return all_paragraphs

session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
# .result() blocks until the request finishes; pass the HTML text to the parser
page_results = [my_parse_function(future.result().text) for future in futures]
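Note that requests-futures is a separate package (pip install requests-futures); each session.get() returns a future whose .result() is a normal requests Response.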
This would be my solution to your problem:
import requests
from bs4 import BeautifulSoup

def spider():
    url = "http://www.kmdvalg.dk/main"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('div', {'class': 'LetterGroup'}):
        anc = link.find('a')
        href = anc.get('href')
        print(anc.getText())
        print(href)
        # call a second function, similar to this one, with url set to href
        spider2(href)
        print("\n")

def spider2(linktofollow):
    url = linktofollow
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
        anc = link.find('td')
        print(anc.getText())
    print("\n")

spider()
It's not done; I only get a single element from the table, but you get the idea of how it's supposed to work.
Here is my final code, which works smoothly. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs

f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())

liste = []
alle_links = soup.find_all("a")
for link in alle_links:
    link2 = link["href"]
    liste.append(link2)

for url in liste[1:93]:
    soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
    tds = soup.findAll('td')
    stemmernu = soup.findAll('td', class_='StemmerNu')
    print >> f, tds[5].string, ";", tds[12].string, ";", tds[14].string, ";", tds[16].string, ";", \
        stemmernu[0].string, ";", stemmernu[1].string, ";", stemmernu[2].string, ";", \
        stemmernu[3].string, ";", stemmernu[6].string, ";", stemmernu[8].string, ";", '\r\n'

f.close()