Scraping data for company details - Python

I am trying to scrape the company name, postcode, phone number, and web address from:
https://www.matki.co.uk/matki-dealers/ I am finding it difficult because the information is only shown after clicking a region on the page. If anyone could help it would be much appreciated. I am very new to both Python and especially to scraping!
!pip install beautifulsoup4
!pip install urllib3
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = "https://www.matki.co.uk/matki-dealers/"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

I guess this is what you wanted to do (you can then write the result to a file or a database, or parse it and use it directly):
import requests
from bs4 import BeautifulSoup

URL = "https://www.matki.co.uk/matki-dealers/"
page = requests.get(URL)

# parse HTML
soup = BeautifulSoup(page.content, "html.parser")

# extract the HTML results
results = soup.find(class_="dealer-region")
company_elements = results.find_all("article")

# loop through the results and extract the wanted information
for company_element in company_elements:
    # some cleanup before printing the info:
    company_info = company_element.getText(separator=u', ').replace('Find out more »', '')
    # the results ..
    print(company_info)
Output:
ESP Bathrooms & Interiors, Queens Retail Park, Queens Street, Preston, PR1 4HZ, 01772 200400, www.espbathrooms.co.uk
Paul Scarr & Son Ltd, Supreme Centre, Haws Hill, Lancaster Road A6, Carnforth, LA5 9DG, 01524 733788,
Stonebridge Interiors, 19 Main Street, Ponteland, NE20 9NH, 01661 520251, www.stonebridgeinteriors.com
Bathe Distinctive Bathrooms, 55 Pottery Road, Wigan, WN3 5AA, www.bathe-showroom.co.uk
Draw A Bath Ltd, 68 Telegraph Road, Heswall, Wirral, CH60 7SG, 0151 342 7100, www.drawabath.co.uk
Acaelia Home Design, Unit 4 Fence Avenue Industrial Estate, Macclesfield, Cheshire, SK10 1LT, 01625 464955, www.acaeliahomedesign.co.uk
...
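If you want to save the result instead of printing it, here is a minimal sketch that writes each dealer's info to a CSV file (the file name dealers.csv and the one-column layout are assumptions, not part of the original answer):

import csv
import requests
from bs4 import BeautifulSoup

URL = "https://www.matki.co.uk/matki-dealers/"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
results = soup.find(class_="dealer-region")

rows = []
for company_element in results.find_all("article"):
    company_info = company_element.getText(separator=u', ').replace('Find out more »', '')
    # each dealer's combined info goes into a single cell;
    # splitting it into name/postcode/phone fields would need site-specific parsing
    rows.append([company_info])

with open("dealers.csv", "w", newline="", encoding="utf-8") as f:  # "dealers.csv" is an example name
    csv.writer(f).writerows(rows)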

Related

How to open scraped links one by one automatically using Python?

So here is my situation: let's say you search on eBay for "Motorola DynaTAC 8000x". The bot that I built scrapes all the links of the listings. My goal now is to make it open those scraped links one by one.
I think something like that would be possible using loops, but I am not sure how to do it. Thanks in advance!
Here is the code of the bot:
import requests
from bs4 import BeautifulSoup

url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Motorola+DynaTAC+8000x&_sacat=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")

listings = soup.select("li a")
for a in listings:
    link = a["href"]
    if link.startswith("https://www.ebay.com/itm/"):
        print(link)
To get information from each link you can do:
import requests
from bs4 import BeautifulSoup

url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Motorola+DynaTAC+8000x&_sacat=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")

listings = soup.select("li a")
for a in listings:
    link = a["href"]
    if link.startswith("https://www.ebay.com/itm/"):
        s = BeautifulSoup(requests.get(link).content, "lxml")
        price = s.select_one('[itemprop="price"]')
        print(s.h1.text)
        print(price.text if price else "-")
        print(link)
        print("-" * 80)
Prints:
...
Details about  MOTOROLA DYNATAC 8100L- BRICK CELL PHONE VINTAGE RETRO RARE MUSEUM 8000X
GBP 555.00
https://www.ebay.com/itm/393245721991?hash=item5b8f458587:g:c7wAAOSw4YdgdvBt
--------------------------------------------------------------------------------
Details about  MOTOROLA DYNATAC 8100L- BRICK CELL PHONE VINTAGE RETRO RARE MUSEUM 8000X
GBP 555.00
https://www.ebay.com/itm/393245721991?hash=item5b8f458587:g:c7wAAOSw4YdgdvBt
--------------------------------------------------------------------------------
Details about  Vintage Pulsar Extra Thick Brick Cell Phone Has Dynatac 8000X Display
US $3,000.00
https://www.ebay.com/itm/163814682288?hash=item26241daeb0:g:sTcAAOSw6QJdUQOX
--------------------------------------------------------------------------------
...
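If you open many listings in a row like this, it is worth pausing between requests so you do not hammer the site. A minimal sketch of the same loop with a delay (the 1-second pause is an arbitrary choice, not something the original answer specifies):

import time
import requests
from bs4 import BeautifulSoup

url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=Motorola+DynaTAC+8000x&_sacat=0"
r = requests.get(url)
soup = BeautifulSoup(r.content, features="lxml")

for a in soup.select("li a"):
    link = a.get("href", "")
    if link.startswith("https://www.ebay.com/itm/"):
        s = BeautifulSoup(requests.get(link).content, "lxml")
        print(s.h1.text)
        print(link)
        time.sleep(1)  # polite delay between listing requests; 1 second is arbitrary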

Recursive Web Scraping Pagination

I'm trying to scrape some real estate articles from the following website:
https://lifebridgecapital.com/podcast/
I manage to get the links I need, but I am struggling with the pagination on the page. I'm trying to scrape every link under each category: 'building relationships', 'building your team', 'capital raising', etc. Some of the category pages have pagination and some do not. I tried the following code, but it only gives me the links from page 2.
from requests_html import HTMLSession

def tag_words_links(url):
    global _session
    _request = _session.get(url)
    tags = _request.html.find('a.tag-cloud-link')
    links = []
    for link in tags:
        links.append({
            'Tags': link.find('a', first=True).text,
            'Links': link.find('a', first=True).attrs['href']
        })
    return links

def parse_tag_links(link):
    global _session
    _request = _session.get(link)
    articles = []
    try:
        next_page = _request.html.find('link[rel="next"]', first=True).attrs['href']
        _request = _session.get(next_page)
        article_links = _request.html.find('h3 a')
        for article in article_links:
            articles.append(article.find('a', first=True).attrs['href'])
    except:
        _request = _session.get(link)
        article_links = _request.html.find('h3 a')
        for article in article_links:
            articles.append(article.find('a', first=True).attrs['href'])
    return articles

if __name__ == '__main__':
    _session = HTMLSession()
    url = 'https://lifebridgecapital.com/podcast/'
    links = tag_words_links(url)
    print(parse_tag_links('https://lifebridgecapital.com/tag/multifamily/'))
To print the title of every article under each tag and on every page under that tag, you can use this example:
import requests
from bs4 import BeautifulSoup

url = "https://lifebridgecapital.com/podcast/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

tag_links = [a["href"] for a in soup.select(".tagcloud a")]
for link in tag_links:
    while True:
        print(link)
        print("-" * 80)
        soup = BeautifulSoup(requests.get(link).content, "html.parser")
        for title in soup.select("h3 a"):
            print(title.text)
        print()
        next_link = soup.select_one("a.next")
        if not next_link:
            break
        link = next_link["href"]
Prints:
...
https://lifebridgecapital.com/tag/multifamily/
--------------------------------------------------------------------------------
WS890: Successful Asset Classes In The Current Market with Jerome Maldonado
WS889: How To Avoid A $1,000,000 Mistake with Hugh Odom
WS888: Value-Based On BRRRR VS Cap Rate with John Stoeber
WS887: Slow And Steady Still Wins The Race with Nicole Pendergrass
WS287: Increase Your NOI by Converting Units to Short Term Rentals with Michael Sjogren
WS271: Investment Strategies To Survive An Economic Downturn with Vinney Chopra
WS270: Owning a Construction Company Creates More Value with Abraham Ng’hwani
WS269: The Impacts of Your First Deal with Kyle Mitchell
WS260: Structuring Deals To Get The Best Return On Investment with Jeff Greenberg
WS259: Capital Raising For Newbies with Bryan Taylor
https://lifebridgecapital.com/tag/multifamily/page/2/
--------------------------------------------------------------------------------
WS257: Why Ground Up Development is the Best Investment with Sam Bates
WS256: Mobile Home Park Investing: The Real Deal with Jefferson Lilly
WS249: Managing Real Estate Paperwork Successfully with Krista Testani
WS245: Multifamily Syndication with Venkat Avasarala
WS244: Passive Investing In Real Estate with Kay Kay Singh
WS243: Getting Started In Real Estate Brokerage with Tyler Chesser
WS213: Data Analytics In Real Estate with Raj Tekchandani
WS202: Ben Leybovich and Sam Grooms on The Advantages Of A Partnership In Real Estate Business
WS199: Financial Freedom Through Real Estate Investing with Rodney Miller
WS197: Loan Qualifications: How The Whole Process Works with Vinney Chopra
https://lifebridgecapital.com/tag/multifamily/page/3/
--------------------------------------------------------------------------------
WS172: Real Estate Syndication with Kyle Jones
...
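The same while-loop pattern generalizes: request the page, collect what you need, then follow a.next until it is gone. Here is a sketch that collects the titles into a dictionary keyed by tag URL instead of printing them (the dictionary structure is my choice, not part of the original answer):

import requests
from bs4 import BeautifulSoup

url = "https://lifebridgecapital.com/podcast/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
tag_links = [a["href"] for a in soup.select(".tagcloud a")]

titles_by_tag = {}
for link in tag_links:
    tag_url = link  # remember the first page of the tag to use as the key
    titles = []
    while True:
        soup = BeautifulSoup(requests.get(link).content, "html.parser")
        titles += [a.text for a in soup.select("h3 a")]
        next_link = soup.select_one("a.next")
        if not next_link:
            break
        link = next_link["href"]
    titles_by_tag[tag_url] = titles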

Scraping citation text from PubMed search results with BeautifulSoup and Python?

So I'm attempting to scrape the citations in AMA format for every article in a PubMed search. The following code is just intended to get the citation data from the first article.
import requests
import xlsxwriter
from bs4 import BeautifulSoup
URL = 'https://pubmed.ncbi.nlm.nih.gov/?term=infant+formula&size=200'
response = requests.get(URL)
html_soup = BeautifulSoup(response.text, 'html5lib')
article_containers = html_soup.find_all('article', class_ = 'labs-full-docsum')
first_article = article_containers[0]
citation_text = first_article.find('div', class_ = 'docsum-wrap').find('div', class_ = 'result-actions-bar').div.div.find('div', class_ = 'content').div.div.text
print(citation_text)
The script returns a blank line, even though when I inspect the source through Google Chrome, the text is clearly visible within that "div".
Does this have something to do with JavaScript, and if so, how do I fix it?
This script will get all citations in "AMA" format from the URL provided:
import json
import requests
from bs4 import BeautifulSoup

URL = 'https://pubmed.ncbi.nlm.nih.gov/?term=infant+formula&size=200'
response = requests.get(URL)
html_soup = BeautifulSoup(response.text, 'html5lib')

for article in html_soup.select('article'):
    print(article.select_one('.labs-docsum-title').get_text(strip=True, separator=' '))
    citation_id = article.input['value']
    data = requests.get('https://pubmed.ncbi.nlm.nih.gov/{citation_id}/citations/'.format(citation_id=citation_id)).json()
    # uncomment this to print all data:
    # print(json.dumps(data, indent=4))
    print(data['ama']['orig'])
    print('-' * 80)
Prints:
Review of Infant Feeding: Key Features of Breast Milk and Infant Formula .
Martin CR, Ling PR, Blackburn GL. Review of Infant Feeding: Key Features of Breast Milk and Infant Formula. Nutrients. 2016;8(5):279. Published 2016 May 11. doi:10.3390/nu8050279
--------------------------------------------------------------------------------
Prebiotics in infant formula .
Vandenplas Y, De Greef E, Veereman G. Prebiotics in infant formula. Gut Microbes. 2014;5(6):681-687. doi:10.4161/19490976.2014.972237
--------------------------------------------------------------------------------
Effects of infant formula composition on long-term metabolic health.
Lemaire M, Le Huërou-Luron I, Blat S. Effects of infant formula composition on long-term metabolic health. J Dev Orig Health Dis. 2018;9(6):573-589. doi:10.1017/S2040174417000964
--------------------------------------------------------------------------------
Selenium in infant formula milk.
He MJ, Zhang SQ, Mu W, Huang ZW. Selenium in infant formula milk. Asia Pac J Clin Nutr. 2018;27(2):284-292. doi:10.6133/apjcn.042017.12
--------------------------------------------------------------------------------
... and so on.
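Since your original script already imports xlsxwriter, here is a sketch of how the same loop could write the citations to a spreadsheet instead of printing them (the file name citations.xlsx and the two-column layout are my assumptions):

import requests
import xlsxwriter
from bs4 import BeautifulSoup

URL = 'https://pubmed.ncbi.nlm.nih.gov/?term=infant+formula&size=200'
html_soup = BeautifulSoup(requests.get(URL).text, 'html5lib')

workbook = xlsxwriter.Workbook('citations.xlsx')  # example file name
worksheet = workbook.add_worksheet()

for row, article in enumerate(html_soup.select('article')):
    title = article.select_one('.labs-docsum-title').get_text(strip=True, separator=' ')
    citation_id = article.input['value']
    data = requests.get('https://pubmed.ncbi.nlm.nih.gov/{}/citations/'.format(citation_id)).json()
    worksheet.write(row, 0, title)                # column A: article title
    worksheet.write(row, 1, data['ama']['orig'])  # column B: AMA citation

workbook.close()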

Extracting title from link in Python (Beautiful Soup)

I am new to Python and I'm looking to extract the title from a link. So far I have the following but have hit a dead end:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')

books = soup.find("section")
book_list = books.find_all(class_="product_pod")
tonight = book_list[0]
for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.find('a')
    print(price)
    print(title.contents[0])
To extract the title from the links, you can use the title attribute.
For example:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')

for a in soup.select('h3 > a'):
    print(a['title'])
Prints:
A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red
The Dirty Little Secrets of Getting Your Dream Job
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
The Black Maria
Starving Hearts (Triangular Trade Trilogy, #1)
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Rip it Up and Start Again
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Olio
Mesaerion: The Best Science Fiction Stories 1800-1849
Libertarianism for Beginners
It's Only the Himalayas
You can also use this:
import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')

books = soup.find("section")
book_list = books.find_all(class_="product_pod")
for book in book_list:
    price = book.find(class_="price_color").get_text()
    title = book.select_one('a img')['alt']
    print(title)
Output:
A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red...
By just modifying your existing code, you can use the alt text, which contains the book titles in your example:
print(title.contents[0].attrs["alt"])
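To keep each title together with its price, you can combine either approach into a list of dictionaries; a minimal sketch (the dictionary keys are my choice):

import requests
from bs4 import BeautifulSoup

page = requests.get("http://books.toscrape.com/")
soup = BeautifulSoup(page.content, 'html.parser')

books = []
for book in soup.find_all(class_="product_pod"):
    books.append({
        "title": book.h3.a["title"],  # the title attribute of the product link
        "price": book.find(class_="price_color").get_text(),
    })

print(books[0])  # e.g. {'title': 'A Light in the Attic', 'price': '£51.77'}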

Get House Price data

I am trying to grab the house price along with the address, and hopefully other relevant data (bedrooms?). I have got the following so far. Using Google Chrome's element inspection I can see that there is an element, but if I search for it I don't get the address.
Any thoughts?
import requests
from bs4 import BeautifulSoup

query = 'http://www.realestate.com.au/buy/with-2-bedrooms-in-epping%2c+nsw+2121/list-1?maxBeds=2&source=refinements'
resp = requests.get(query)
soup = BeautifulSoup(resp.text, "html.parser")
ads = soup.findAll("div", {"id": "searchResultsTbl"})
If you need to get the address, use this:
import requests
from bs4 import BeautifulSoup

query = 'http://www.realestate.com.au/buy/with-2-bedrooms-in-epping%2c+nsw+2121/list-1?maxBeds=2&source=refinements'
resp = requests.get(query)
soup = BeautifulSoup(resp.text, "html.parser")
ads = soup.find("div", {"class": "vcard"})
print(ads.h2.a.text)
Output:
61 Mobbs Lane, Epping, NSW 2121
For all addresses, use this:
soup = BeautifulSoup(resp.text, "html.parser")
ads = soup.findAll("div", {"class": "vcard"})
for ad in ads:
    print(ad.h2.a.text)
Output:
61 Mobbs Lane, Epping, NSW 2121
29/3-5 Kandy Avenue, Epping, NSW 2121
5/30 Cambridge Street, Epping, NSW 2121
...
101/239-243 Carlingford Rd, Carlingford, NSW...
65-69 Adderton Road, Telopea, NSW 2117
And for the rooms you can use something like this:
rooms = soup.findAll("li", {"class": "first"})
for room in rooms:
    if room.span:
        print(room.span.text)
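If you want the address and the rooms together, you can iterate per listing and look for the room element inside each vcard; this is a sketch under the assumption that the li with class "first" sits inside the same vcard block (check the live markup, since the structure may differ):

import requests
from bs4 import BeautifulSoup

query = 'http://www.realestate.com.au/buy/with-2-bedrooms-in-epping%2c+nsw+2121/list-1?maxBeds=2&source=refinements'
resp = requests.get(query)
soup = BeautifulSoup(resp.text, "html.parser")

for ad in soup.findAll("div", {"class": "vcard"}):
    address = ad.h2.a.text
    # assumption: the room info lives in a li.first inside the same vcard
    room = ad.find("li", {"class": "first"})
    beds = room.span.text if room and room.span else "?"
    print(address, "-", beds)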
