I have the following script which scrapes all the information from a website.
However, when I run it, I get duplicated records of the blogs.
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
blog_topics = []
page = "https://www.bartonassociates.com/blog/"
soup = BeautifulSoup(requests.get(page).content, 'html.parser')
for link in soup.find_all(href=re.compile("/blog/tag")):
    url = link.get('href')
    if '/blog/tag/p' not in urlparse(link.get('href')).path:
        blog_topics.append(url)
    else:
        pass
# VARIABLE TO DEFINE A RANGE BASED ON NO.OF PAGES
pages = np.arange(1)
# DEFINING CUSTOM VARIABLES
title_blognames_links_ = []
author_and_dates_ = []
# LOOP TO RETRIEVE TITLE, BLOG NAMES, LINKS, AUTHORS AND DATE PUBLISHED
for page in pages:
    for blogs in blog_topics:
        blog_url = blogs + '/p' + str(page)
        sleep(randint(2,7))
        soup = BeautifulSoup(requests.get(blog_url).content, 'html.parser')
        # Information on title, blog names and their links
        for h4 in soup.select("h4"):
            for h2 in soup.select("h2"):
                title_blognames_links_.append((h4.get_text(strip=True), h4.a["href"], h2.get_text(strip=True).replace('"',"")[11:]))
        # Information on authors and dates
        for tag in soup.find_all(class_="author"):
            author_and_dates_.append(tag.get_text(strip=True))
I believe it has something to do with the pages = np.arange(1) range I have provided.
P.S. (1) was just a trial; I have also tried (1,17), (1), and (2).
Background: the maximum number of pages for a blog topic is 17, and each topic has roughly 10 blogs per page.
What I am looking for is to get all unique blog information from all the blog topics.
Not sure what I am doing wrong here.
To get all information from all topics, you can first grab all the topic links (you've done that in your code too) and then, for each topic, get all pages and all information (not the other way around):
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.bartonassociates.com/blog"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
topics = [
    a["href"] for a in soup.select('h3:-soup-contains("Blog Topics") + ul a')
]
all_data = []
for t in topics:
    while True:
        soup = BeautifulSoup(requests.get(t).content, "html.parser")
        topic_name = re.search(
            r'"([^"]+)"', soup.select_one("h2").get_text(strip=True)
        ).group(1)
        for entry in soup.select(".blog-entry"):
            title = entry.h4.get_text(strip=True)
            link = entry.a["href"]
            tmp = entry.select_one(".author").get_text(strip=True)
            if tmp:
                author, date = map(
                    str.strip,
                    entry.select_one(".author").get_text(strip=True).split("|"),
                )
            else:
                author, date = "N/A", "N/A"
            all_data.append([topic_name, title, link, author, date])
            print(topic_name, title, link, author, date, sep="\n")
            print()
        t = soup.select_one('a:-soup-contains("View More")')
        if not t:
            break
        t = t["href"]

df = pd.DataFrame(
    all_data, columns=["topic", "title", "link", "author", "date"]
)
print(df)
df.to_csv("data.csv", index=False)
Prints:
Healthcare News and Trends
DO vs. MD: What’s the Difference?
https://www.bartonassociates.com/blog/whats-the-difference-do-md
Tayla Holman September 09, 2021
Healthcare News and Trends
What is “The Great Resignation?”
https://www.bartonassociates.com/blog/what-is-the-great-resignation
Chris Keeley September 02, 2021
Healthcare News and Trends
CME Requirements for Physicians by State
https://www.bartonassociates.com/blog/cme-requirements-for-physicians-by-state
Teresa Otto, MD July 15, 2021
...and so on.
and saves the results to data.csv.
Related
I am trying to scrape numerous companies sites in Python for their news releases.
I figured out I need to use chickennoodle = soup(html_text, 'lxml') instead of chickennoodle = soup(html_text, 'html.parser') for aspx sites. I am still getting the basic urls back like their contact and careers links instead of the actual news article links. When I inspect the website it looks something like:
<a class="module_headline-link" href="/news-and-events/news/news-details/2022/Compugen-to-Release-Second-Quarter-Results-on-Thursday-August-4-2022/default.aspx">Compugen to Release Second Quarter Results on Thursday, August 4, 2022</a>.
On basic HTML sites it works to print all of my_links, and I can filter the links I want with the commented-out lines. I have added a few examples of troubled scrapes and one that works. I assume the ones that don't work have the same problem, probably because I don't understand the intricacies of lxml. My guess is that it can't see the articles for some reason (unlike the plain HTML sites) because the hrefs start with /. Thanks for any help.
COMPANY 1-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://ir.cgen.com/news-and-events/news/default.aspx'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'lxml')
for link in chickennoodle.find_all('a'):
    my_links = link.get('href')
    print(my_links)
    #if str(my_links).startswith("/news-and-events/news/news-details/"):
    #    print(str(full)+my_links)
    #else:
    #    None
COMPANY 2-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://www.meipharma.com/media/press-releases'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'html.parser')
for link in chickennoodle.find_all('a'):
    my_links = link.get('href')
    print(my_links)
    # if str(my_links).startswith(""):
    #     print(str(full)+my_links)
    # else:
    #     None
COMPANY 3-
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = 'https://investor.sierraoncology.com/news-releases/default.aspx'
full = ''
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'lxml')
for link in chickennoodle.find_all('a'):
    my_links = link.get('href')
    print(my_links)
vs. an HTML site that works for my purposes:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
URL = "https://investors.aileronrx.com/index.php/news-releases"
full = "https://investors.aileronrx.com"
ALRNlinks = []
html_text = requests.get(URL).text
chickennoodle = soup(html_text, 'html.parser')
for link in chickennoodle.find_all('a'):
    my_links = link.get('href')
    if str(my_links).startswith("/news-rele"):
        ALRN = str(full) + my_links
        ALRNlinks.append(ALRN)
print(ALRNlinks)
The website in your first example loads its information dynamically, so requests won't see the content that JavaScript pulls in after the page has loaded. You can, however, open Dev Tools - Network tab, see which URLs the JavaScript is accessing, and try to scrape those. For example:
import requests
import pandas as pd
url = 'https://ir.cgen.com/feed/PressRelease.svc/GetPressReleaseList?LanguageId=1&bodyType=0&pressReleaseDateFilter=3&categoryId=1cb807d2-208f-4bc3-9133-6a9ad45ac3b0&pageSize=-1&pageNumber=0&tagList=&includeTags=true&year=2022&excludeSelection=1'
r = requests.get(url)
df = pd.DataFrame(r.json()['GetPressReleaseListResult'])
print(df)
This will print out:
Attachments Body Category DocumentFileSize DocumentFileType DocumentPath ExcludeFromLatest Headline LanguageId LinkToDetailPage ... RevisionNumber SeoName ShortBody ShortDescription Subheadline SubheadlineHtml TagsList ThumbnailPath WorkflowId PressReleaseDate
0 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Release Second Quarter Results on ... 1 /news-and-events/news/news-details/2022/Compug... ... 33221 Compugen-to-Release-Second-Quarter-Results-on-... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... e7b13fbb-ddc7-4955-a9c6-b44e6ab223ec 07/21/2022 07:00:00
1 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Present at Upcoming Industry Confe... 1 /news-and-events/news/news-details/2022/Compug... ... 33213 Compugen-to-Present-at-Upcoming-Industry-Confe... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... 1e5cb121-a9f7-4e1b-86c1-1571065d40b5 06/27/2022 07:00:00
2 [] None PDF https://s26.q4cdn.com/977440944/files/doc_news... False Compugen to Present at Upcoming Investor Confe... 1 /news-and-events/news/news-details/2022/Compug... ... 33202 Compugen-to-Present-at-Upcoming-Investor-Confe... None None None [] https://s26.q4cdn.com/977440944/files/doc_news... 8c004950-09c8-4831-bdfa-25f660afe250 06/01/2022 07:00:00
[...]
You can apply this to your other examples as well.
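If what you ultimately want are the actual article links rather than the contact/careers URLs, the LinkToDetailPage column in that JSON appears to hold site-relative paths, so a rough sketch (an assumption based on the output above, not a tested recipe) is to prefix them with the investor-site root:

import requests
import pandas as pd

url = 'https://ir.cgen.com/feed/PressRelease.svc/GetPressReleaseList?LanguageId=1&bodyType=0&pressReleaseDateFilter=3&categoryId=1cb807d2-208f-4bc3-9133-6a9ad45ac3b0&pageSize=-1&pageNumber=0&tagList=&includeTags=true&year=2022&excludeSelection=1'
r = requests.get(url)
df = pd.DataFrame(r.json()['GetPressReleaseListResult'])

# LinkToDetailPage appears to hold site-relative paths like
# /news-and-events/news/news-details/2022/..., so prefix the site root
article_links = 'https://ir.cgen.com' + df['LinkToDetailPage']
print(article_links.tolist())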
I'm a beginner with Python and trying to learn with a BeautifulSoup web-scraping project.
I'm looking to scrape the record item title, item URL, and purchase date from this URL and export them to a CSV.
I made good progress scraping the title and URL, but I just cannot figure out how to correctly extract the purchase date in my for loop (the purchase_date variable below).
What currently happens is that the purchase date column in the CSV file (the p_date column) just shows blank cells with no text: no error message, just no data getting put into the CSV. Any guidance is much appreciated.
Thank you!!
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.popsike.com/php/quicksearch.php?searchtext=metal+-signed+-promo+-beatles+-zeppelin+-acetate+-test+-sinatra&sortord=aprice&pagenum=1&incldescr=1&sprice=100&eprice=&endfrom=2020&endthru=2020&bidsfrom=&bidsthru=&layout=&flabel=&fcatno="
results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html.parser")
title = []
date = []
URL = []
record_div = soup.find_all('div', class_='col-md-7 add-desc-box')
for container in record_div:
    description = container.a.text
    title.append(description)
    link = container.find('a')
    URL.append(link.get('href'))
    purchase_date = container.find('span', class_='info-row').text
    date.append(purchase_date)
test_data = pd.DataFrame({
    'record_description': title,
    'link': URL,
    'p_date': date
})
test_data['link'] = test_data['link'].str.replace('../','https://www.popsike.com/',1)
print(test_data)
test_data.to_csv('popaaron.csv')
I suggest changing the parser type:
soup = BeautifulSoup(results.text, "html5lib")
And fixing the search expression for the purchase date:
purchase_date = container.select('span.date > b')[0].text.strip(' \t\n\r')
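Put together with the code from the question, that could look roughly like this (a sketch only; it swaps in a select_one('span.date > b') lookup based on the snippet above and guards against rows that have no date):

import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.popsike.com/php/quicksearch.php?searchtext=metal+-signed+-promo+-beatles+-zeppelin+-acetate+-test+-sinatra&sortord=aprice&pagenum=1&incldescr=1&sprice=100&eprice=&endfrom=2020&endthru=2020&bidsfrom=&bidsthru=&layout=&flabel=&fcatno="

results = requests.get(url, headers=headers)
soup = BeautifulSoup(results.text, "html5lib")  # html5lib parser, as suggested above

title, URL, date = [], [], []
for container in soup.find_all('div', class_='col-md-7 add-desc-box'):
    title.append(container.a.text)
    URL.append(container.find('a').get('href'))
    # the purchase date is assumed to sit in a <b> inside span.date
    date_tag = container.select_one('span.date > b')
    date.append(date_tag.text.strip() if date_tag else '')

test_data = pd.DataFrame({'record_description': title, 'link': URL, 'p_date': date})
test_data['link'] = test_data['link'].str.replace('../', 'https://www.popsike.com/', n=1, regex=False)
print(test_data)
test_data.to_csv('popaaron.csv')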
I am using BeautifulSoup to scrape movies on the IMDB website. I was able to scrape the name, genre, duration, and rating of the movies successfully, but I am not able to scrape the description. Looking at the page, the description sits in a "text-muted" element, and that class appears multiple times holding other data such as the rating, genre, and duration. Those other fields have inner classes, so it was easy to scrape them, but the description has no inner class, so pulling data with just "text-muted" returns the other data as well. How do I get only the description of the movies?
Attaching the code for reference:
The sample code which I used to scrape genre is as follows:
genre_tags = data.select(".text-muted .genre")
genre = [g.get_text() for g in genre_tags]
Genre = [item.strip() for item in genre if str(genre)]
print(Genre)
In general, lxml is much better than BeautifulSoup.
import requests
from lxml import html

url = "xxxx"
r = requests.get(url)
tree = html.fromstring(r.text)
rows = tree.xpath('//div[@class="lister-item mode-detail"]')
for row in rows:
    # the description is the <p class="text-muted"> that immediately follows the ratings bar
    description = row.xpath('.//div[@class="ratings-bar"]/following-sibling::p[@class="text-muted"]/text()')[0].strip()
    print(description)
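If you'd rather stay with BeautifulSoup, the same idea can be written with a CSS adjacent-sibling selector (a minimal sketch assuming the same lister-item/ratings-bar markup the XPath above targets):

import requests
from bs4 import BeautifulSoup

url = "xxxx"  # same listing URL as above
soup = BeautifulSoup(requests.get(url).text, "html.parser")

descriptions = []
for item in soup.select("div.lister-item.mode-detail"):
    # the description is the <p class="text-muted"> directly after the ratings bar
    p = item.select_one(".ratings-bar + p.text-muted")
    if p:
        descriptions.append(p.get_text(strip=True))
print(descriptions)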
You can use this :) If it helped you, please upvote my solution. Thanks!
from bs4 import BeautifulSoup
from requests_html import HTMLSession
URL = 'https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm' #url of Most Popular Movies in IMDB
PAGE = HTMLSession().get(URL)
PAGE_BS4 = BeautifulSoup(PAGE.html.html,'html.parser')
MoviesObj = PAGE_BS4.find_all("tbody","lister-list") #get table body of Most Popular Movies
for index in range(len(MoviesObj[0].find_all("td","titleColumn"))):
    a = list(MoviesObj[0].find_all("td","titleColumn")[index])[1]
    href = 'https://www.imdb.com'+a.get('href') #get each link for movie page
    moviepage = HTMLSession().get(href) #request each page of movie
    moviepage = BeautifulSoup(moviepage.html.html,'html.parser')
    title = list(moviepage.find_all('h1')[0].stripped_strings)[0] #parse title
    year = list(moviepage.find_all('h1')[0].stripped_strings)[2] #parse year
    try:
        score = list(moviepage.find_all('div','ratingValue')[0].stripped_strings)[0] #parse score if it is available
    except IndexError:
        score = '-' #if score is not available '-' is filled
    description = list(moviepage.find_all('div','summary_text')[0].stripped_strings)[0] #parse description
    print(f'TITLE: {title} YEAR: {year} SCORE: {score}\nDESCRIPTION:{description}\n')
I am trying to extract drug information from this website: https://www.medindia.net/doctors/drug_information/abacavir.htm.
So far, I am able to extract the drug information (abacavir) for one web page. Now, I need advice on how to construct a loop to extract the drug information I need for other drugs from other web pages on the same website (www.medindia.net). The code is below.
import pandas as pd
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
url = 'https://www.medindia.net/doctors/drug_information/abacavir.htm'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
drug = soup.find(class_='mi-container__fluid')
print(drug)
# whole page contain drug content
items = drug.find_all(class_='report-content drug-widget')
print(items)
# extract drug information from drug content into individual variable
trade_name = items[0].find(class_='drug-content').get_text(strip=True).replace("\n", "")
function = items[1].find(class_='drug-content').get_text(strip=True).replace("\n", "")
Contraindications = items[2].find(class_='drug-content').get_text(strip=True).replace("\n", "")
Dosage = items[3].find(class_='drug-content').get_text(strip=True).replace("\n", "")
how_to_use = items[4].find(class_='drug-content').get_text(strip=True).replace("\n", "")
warnings = items[5].find(class_='drug-content').get_text(strip=True).replace("\n", "")
storage = items[7].find(class_='drug-content').get_text(strip=True).replace("\n", "")
drug_stuff = pd.DataFrame(
    {
        'trade_name': [trade_name],
        'function': [function],
        'Contraindications': [Contraindications],
        'Dosage': [Dosage],
        'how_to_use': [how_to_use],
        'warnings': [warnings],
        'storage': [storage],
    })
print(drug_stuff)
Create a function!
def extract_drug_info(url):
    """Extracts drug information from a given medindia.net drug page."""
    # ... rest of the code you posted above here ...
    return drug_stuff
Then loop through some URLs:
urls = ["https://www.medindia.net/doctors/drug_information/abacavir.htm", "..."]
for url in urls:
    print(extract_drug_info(url))
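For completeness, a rough sketch of that function with the extraction code from the question folded in (same selectors and item indices as the question; the text_of helper is just for brevity and is not from the original, and this is untested against the live site):

import pandas as pd
import requests
from bs4 import BeautifulSoup

def extract_drug_info(url):
    """Extract drug information from a given medindia.net drug page."""
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    drug = soup.find(class_='mi-container__fluid')
    items = drug.find_all(class_='report-content drug-widget')

    # helper: text of the i-th drug-content block
    def text_of(i):
        return items[i].find(class_='drug-content').get_text(strip=True).replace("\n", "")

    return pd.DataFrame({
        'trade_name': [text_of(0)],
        'function': [text_of(1)],
        'Contraindications': [text_of(2)],
        'Dosage': [text_of(3)],
        'how_to_use': [text_of(4)],
        'warnings': [text_of(5)],
        'storage': [text_of(7)],
    })

urls = ["https://www.medindia.net/doctors/drug_information/abacavir.htm"]
all_drugs = pd.concat([extract_drug_info(u) for u in urls], ignore_index=True)
print(all_drugs)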
I have a dataframe from web scraping all pages of the animeka website:
import pandas as pd
import requests
from bs4 import BeautifulSoup
titles, studios, genres, durations = [], [], [], []
for page_no in range(1, 467):
    url = 'http://www.animeka.com/animes/~_{}.html'.format(page_no)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for table in soup.find_all('table', class_='animesindex'):
        td = table.find_all('td', class_='animestxt')
        titles.append(td[1].text.split(':')[1])
        studios.append(td[3].text.split(':')[1])
        genres.append(td[4].text.split(':')[1])
        durations.append(td[6].text.split(':')[1])
headers = ['Title', 'Studio', 'Genres', 'Duration']
df = pd.DataFrame(dict(zip(headers, [titles, studios, genres, durations])))
df = pd.DataFrame({'duration':df["Duration"], "genre" : df["Genres"], 'studio':df["Studio"], "titre" : df["Title"]})
And I would like to get the user_id and the rating they gave for each anime, but this is shown as an image in the "detail" subsection, and I do not know how to gather that information.
This is the image markup where the rating is:
<img src="/animes/13498.png" width="400" height="100" alt="graph">
You can use the find_previous method to find tags and strings that come before a particular tag in the document.
td[1].find_previous('td')
So, if you wanted to extract the name of the image, you'd try this:
td[1].find_previous('td').img['src'].split('/')[-1]
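Applied to the loop from the question, that could look something like this (a sketch; it assumes the rating graph <img> sits in the <td> just before the text cells, as in your snippet, and it only collects the image file name):

import requests
import pandas as pd
from bs4 import BeautifulSoup

titles, rating_imgs = [], []
for page_no in range(1, 467):
    url = 'http://www.animeka.com/animes/~_{}.html'.format(page_no)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for table in soup.find_all('table', class_='animesindex'):
        td = table.find_all('td', class_='animestxt')
        titles.append(td[1].text.split(':')[1])
        # the rating graph is an <img> in the <td> preceding the text cells
        img = td[1].find_previous('td').img
        rating_imgs.append(img['src'].split('/')[-1] if img else None)

df = pd.DataFrame({'titre': titles, 'rating_image': rating_imgs})
print(df.head())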