Scraping AJAX loaded content with python? - python

So I have a function that is called when I click a button; it goes as below:
// Cursor into the news feed: id of the oldest story currently rendered.
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";
function loadMoreNews(){
    // Swap the button for a loading spinner while the request is in flight.
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    // Ask the server for the next batch of stories, starting after min_news_id.
    $.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
        data = JSON.parse(data);
        // Advance the cursor; keep the old one if the response omits it.
        min_news_id = data.min_news_id||min_news_id;
        // Append the returned HTML fragment to the feed container.
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now, I don't have much experience with JavaScript, but I assume it is returning some JSON data from some sort of API at "/en/ajax/more_news".
Is there a way I could directly call this API and get the JSON data from my Python script? If yes, how?
If not, how do I scrape the content that is being generated?

You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news, this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re

# Pattern to extract min_news_id.  Raw string so "\s" reaches the regex
# engine; group 1 captures just the id between the quotes.
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

with requests.Session() as s:
    # Fetch the page and locate the <script> tag that defines min_news_id.
    soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "html.parser")
    new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
    print(new_id_scr.text)
    # Bug fix: group(1) is the id itself; group() would post the whole
    # 'var min_news_id = "..."' match as the news_offset.
    news_id = patt.search(new_id_scr.text).group(1)
    js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
    print(js.json())
The response gives you all the HTML; since `js` is a requests Response object (not a dict), access it as `js.json()["html"]`.

Here is the script that will automatically loop through all the pages in inshort.com
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json

# Raw string: "\s" must be a regex escape, not a (deprecated) string escape.
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

i = 0
# One Session for the whole crawl (the original recreated it per iteration,
# discarding cookies/connection reuse).  The loop never terminates on its
# own; it ends when the server stops returning the expected JSON keys.
with requests.Session() as s:
    while 1:
        if i == 0:
            # The cursor id only exists in an inline <script> on the first
            # page; every later iteration reuses the id from the AJAX JSON,
            # so re-searching the (script-less) HTML fragment is dropped.
            soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "lxml")
            new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
            news_id = patt.search(new_id_scr.text).group(1)
        js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
        # js.json() is already a Python dict -- the original's
        # json.dumps()/json.loads() round trip was a no-op.
        jsonToPython = js.json()
        news_id = jsonToPython["min_news_id"]
        data = jsonToPython["html"]
        i += 1
        soup = BeautifulSoup(data, "lxml")
        for tag in soup.find_all("div", {"class": "news-card"}):
            # Inshorts summary text for this card.
            main_text = tag.find("div", {"itemprop": "articleBody"})
            summ_text = main_text.text.replace("\n", " ")
            # Link to the original news source.
            result = tag.find("a", {"class": "source"})
            art_url = result.get('href')
            if 'www.youtube.com' in art_url:
                print("Nothing")
            else:
                # Drop the trailing slash before handing the URL to newspaper.
                art_url = art_url[:-1]
                article = Article(art_url)
                article.download()
                if article.is_downloaded:
                    article.parse()
                    article_text = article.text.replace("\n", " ")
                    print(article_text + "\n")
                    print(summ_text + "\n")
It gives both the summary from inshort.com and complete news from respective news channel.

Related

How can i extract values from bs4.element.?

So far I managed to make this:
from bs4 import BeautifulSoup
import requests


def function():
    """Fetch the chapter page and print its second <script> tag."""
    url = 'https://dynasty-scans.com/chapters/liar_satsuki_can_see_death_ch28_6#6'
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'html.parser')
    scripts = document.find_all('script')
    print(scripts[1])
output:
<script>
//<![CDATA[
var pages = [{"image":"/system/releases/000/036/945/1.png","name":"1"},{"image":"/system/releases/000/036/945/2.png","name":"2"},{"image":"/system/releases/000/036/945/3.png","name":"3"},{"image":"/system/releases/000/036/945/4.png","name":"4"},{"image":"/system/releases/000/036/945/5.png","name":"5"},{"image":"/system/releases/000/036/945/6.png","name":"6"},{"image":"/system/releases/000/036/945/7.png","name":"7"},{"image":"/system/releases/000/036/945/credits.png","name":"credits"}];
//]]>
</script>
I'm trying to extract values of "image" as strings
example: "/system/releases/000/036/945/7.png"
How can I do it ?
you can use a regular expression to extract the variable "pages"
import re, json, requests

url = 'https://dynasty-scans.com/chapters/liar_satsuki_can_see_death_ch28_6#6'
r = requests.get(url)

# Extract the JS array assigned to "pages".  Raw string so "\[" is a regex
# escape rather than an invalid string escape.
match = re.search(r'var pages = (\[.*?\]);', r.text).group(1)

# The array literal is valid JSON, so it parses directly.
match_json = json.loads(match)

# Collect the relative image links.
images = [img['image'] for img in match_json]
output:
['/system/releases/000/036/945/1.png',
'/system/releases/000/036/945/2.png',
'/system/releases/000/036/945/3.png',
'/system/releases/000/036/945/4.png',
'/system/releases/000/036/945/5.png',
'/system/releases/000/036/945/6.png',
'/system/releases/000/036/945/7.png',
'/system/releases/000/036/945/credits.png']

Problem in fetching long URLs using BeautifulSoup

I am trying to fetch a URL from a webpage, here is how the URL looks in the Inspect section:
Here is how the URL looks in my python-code:
How can I get the actual URL without the ../../ part using BeautifulSoup?
Here is my code in case it's needed:
import re
import requests
from bs4 import BeautifulSoup

# Download the category listing page.
source = requests.get('https://books.toscrape.com/catalogue/category/books_1/index.html').text
soup = BeautifulSoup(source, 'lxml')
# article = soup.find('article')
# title = article.div.a.img['alt']
# print(title['alt'])
titles, topics,urls,sources = [], [], [],[]
# One <article class="product_pod"> per book on the page.
article_productPod = soup.findAll('article', {"class":"product_pod"})
for i in article_productPod:
    # The book title is the alt text of the cover image.
    titles.append(i.div.a.img['alt'])
# print(titles)
for q in article_productPod:
    # NOTE(review): these hrefs are relative ("../../..."), which is exactly
    # the problem described above -- they must be joined with the page URL.
    urls.append(q.h3.a['href'])
print(urls[0])
# for z in range(len(urls)):
# source2 = requests.get("https://" + urls[z])
Use urllib:
# Import the submodule explicitly: "import urllib" alone does not make
# urllib.parse (used below for urljoin) available.
import urllib.parse
Store your target URL in a separate variable :
# Keep the page URL in a variable so it can be reused as the urljoin base.
src_url = r'https://books.toscrape.com/catalogue/category/books_1/index.html'
source = requests.get(src_url).text
Join the website's URL and the relative URL:
for q in article_productPod:
    # urljoin resolves the relative "../../" href against the page URL.
    urls.append(urllib.parse.urljoin(src_url, q.h3.a['href']))

how to scrape author name and author url from a webpage using python

i am trying to scrape author name and author url from the following webpage.
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive
and i am using following code;
author_flag = 0
# Author links on the archive page sit inside <h2> headings.
divs = soup.find_all('h2')
for div in divs:
    author = div.find('a')
    if(author is not None):
        author_art.append(author.text)
        author_url.append('https://medium.com'+ author.get('href'))
        # NOTE(review): typo -- this assigns "aurhor_flag", so author_flag
        # below is never set and the "missing" branch always runs too.
        aurhor_flag = 1
        break
if(author_flag==0):
    author_art.append('Author information missing')
    author_url.append('Author Url information missing')
Can anyone take a look at what I am doing wrong here? This code is not picking anything up;
it just returns a blank list.
Full code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

data = pd.read_csv('url_technology.csv')

author_art = []
author_url = []

for i in range(1):
    try:
        author_flag = 0
        # NOTE(review): "soup" is never created in this script -- there is no
        # requests.get()/BeautifulSoup() call -- so this line raises NameError,
        # control falls into the bare except, and the lists stay empty.
        divs = soup.find_all('meta')
        for div in divs:
            author = div.find('span')
            if(author is not None):
                author_art.append(author.text)
                author_url.append('https://medium.com'+author.get('href'))
                # NOTE(review): typo -- "aurhor_flag" never updates author_flag.
                aurhor_flag = 1
                break
        if(author_flag==0):
            author_art.append('Author information missing')
            author_url.append('Author Url information missing')
    except:
        # NOTE(review): the bare except silently hides the NameError above.
        print('no data found')

# NOTE(review): "title" and "url" are undefined at this point, and the
# concat uses author_art twice instead of author_art + author_url.
author_art = pd.DataFrame(title)
author_url = pd.DataFrame(url)
res = pd.concat([author_art, author_art] , axis=1)
res.columns = ['Author_Art', 'Author_url']
res.to_csv('combined1.csv')
print('File created successfully')
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive---------0-----------------------
https://medium.com/job-advice-for-software-engineers/what-i-want-and-dont-want-to-see-on-your-software-engineering-resume-cbc07913f7f6?source=tag_archive---------1-----------------------
https://itnext.io/load-testing-using-apache-jmeter-af189dd6f805?source=tag_archive---------2-----------------------
https://medium.com/s/story/black-mirror-bandersnatch-a-study-guide-c46dfe9156d?source=tag_archive---------3-----------------------
https://medium.com/fast-company/the-worst-design-crimes-of-2018-56f32b027bb7?source=tag_archive---------4-----------------------
https://towardsdatascience.com/make-your-pictures-beautiful-with-a-touch-of-machine-learning-magic-31672daa3032?source=tag_archive---------5-----------------------
https://medium.com/hackernoon/the-state-of-ruby-2019-is-it-dying-509160a4fb92?source=tag_archive---------6-----------------------
One possibility how to get author Name and author URL is to parse the Ld+Json data embedded within the page:
import json
import requests
from bs4 import BeautifulSoup

url = "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20"

# The article metadata is embedded as an LD+JSON block; grab that tag and
# parse its contents.
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
ld_json_tag = soup.select_one('[type="application/ld+json"]')
data = json.loads(ld_json_tag.contents[0])

# uncomment this to print all LD+JSON data:
# print(json.dumps(data, indent=4))

author_info = data["author"]
print("Author:", author_info["name"])
print("URL:", author_info["url"])
Prints:
Author: Eric Elliott
URL: https://medium.com/#_ericelliott
EDIT: A function that returns Author Name/URL:
import json
import requests
from bs4 import BeautifulSoup


def get_author_name_url(medium_url):
    """Return (author name, author URL) for a Medium article.

    Parses the LD+JSON metadata block embedded in the article page.
    """
    # Bug fix: fetch the URL that was passed in -- the original read the
    # global "url" and ignored its medium_url parameter entirely.
    soup = BeautifulSoup(requests.get(medium_url).content, "html.parser")
    data = json.loads(
        soup.select_one('[type="application/ld+json"]').contents[0]
    )
    return data["author"]["name"], data["author"]["url"]


url_list = [
    "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20",
]

for url in url_list:
    name, url = get_author_name_url(url)
    print("Author:", name)
    print("URL:", url)
I've launched a python package called medium-apis to do such tasks.
Install medium-apis
pip install medium-apis
Get your RapidAPI key. See how.
Run the code:
from medium_apis import Medium

# Requires a RapidAPI key for the medium-apis service.
medium = Medium('YOUR_RAPIDAPI_KEY')

def get_author(url):
    # The article id is the hex suffix of the URL, after stripping any
    # query-string parameters.
    url_without_parameters = url.split('?')[0]
    article_id = url_without_parameters.split('-')[-1]
    article = medium.article(article_id=article_id)
    author = article.author
    # Populates the author object's fields (fullname, username, ...).
    author.save_info()
    return author

urls = [
    "https://nishu-jain.medium.com/medium-apis-documentation-3384e2d08667",
]

for url in urls:
    author = get_author(url)
    print('Author: ', author.fullname)
    print('Profile URL: ', f'https://medium.com/#{author.username}')
Github repo: https://github.com/weeping-angel/medium-apis

beginner web scraping code iteration issue

I am new to Python and would really appreciate some help!!
I have been trying to create a dictionary for assigning books to their authors, only for it to come out messy and be repeating itself.
How can I fix this?
import requests
from bs4 import BeautifulSoup

url = "https://www.banyen.com/new-arrivals/index.html"
response = requests.get(url)
html = response.content
scraped = BeautifulSoup(html,'html.parser')
results = []
article = scraped.find("div", class_="block block-system block-odd clearfix")
# find_all() with no arguments yields EVERY descendant tag, so the same
# title/price pair is matched over and over -- hence the repeated output.
for i in article.find_all():
    # NOTE(review): the extra "a href" positional argument is interpreted by
    # BeautifulSoup as an attribute filter, which is almost certainly not
    # what was intended here.
    name = i.find("h2", "a href", class_="teaser-title")
    author = i.find("span", class_="price-amount")
    if name is not None:
        if author is not None:
            # NOTE(review): this keys the dict with Tag objects, not strings.
            results.append({name:author})
print(results)
import requests
from bs4 import BeautifulSoup
import re

url = "https://www.banyen.com/new-arrivals/index.html"

# Each product sits in its own <div id="node-..."> container.
page = requests.get(url)
scraped = BeautifulSoup(page.content, 'html.parser')

results = []
for node in scraped.find_all("div", id=re.compile("node-")):
    title_link = node.find("h2").find('a')
    price = node.find("span", class_="price-amount")
    # Keep only products where both a title and a price were found.
    if title_link is not None and price is not None:
        results.append({title_link.text.strip(): price.text})
print(results)

Problems with data retrieving using Python web scraping

I wrote simple code for scraping data from a web page, and I specified everything like the element's class and tag, but my program does not scrape the data. One more thing: there is an email address that I also want to scrape, but I don't know how to identify its id or class. Could you please guide me on how to fix this issue? Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
    """Download *url* and return it parsed with BeautifulSoup.

    Implicitly returns None when the server responds with an error status.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
        return soup
def get_detail_data(soup):
    """Print the title and e-mail found on a clergy-detail page."""
    # NOTE(review): 'hi' is almost certainly a typo for the 'h1' tag, which
    # is why this always falls into the except branch and prints 'empty'.
    try:
        title = soup.find('hi',class_="page-header",id=False).text
    except:
        title = 'empty'
    print(title)
    # NOTE(review): an empty tag name and class match nothing; per the answer
    # below, the e-mail is generated by JavaScript and is not present in the
    # static HTML at all.
    try:
        email = soup.find('',class_="",id=False).text
    except:
        email = 'empty'
    print(email)
def main():
    # Clergy-detail page whose title and e-mail we want to scrape.
    url = "https://www.igrc.org/clergydetail/2747164"
    #get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
As noticed, the value of the email is not in plain text. The HTML is loaded via JS in a script tag:
<script type="text/javascript">document.write(String.fromCharCode(60,97,32,104,114,101,102,61,34,35,34,32,115,116,121,108,101,61,34,117,110,105,99,111,100,101,45,98,105,100,105,58,98,105,100,105,45,111,118,101,114,114,105,100,101,59,100,105,114,101,99,116,105,111,110,58,114,116,108,59,34,32,111,110,99,108,105,99,107,61,34,116,104,105,115,46,104,114,101,102,61,83,116,114,105,110,103,46,102,114,111,109,67,104,97,114,67,111,100,101,40,49,48,57,44,57,55,44,49,48,53,44,49,48,56,44,49,49,54,44,49,49,49,44,53,56,44,49,49,52,44,49,49,49,44,57,56,44,54,52,44,49,48,57,44,49,48,49,44,49,49,54,44,49,48,52,44,49,49,49,44,49,48,48,44,49,48,53,44,49,49,53,44,49,49,54,44,52,54,44,57,57,44,57,57,41,59,34,62,38,35,57,57,59,38,35,57,57,59,38,35,52,54,59,38,35,49,49,54,59,38,35,49,49,53,59,38,35,49,48,53,59,38,35,49,48,48,59,38,35,49,49,49,59,38,35,49,48,52,59,38,35,49,49,54,59,38,35,49,48,49,59,38,35,49,48,57,59,38,35,54,52,59,38,35,57,56,59,38,35,49,49,49,59,38,35,49,49,52,59,60,47,97,62));</script>
which contains all the characters code (ascii code). When decoded will gives :
cc.tsidohtem#bor
which needs to be decoded too. We just needs the mailto which is present in onclick (the content in the mailto is unchanged whereas the text of the a tag is reversed (using direction: rtl as noticed by Hugo) :
mailto:john#doe.inc
The following python code extracts the mail :
import requests
from bs4 import BeautifulSoup
import re

r = requests.get("https://www.igrc.org/clergydetail/2747164")
soup = BeautifulSoup(r.text, 'html.parser')

titleContainer = soup.find(class_ = "page-header")
title = titleContainer.text.strip() if titleContainer else "empty"

# The obfuscated e-mail is written by the <script> immediately after the
# title.  find_next is the modern bs4 name for the deprecated findNext.
emailScript = titleContainer.find_next("script").text


def parse(data):
    """Decode a String.fromCharCode(...) argument list into text."""
    # Raw string: "\(" and "\d" are regex escapes, not string escapes.
    res = re.search(r'\(([\d+,]*)\)', data, re.IGNORECASE)
    return "".join([
        chr(int(i))
        for i in res.group(1).split(",")
    ])


# Two layers of encoding: the script writes an <a> tag whose onclick holds a
# second String.fromCharCode(...) with the real mailto: target.
emailData1 = parse(emailScript)
email = parse(emailData1)

print(title)
# Strip the leading "mailto:" scheme.
print(email.split(":")[1])
One could reproduce this encoding the other way around using the following code :
def encode(data):
    """Return *data* as a comma-separated list of character codes."""
    return ",".join(str(ord(ch)) for ch in data)

mail = "john#doe.inc"
# First layer: the mailto: target hidden inside the onclick handler.
encodedMailTo = encode("mailto:" + mail)
# Visible link text, expressed as HTML character entities.
encodedHtmlEmail = "".join("&#" + str(ord(ch)) + ";" for ch in mail)
htmlContainer = f'{encodedHtmlEmail}'
# Second layer: the whole markup, again as character codes.
encodedHtmlContainer = encode(htmlContainer)
scriptContainer = f'<script type="text/javascript">document.write(String.fromCharCode({encodedHtmlContainer}));</script>'
print(scriptContainer)

Categories