Problems with data retrieving using Python web scraping - python

I wrote a simple script for scraping data from a web page, and I specified everything like the object class with its tag, but my program does not scrape any data. One more thing: there is an email address on the page that I also want to scrape, but I don't know how to find its id or class. Could you please guide me — how can I fix this issue? Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
    """Download *url* and return a parsed BeautifulSoup document.

    Returns None when the server answers with a non-OK status code,
    so callers must be prepared to handle a missing soup.
    """
    response = requests.get(url)
    if not response.ok:
        # Report the failure instead of raising; caller receives None.
        print('server responded:', response.status_code)
        return None
    # 'html.parser' is the stdlib parser -- no extra dependency needed.
    return BeautifulSoup(response.text, 'html.parser')
def get_detail_data(soup):
    """Print the title and e-mail found in *soup*.

    Fixes the original 'hi' tag-name typo: the page header is an <h1>.
    """
    try:
        # The title lives in <h1 class="page-header">.
        title = soup.find('h1', class_="page-header", id=False).text
    except AttributeError:
        # find() returned None -- element missing from the page.
        title = 'empty'
    print(title)
    try:
        # NOTE(review): the e-mail is not plain text in this page's HTML;
        # it is generated by an inline <script> (String.fromCharCode), so
        # a tag/class lookup cannot find it -- see the decoding answer.
        email = soup.find('', class_="", id=False).text
    except AttributeError:
        email = 'empty'
    print(email)
def main():
    """Entry point: fetch one clergy detail page and print its data."""
    target_url = "https://www.igrc.org/clergydetail/2747164"
    get_detail_data(get_page(target_url))


if __name__ == '__main__':
    main()

As noticed the value of the email is not in plain text. The html is loaded via JS in a script tag :
<script type="text/javascript">document.write(String.fromCharCode(60,97,32,104,114,101,102,61,34,35,34,32,115,116,121,108,101,61,34,117,110,105,99,111,100,101,45,98,105,100,105,58,98,105,100,105,45,111,118,101,114,114,105,100,101,59,100,105,114,101,99,116,105,111,110,58,114,116,108,59,34,32,111,110,99,108,105,99,107,61,34,116,104,105,115,46,104,114,101,102,61,83,116,114,105,110,103,46,102,114,111,109,67,104,97,114,67,111,100,101,40,49,48,57,44,57,55,44,49,48,53,44,49,48,56,44,49,49,54,44,49,49,49,44,53,56,44,49,49,52,44,49,49,49,44,57,56,44,54,52,44,49,48,57,44,49,48,49,44,49,49,54,44,49,48,52,44,49,49,49,44,49,48,48,44,49,48,53,44,49,49,53,44,49,49,54,44,52,54,44,57,57,44,57,57,41,59,34,62,38,35,57,57,59,38,35,57,57,59,38,35,52,54,59,38,35,49,49,54,59,38,35,49,49,53,59,38,35,49,48,53,59,38,35,49,48,48,59,38,35,49,49,49,59,38,35,49,48,52,59,38,35,49,49,54,59,38,35,49,48,49,59,38,35,49,48,57,59,38,35,54,52,59,38,35,57,56,59,38,35,49,49,49,59,38,35,49,49,52,59,60,47,97,62));</script>
which contains all the character codes (ASCII codes). When decoded it gives:
cc.tsidohtem#bor
which needs to be decoded too. We just need the mailto, which is present in onclick (the content of the mailto is unchanged, whereas the text of the a tag is reversed, using direction: rtl as noticed by Hugo):
mailto:john#doe.inc
The following python code extracts the mail :
import requests
from bs4 import BeautifulSoup
import re
# Fetch the clergy detail page and parse it with the stdlib parser.
r = requests.get("https://www.igrc.org/clergydetail/2747164")
soup = BeautifulSoup(r.text, 'html.parser')
# The title sits in the element carrying the "page-header" class.
titleContainer = soup.find(class_ = "page-header")
title = titleContainer.text.strip() if titleContainer else "empty"
# The obfuscated e-mail is written by the <script> right after the header.
emailScript = titleContainer.findNext("script").text
def parse(data):
    """Decode a ``String.fromCharCode(...)`` argument list into text.

    *data* contains something like ``String.fromCharCode(104,105)``; the
    comma-separated character codes between the parentheses are turned
    back into the string they encode.
    """
    # Raw string avoids the invalid-escape DeprecationWarning; the
    # original class [\d+,] also matched a literal '+', which never
    # occurs in the code list.
    res = re.search(r'\((\d+(?:,\d+)*)\)', data, re.IGNORECASE)
    return "".join(chr(int(code)) for code in res.group(1).split(","))
# First pass decodes the outer document.write payload; that payload
# itself contains another String.fromCharCode carrying the mailto target.
emailData1 = parse(emailScript)
email = parse(emailData1)
print(title)
# email looks like "mailto:john#doe.inc" -- keep only the address part.
print(email.split(":")[1])
One could reproduce this encoding the other way around using the following code :
def encode(data):
    """Return the comma-separated character codes for every char of *data*."""
    return ",".join(str(ord(ch)) for ch in data)


mail = "john#doe.inc"
# The onclick handler carries the unmodified "mailto:" target, encoded once.
encodedMailTo = encode("mailto:" + mail)
# The visible text is written as HTML character references (&#NNN;).
encodedHtmlEmail = "".join(f"&#{ord(ch)};" for ch in mail)
htmlContainer = f'{encodedHtmlEmail}'
encodedHtmlContainer = encode(htmlContainer)
# Wrap everything in the document.write(String.fromCharCode(...)) shell.
scriptContainer = f'<script type="text/javascript">document.write(String.fromCharCode({encodedHtmlContainer}));</script>'
print(scriptContainer)

Related

Indeed scraper bs4, splitting parsed HTML code after grabbing it

import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'
#grabbing page content and parsing it into html
def data_grabber(url):
    """Fetch *url* and return every job card (<div class="job_seen_beacon">)."""
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, 'html.parser')
    return parsed.find_all('div', {"class": "job_seen_beacon"})
def job_title(url):
    """Return every <tbody> found across ALL job cards on *url*.

    The original returned from inside the for-loop, so only the first
    card's <tbody> list ever came back; accumulate over every card.
    """
    tables = []
    for card in data_grabber(url):
        tables.extend(card.find_all('tbody'))
    return tables
this is my source code, and im testing it out in jupyter notebook to make sure my functions work correctly but I've hit a small road block. My html soup from my first function works perfectly. It grabs all the info from indeed, especially the job_seen_beacon class.
My job_title function is wrong because it only outputs the first 'tbody' class it finds. Refer to the image here — I don't have enough points on Stack Overflow to embed it.
while for my data_grabber it returns every single job_seen_beacon. If you were able to scroll, you would easily see the multiple job_seen_beacon's.
I'm clearly missing something but I can't see it, any ideas?
What happens?
The moment you return something from a function you leave it — and that happens in the first iteration of your loop.
Not sure where you will end up with your code, but you can do something like that:
def job_title(item):
    """Extract the job title from one card; 'No Title' when no <h2> exists."""
    heading = item.select_one('h2')
    if not heading:
        return 'No Title'
    return heading.get_text('|', strip=True).split('|')[-1]
Example
from bs4 import BeautifulSoup
import requests
url = 'https://fr.indeed.com/jobs?q=data%20anlayst&l=france'
#grabbing page content and parsing it into html
def data_grabber(url):
    """Download the Indeed results page and return all job-card containers."""
    html_text = requests.get(url).text
    document = BeautifulSoup(html_text, 'html.parser')
    return document.find_all('div', {"class": "job_seen_beacon"})
def job_title(item):
    """Return the card's job title text, or 'No Title' if absent."""
    h2 = item.select_one('h2')
    return 'No Title' if not h2 else h2.get_text('|', strip=True).split('|')[-1]
def job_location(item):
    """Return the card's company location text, or 'No Location' if absent."""
    loc = item.select_one('div.companyLocation')
    if not loc:
        return 'No Location'
    return loc.get_text(strip=True)
# Collect title and location for every job card on the page.
data = [
    {
        'title': job_title(item),
        'companyLocation': job_location(item),
    }
    for item in data_grabber(url)
]
data
Output
[{'title': 'Chef de Projet Big Data H/F', 'companyLocation': 'Lyon (69)'},{'title': 'Chef de Projet Big Data F/H', 'companyLocation': 'Lyon 9e (69)'}]

Having problems with a simple Instagram Scraper

i am pretty new to all the programming stuff and I am learning Python for my social engineering project. So really sorry if you will hit your own forehead.
So now i was looking at a tutorial to scrape certain information from a certain instagram page. Lets say f.e. i wanted to extract info from www.instagram.com/nbamemes
I am getting a problem in Line 12 "IndentationError: expected an indented block". So i have googled that, but i just dont get the Code. Where are my placeholders which i need to place info from myself.
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
class insta_Scraper_v1:
    """Minimal Instagram profile scraper built on the og:description meta tag."""

    def getinfo(self, url):
        """Fetch *url* and print its follower/following/post counts and name."""
        # Use the url argument -- the original ignored it and hard-coded a
        # single profile (and urlopen requires a scheme like https://).
        html = urllib.request.urlopen(url, context=self.ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        # The keyword is attrs= ; attr= is treated as an HTML attribute
        # filter and matches nothing.
        data = soup.find_all('meta', attrs={'property': 'og:description'})
        # NOTE(review): og:description content looks like
        # "123 Followers, 45 Following, 67 Posts - ... from <name>".
        # Splitting the content attribute is what the original indexing
        # apparently intended -- confirm against the live page.
        text = data[0].get('content').split()
        user = '%s %s %s' % (text[-3], text[-2], text[-1])
        followers = text[0]
        following = text[2]
        posts = text[4]
        print('User:', user)
        print('Followers:', followers)
        print('Following:', following)
        print('Posts:', posts)
        print('-----------------------')

    def mail(self):
        """Read profile URLs from 123.txt and scrape each one."""
        # Disable certificate verification (tutorial shortcut; insecure).
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        with open('123.txt') as f:
            self.content = f.readlines()
        self.content = [x.strip() for x in self.content]
        for url in self.content:
            self.getinfo(url)
# The original line lacked the trailing colon, which is the reported
# "expected an indented block"/syntax problem at the end of the file.
if __name__ == '__main__':
    obj = insta_Scraper_v1()
    obj.mail()
I used a Tutorial for programming this. However I dont get the whole thing right. Its not completely beginner friendly and I seem to need help. Again sorry for this super beginners question.
best regards,
lev
In the future, it would be useful to share the error message produced by your code. It includes the line at which the error has occurred.
Based on the code you provided, I can see that you did not indent the code inside your functions. After the function declaration def, you need to indent all code inside it
So from:
def getinfo (self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all ('meta', attr={'property': 'og:description'})
To:
def getinfo (self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all ('meta', attr={'property': 'og:description'})
Indentation is the block separator in Python. Below is the indented code. Whenever you use conditions, loops, def, or class you create a block. To define that block you have to indent the code using spaces. Usually a tab's worth of spaces is preferred, but even a single space also works fine.
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
class insta_Scraper_v1:
    """Scrape basic profile stats from Instagram's og:description meta tag."""

    def getinfo(self, url):
        # NOTE(review): the tutorial hard-codes the profile and ignores
        # *url*; urlopen also needs a scheme (https://...) to work.
        html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        # Fixed: the keyword is attrs= ; attr= silently matches nothing.
        data = soup.find_all('meta', attrs={'property': 'og:description'})
        # NOTE(review): data[0] is a Tag; indexing it like text[-3] reads
        # attributes, not words -- the content attribute likely needs to
        # be extracted and split first. Left as-is pending confirmation.
        text = data[0]
        # Fixed typo: 'test[-3]' -> 'text[-3]'.
        user = '%s %s %s' % (text[-3], text[-2], text[-1])
        followers = text[0]
        following = text[2]
        posts = text[4]
        print('User:', user)
        print('Followers:', followers)
        print('Following:', following)
        print('Posts:', posts)
        print('-----------------------')

    def mail(self):
        # Fixed: 'def mail(self:' was missing the closing parenthesis.
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        with open('123.txt') as f:
            self.content = f.readlines()
        self.content = [x.strip() for x in self.content]
        for url in self.content:
            self.getinfo(url)
# Fixed: missing colon; also the class defines mail(), not main().
if __name__ == '__main__':
    obj = insta_Scraper_v1()
    obj.mail()
Ref : Geeks For Geeks : Indentation
Thanks

Crawling over a website directories using BeautifulSoup?

This is my code:
https://pastebin.com/R11qiTF4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re
urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
prohibited = ["info", "news"]
text_keywords = ["Helios", "Helios"]
url_list = []
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"
# Re-base every entry of domain_list onto its matching root URL.
for base, full_path in zip(urls, domain_list):
    url_list.append(base + full_path.replace(base, ""))
print(url_list)
def prohibitedChecker(prohibited_list, string):
    """Return True when any entry of *prohibited_list* occurs in *string*.

    The original returned False as soon as the FIRST entry failed to
    match (the else-return sat inside the loop and the break was
    unreachable), so later entries were never checked.
    """
    return any(word in string for word in prohibited_list)
def parseHTML(url):
    """Open *url*, read the raw HTML and return it parsed with html.parser."""
    connection = req(url)
    page_source = connection.read()
    connection.close()
    return soup(page_source, "html.parser")
searched_word = "Helios"
# Crawl every start URL, follow each anchor found on it, and print the
# text nodes of the linked pages that mention the searched word.
for url in url_list:
    parsedHTML = parseHTML(url)
    # Only anchors that actually carry an href attribute.
    href_crawler = parsedHTML.find_all("a", href=True)
    for href in href_crawler:
        # Resolve relative links against the page being crawled.
        crawled_url = urljoin(url,href.get("href"))
        print(crawled_url)
        if "www" not in crawled_url:
            continue
        # NOTE(review): reuses the parsedHTML name, clobbering the outer
        # page's soup -- works only because the anchor list was collected
        # already.
        parsedHTML = parseHTML(crawled_url)
        # All text nodes whose content mentions the searched word.
        results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
        for single_result in results:
            keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
            if keyword_text_check != True:
                continue
            print(single_result.string)
I'm trying to print the contents of ''desired'' variable. The problem is the following, my code doesn't even get to request the URL of ''desired'' because its not in the website scope. ''desired'' href link is inside another href link that's inside the page I'm currently scraping. I thought I'd fix this by adding another for loop inside line 39 for loop, that requests every href found in my first, but this is too messy and not efficient
Is there way to get a list of every directory of a website url?

Scraping AJAX loaded content with python?

So i have function that is called when i click a button , it goes as below
// Offset token for the next AJAX page (seeded by the server in the page source).
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";

// Fetch the next batch of news cards and append them to the card stack.
function loadMoreNews(){
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    // POST the current offset; the server answers with a JSON string
    // containing the rendered HTML and the next offset.
    $.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
        data = JSON.parse(data);
        // Keep the old offset if the response omits one.
        min_news_id = data.min_news_id||min_news_id;
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now i don't have much experience with javascript , but i assume its returning some json data from some sort of api at "en/ajax/more_news" .
Is there i way could directly call this api and get the json data from my python script. If Yes,how?
If not how do i scrape the content that is being generated?
You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news, this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re
# pattern to extract min_news_id (raw string: \s is a regex escape)
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
with requests.Session() as s:
    # Name the parser explicitly to avoid bs4's parser-guessing warning.
    soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "html.parser")
    new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
    print(new_id_scr.text)
    # group(1) is the bare id; the original's group() returned the whole
    # 'var min_news_id = "..."' match, breaking the POST payload.
    news_id = patt.search(new_id_scr.text).group(1)
    js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
    print(js.json())
js gives you all the html, you just have to access the js["html"].
Here is the script that will automatically loop through all the pages in inshort.com
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json
# Pattern capturing the news offset id embedded in the page's JS.
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
i = 0
while(1):
    with requests.Session() as s:
        # First iteration bootstraps from the regular read page; later
        # iterations reuse the soup parsed from the previous AJAX batch.
        if(i==0):soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content,"lxml")
        # NOTE(review): on later iterations this find runs against the
        # batch HTML -- verify it still contains the min_news_id script.
        new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
        news_id = patt.search(new_id_scr.text).group(1)
        js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
        # Round-trip through dumps/loads (redundant: js.json() is a dict).
        jsn = json.dumps(js.json())
        jsonToPython = json.loads(jsn)
        news_id = jsonToPython["min_news_id"]
        data = jsonToPython["html"]
        i += 1
    # Parse the freshly returned batch of cards.
    soup = BeautifulSoup(data, "lxml")
    for tag in soup.find_all("div", {"class":"news-card"}):
        main_text = tag.find("div", {"itemprop":"articleBody"})
        summ_text = main_text.text
        summ_text = summ_text.replace("\n", " ")
        # Link back to the original news source.
        result = tag.find("a", {"class":"source"})
        art_url = result.get('href')
        if 'www.youtube.com' in art_url:
            # Video items have no article body to download.
            print("Nothing")
        else:
            # Drop the trailing slash before handing the URL to newspaper.
            art_url = art_url[:-1]
            #print("Hello", art_url)
            article = Article(art_url)
            article.download()
            if article.is_downloaded:
                article.parse()
                article_text = article.text
                article_text = article_text.replace("\n", " ")
                print(article_text+"\n")
                print(summ_text+"\n")

beautifulsoup add a tag inside a string

Any browser that encounters the text www.domain.com or http://domain.com/etc/ in a text section of some HTML will automatically turn it into a clickable <a> link tag. I have to clean up and verify some texts like this and do this replacement automatically, but the problem is that I can't insert new tags into an element's string.
#!/usr/bin/python
# -*- coding: utf8
import re
from bs4 import BeautifulSoup as bs
def html_content_to_soup(data):
    """Parse *data* with html5lib and strip the wrapper html/head/body tags."""
    fragment = bs(data, "html5lib")
    # html5lib always wraps a fragment in <html><head><body>; unwrap each
    # wrapper so only the original content remains.
    fragment.html.unwrap()
    fragment.head.unwrap()
    fragment.body.unwrap()
    return fragment
def create_tag(soup):
    """Return a re.sub callback that turns a matched URL into a defanged <a> tag."""
    def resubfunction(m):
        url = m.group(0)
        # Prepend a scheme when the match came without one.
        if url.startswith("http://") or url.startswith("https://"):
            _url = url
        else:
            _url = "http://%s" % url
        tag = soup.new_tag('a', href=_url)
        # Defang the visible text so the address is not copy-pasteable verbatim.
        tag.string = url.replace(".", "[.]")
        return tag.prettify(formatter=None)
    return resubfunction
def replace_vulnerable_text(soup, data):
    """Wrap every URL-looking substring of *data* in a defanged <a> tag.

    The (very large) pattern matches schemeful URLs as well as bare
    domain names against an inlined list of known TLDs; create_tag(soup)
    builds the replacement anchor for each match.
    """
    # NOTE(review): the character class near the end contains mojibake
    # (ÂŤÂťââââ) -- these were originally typographic quote characters;
    # confirm the intended characters before relying on exact matching.
    ex = r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s\`!()\[\]{};:\'\".,<>?ÂŤÂťââââ])|(?:(?<!#)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!#)))"
    return re.sub(ex, create_tag(soup), data)
if __name__ == "__main__":
html = """<html><body>The website bbc.co.uk is down</body></html>"""
soup = bs(html, "html5")
for elem in soup.find_all(text=True):
if elem.string is not None:
elem.replace_with( html_content_to_soup(replace_vulnerable_text(soup, elem.string)))
print unicode(soup)
Instead of the expected
<html><body>The website bbc[.]co[.]uk is down</body></html>
i'm getting
<html><head></head><body>The website <a href="http://bbc.co.uk"> bbc[.]co[.]uk </a> is down</body></html>
The html tags are getting escaped. Any pointers in the right direction? I'm not sure how to approach this.
EDIT: Edited the original question with the correct answer.
# Python 2: HTMLParser.unescape() turns HTML entities back into characters,
# so the escaped tags in the string become real markup when printed.
import HTMLParser
html_p = HTMLParser.HTMLParser()
string = '<html><head></head><body>The website <a href="http://bbc.co.uk"> bbc[.]co[.]uk </a> is down</body></html>'
print html_p.unescape(string)
it will give you the required output.
I need a function which will return a soup from arbitrary html:
def html_content_to_soup(data):
    """Return a soup for arbitrary *data* with html5lib's wrapper tags removed."""
    parsed = bs(data, "html5lib")
    # Strip the <html>, <head> and <body> elements html5lib adds around
    # every parsed fragment, keeping only the fragment's own content.
    parsed.html.unwrap()
    parsed.head.unwrap()
    parsed.body.unwrap()
    return parsed
Afterwards, we get:
elem.replace_with( html_content_to_soup(replace_vulnerable_text(soup, elem.string)) )
This produces the required content:
<html><head></head><body>The website bbc[.]co[.]uk is down</body></html>

Categories