I wrote a simple piece of code for scraping data from a web page, and I specified everything like the object class with its tag, but my program does not scrape the data. One more thing: there is an email address that I also want to scrape, but I don't know how to reference its id or class. Could you please guide me on how to fix this issue? Thanks!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
    return soup

def get_detail_data(soup):
    try:
        title = soup.find('hi', class_="page-header", id=False).text
    except:
        title = 'empty'
    print(title)

    try:
        email = soup.find('', class_="", id=False).text
    except:
        email = 'empty'
    print(email)

def main():
    url = "https://www.igrc.org/clergydetail/2747164"
    #get_page(url)
    get_detail_data(get_page(url))

if __name__ == '__main__':
    main()
As you noticed, the value of the email is not in plain text. The HTML is generated by JS in a script tag:
<script type="text/javascript">document.write(String.fromCharCode(60,97,32,104,114,101,102,61,34,35,34,32,115,116,121,108,101,61,34,117,110,105,99,111,100,101,45,98,105,100,105,58,98,105,100,105,45,111,118,101,114,114,105,100,101,59,100,105,114,101,99,116,105,111,110,58,114,116,108,59,34,32,111,110,99,108,105,99,107,61,34,116,104,105,115,46,104,114,101,102,61,83,116,114,105,110,103,46,102,114,111,109,67,104,97,114,67,111,100,101,40,49,48,57,44,57,55,44,49,48,53,44,49,48,56,44,49,49,54,44,49,49,49,44,53,56,44,49,49,52,44,49,49,49,44,57,56,44,54,52,44,49,48,57,44,49,48,49,44,49,49,54,44,49,48,52,44,49,49,49,44,49,48,48,44,49,48,53,44,49,49,53,44,49,49,54,44,52,54,44,57,57,44,57,57,41,59,34,62,38,35,57,57,59,38,35,57,57,59,38,35,52,54,59,38,35,49,49,54,59,38,35,49,49,53,59,38,35,49,48,53,59,38,35,49,48,48,59,38,35,49,49,49,59,38,35,49,48,52,59,38,35,49,49,54,59,38,35,49,48,49,59,38,35,49,48,57,59,38,35,54,52,59,38,35,57,56,59,38,35,49,49,49,59,38,35,49,49,52,59,60,47,97,62));</script>
which contains all the character codes (ASCII codes). When decoded, it gives:
cc.tsidohtem#bor
which needs to be decoded too. We just need the mailto, which is present in the onclick attribute (the content of the mailto is unchanged, whereas the text of the a tag is reversed using direction: rtl, as noticed by Hugo):
mailto:john#doe.inc
The following Python code extracts the email:
import requests
from bs4 import BeautifulSoup
import re

r = requests.get("https://www.igrc.org/clergydetail/2747164")
soup = BeautifulSoup(r.text, 'html.parser')

titleContainer = soup.find(class_="page-header")
title = titleContainer.text.strip() if titleContainer else "empty"

# The obfuscated email lives in the <script> tag that follows the page header
emailScript = titleContainer.findNext("script").text

def parse(data):
    # Grab the comma-separated character codes between the parentheses
    res = re.search(r'\(([\d+,]*)\)', data, re.IGNORECASE)
    return "".join([
        chr(int(i))
        for i in res.group(1).split(",")
    ])

emailData1 = parse(emailScript)  # first pass decodes the outer String.fromCharCode payload
email = parse(emailData1)        # second pass decodes the inner mailto
print(title)
print(email.split(":")[1])
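Running this should print the title followed by the decoded email address (the part after the mailto: prefix).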
One could reproduce this encoding the other way around using the following code:
def encode(data):
    # Convert each character to its decimal character code
    return ",".join([str(ord(i)) for i in data])

mail = "john#doe.inc"
encodedMailTo = encode("mailto:" + mail)
encodedHtmlEmail = "".join(["&#" + str(ord(i)) + ";" for i in mail])

# Rebuild the <a> tag that the page's script produces (shape taken from the decoded markup above)
htmlContainer = (f'<a href="#" style="unicode-bidi:bidi-override;direction:rtl;" '
                 f'onclick="this.href=String.fromCharCode({encodedMailTo});">'
                 f'{encodedHtmlEmail}</a>')

encodedHtmlContainer = encode(htmlContainer)
scriptContainer = f'<script type="text/javascript">document.write(String.fromCharCode({encodedHtmlContainer}));</script>'
print(scriptContainer)
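With mail = "john#doe.inc" as above, this should print a script tag of the same shape as the one found on the page; as a sanity check, feeding it back through the decoder, parse(parse(scriptContainer)) should return "mailto:" + mail.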
I am pretty new to all the programming stuff and I am learning Python for my social engineering project, so I'm really sorry if this makes you hit your own forehead.
I was looking at a tutorial to scrape certain information from a certain Instagram page. Let's say, for example, I wanted to extract info from www.instagram.com/nbamemes.
I am getting a problem at line 12: "IndentationError: expected an indented block". I have googled that, but I just don't get the code. Where are the placeholders where I need to fill in info myself?
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
class insta_Scraper_v1:
def getinfo(self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all('meta', attr={'property': 'og:description'})
text = data[0]
user = '%s %s %s' % (text[-3], text[-2], text[-1])
followers = text[0]
following = text[2]
posts = text[4]
print('User:', user)
print('Followers:', followers)
print('Following:', following)
print('Posts:', posts)
print('-----------------------')
def mail(self):
self.ctx = ssl.create_default_context()
self.ctx.check_hostname = False
self.ctx.verify_mode = ssl.CERT_NONE
with open('123.txt') as f:
self.content = f.readlines()
self.content = [x.strip() for x in self.content]
for url in self.content:
self.getinfo(url)
if __name__ == '__main__'
obj = insta_Scraper_v1()
obj.mail()
I used a tutorial to write this program. However, I don't get the whole thing right; it's not completely beginner-friendly and I seem to need help. Again, sorry for this super beginner question.
Best regards,
Lev
In the future, it would be useful to share the error message produced by your code; it includes the line at which the error occurred.
Based on the code you provided, I can see that you did not indent the code inside your functions. After a function declaration (def), you need to indent all the code inside it.
So from:
def getinfo (self, url):
html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
soup = BeautifulSoup(html, 'html.parser')
data = soup.find_all ('meta', attr={'property': 'og:description'})
To:
def getinfo (self, url):
    html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.find_all ('meta', attr={'property': 'og:description'})
Indentation is the block separator in Python. Whenever you use a conditional, a loop, def, or class, you are creating a block, and to define that block you have to indent the code inside it using spaces. Usually four spaces (or a tab) are preferred, but even a single space works fine.
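For instance, a minimal illustration of how indentation delimits a block:

for i in range(3):
    print(i)       # indented: inside the loop block
print("done")      # back at the left margin: outside the block

Below is the indented code: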
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json

class insta_Scraper_v1:
    def getinfo(self, url):
        html = urllib.request.urlopen('www.instagram.com/nbamemes', context=self.ctx).read()
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.find_all('meta', attr={'property': 'og:description'})
        text = data[0]
        user = '%s %s %s' % (text[-3], text[-2], text[-1])
        followers = text[0]
        following = text[2]
        posts = text[4]
        print('User:', user)
        print('Followers:', followers)
        print('Following:', following)
        print('Posts:', posts)
        print('-----------------------')

    def mail(self):
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        with open('123.txt') as f:
            self.content = f.readlines()
        self.content = [x.strip() for x in self.content]
        for url in self.content:
            self.getinfo(url)

if __name__ == '__main__':
    obj = insta_Scraper_v1()
    obj.mail()
Ref: GeeksforGeeks: Indentation
Thanks
Hi, I wanted to create a mini crawler without using Scrapy.
I created something like this:
import requests
from bs4 import BeautifulSoup

response = requests.get(url)
homepage_link_list = []
soup = BeautifulSoup(response.content, 'lxml')
for link in soup.findAll("a"):
    if link.get("href"):
        homepage_link_list.append(link.get("href"))

link_list = []
for item in homepage_link_list:
    response = requests.get(item)
    soup = BeautifulSoup(response.content, 'lxml')
    for link in soup.findAll("a"):
        if link.get("href"):
            link_list.append(link.get("href"))
The problem I am encountering is that it only gets the links within the links of the homepage. How can I make it get all the links within all the links of the website?
You need a recursive call flow. I have written class-oriented code below; the main points are as follows:
- This implementation is depth-first.
- Keep track of already-scraped URLs so that we don't scrape them again.
- Ignore targets on a page, e.g. for http://example.com#item1, ignore #item1.
- If https://example.com has already been crawled, ignore http://example.com.
- Discard the trailing slash, e.g. if http://example.com has already been crawled, ignore http://example.com/ (the last three points come down to URL normalization; see the short demo after this list).
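As a minimal standalone sketch of that normalization (standard library only; example.com is just a placeholder), this is roughly what preprocess_url in the code below does:

import re
from urllib.parse import urljoin, urlsplit

# urljoin resolves relative hrefs against the referring page
print(urljoin("http://example.com/a/", "b.html"))  # http://example.com/a/b.html

# urlsplit splits out the fragment and path so both can be cleaned
parts = urlsplit("http://example.com/page/#item1")._asdict()
parts['fragment'] = ''                            # drop the #target
parts['path'] = re.sub(r'/$', '', parts['path'])  # drop the trailing slash
print(parts['path'])  # /page

And here is the full implementation: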
''' Scraper.
'''
import re
from urllib.parse import urljoin, urlsplit, SplitResult

import requests
from bs4 import BeautifulSoup


class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None

        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])   # remove trailing /
        fields['fragment'] = ''                              # remove targets within a page
        fields = SplitResult(**fields)

        if fields.netloc == self.domain:
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)
            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl

        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
            If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl

        print("Scraping {:s} ...".format(url))
        self.urls.add(url)
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.findAll("a"):
            childurl = self.preprocess_url(url, link.get("href"))
            if childurl:
                self.scrape(childurl)


if __name__ == '__main__':
    rscraper = RecursiveScraper("http://bbc.com")
    rscraper.scrape()
    print(rscraper.urls)
It could be that the links you want to scrape are not actually links; they could be images. Sorry for writing this as an answer, but I actually don't have enough reputation to comment.
Your code is not fetching all the links of the website because it is not recursive. You're fetching the homepage links and traversing the links available in the content of the homepage links, but you're not traversing the links you get in the content of those links you just traversed. My advice is to check out some tree traversal algorithms and develop a recursive traversal scheme according to the algorithm. The nodes of the tree will represent the links, with the root node being the link you passed in at the beginning; see the sketch below.
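For example, here is a minimal breadth-first sketch using an explicit queue (the function name, the example URL, and the max_pages cap are all illustrative; a domain filter like the one in the RecursiveScraper answer above can be added in the same place):

from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def crawl(start_url, max_pages=50):
    ''' Breadth-first traversal of the link tree rooted at start_url. '''
    visited = set()   # URLs we have already fetched
    queue = deque([start_url])
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip unreachable pages
        soup = BeautifulSoup(response.content, 'lxml')
        for link in soup.findAll("a"):
            href = link.get("href")
            if href:
                queue.append(urljoin(url, href))  # resolve relative links
    return visited

print(crawl("http://example.com"))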
This is 95% based on the answer by #coder.in.me, but let me add another piece of code here that resolves an issue I was facing.
My issue was: if you try to scrape a URL like https://www.americanexpress.com/hu-hu/, the scraper will only keep the https://www.americanexpress.com/ part of it and scrape all the Amex sites globally, but I don't need all the non-Hungarian pages.
You just need to change the
if fields.netloc == self.domain:
line to
if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
Here is the modified code:
import re
from urllib.parse import urljoin, urlsplit, SplitResult

import requests
from bs4 import BeautifulSoup


class RecursiveScraper:
    ''' Scrape URLs in a recursive manner.
    '''
    def __init__(self, url):
        ''' Constructor to initialize domain name and main URL.
        '''
        self.domain = urlsplit(url).netloc
        self.mainurl = url
        self.urls = set()

    def preprocess_url(self, referrer, url):
        ''' Clean and filter URLs before scraping.
        '''
        if not url:
            return None

        fields = urlsplit(urljoin(referrer, url))._asdict()  # convert to absolute URLs and split
        fields['path'] = re.sub(r'/$', '', fields['path'])   # remove trailing /
        fields['fragment'] = ''                              # remove targets within a page
        fields = SplitResult(**fields)

        #if fields.netloc == self.domain:
        if fields.netloc == self.domain and (fields.path.startswith('/hu-hu') or fields.path.startswith('/en-hu')):
            # Scrape pages of current domain only
            if fields.scheme == 'http':
                httpurl = cleanurl = fields.geturl()
                httpsurl = httpurl.replace('http:', 'https:', 1)
            else:
                httpsurl = cleanurl = fields.geturl()
                httpurl = httpsurl.replace('https:', 'http:', 1)
            if httpurl not in self.urls and httpsurl not in self.urls:
                # Return URL only if it's not already in list
                return cleanurl

        return None

    def scrape(self, url=None):
        ''' Scrape the URL and its outward links in a depth-first order.
            If URL argument is None, start from main page.
        '''
        if url is None:
            url = self.mainurl

        print("Scraping {:s} ...".format(url))
        try:
            response = requests.get(url)
            self.urls.add(url)
            soup = BeautifulSoup(response.content, 'lxml')
            for link in soup.findAll("a"):
                childurl = self.preprocess_url(url, link.get("href"))
                if childurl:
                    self.scrape(childurl)
        except requests.exceptions.SSLError:
            pass
        except requests.exceptions.InvalidSchema:
            pass


if __name__ == '__main__':
    rscraper = RecursiveScraper('https://www.americanexpress.com/hu-hu/')
    rscraper.scrape()
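A side note on the condition: str.startswith also accepts a tuple of prefixes, so the same filter can be written a bit more compactly (untested variation):

if fields.netloc == self.domain and fields.path.startswith(('/hu-hu', '/en-hu')):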
Thanks!