Instagram Web scraping followers - python

I am trying to extract the followers of an arbitrary Instagram page. I tried to use Python in combination with Beautiful Soup, but so far I have not been able to get any follower information out of the page.
import json
import time

import requests

def get_user_info(user_name):
    url = "https://www.instagram.com/" + user_name + "/?__a=1"
    try:
        r = requests.get(url)
    except requests.exceptions.ConnectionError:
        print('Seems like dns lookup failed..')
        time.sleep(60)
        return None
    if r.status_code != 200:
        print('User: ' + user_name + ' status code: ' + str(r.status_code))
        print(r)
        return None
    info = json.loads(r.text)
    return info['user']

get_user_info("wernergruener")
As mentioned, I do not get the followers of the page. How could I do this?
Cheers,
Andi

With API/JSON:
I'm not familiar with the Instagram API, but it doesn't look like it returns detailed information about a person's followers, just the number of followers.
You should be able to get that information using info["user"]["followed_by"]["count"].
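For example, a minimal usage sketch building on the function above (the exact layout of the ?__a=1 JSON has changed over time, so treat the key names as an assumption to verify):

# Hypothetical usage of get_user_info() above; the "followed_by" key layout
# is an assumption about the ?__a=1 JSON and may differ on current Instagram.
user = get_user_info("wernergruener")
if user is not None:
    print("Followers:", user["followed_by"]["count"])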
With raw page/Beautiful Soup:
Assuming the non-API page reveals the information you want about a person's followers, you'll want to download the raw HTML (instead of JSON) and parse it using Beautiful Soup.
import time

import requests
from bs4 import BeautifulSoup

def get_user_info(user_name):
    url = "https://www.instagram.com/" + user_name
    try:
        r = requests.get(url)
    except requests.exceptions.ConnectionError:
        print('Seems like dns lookup failed..')
        time.sleep(60)
        return None
    if r.status_code != 200:
        print('User: ' + user_name + ' status code: ' + str(r.status_code))
        print(r)
        return None
    soup = BeautifulSoup(r.text, 'html.parser')
    # find things using Beautiful Soup

get_user_info("wernergruener")
Beautiful Soup has some of the most intuitive documentation I've ever read. I'd start there:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
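For instance, here is a minimal sketch of that approach. It assumes the profile page exposes the follower count in an og:description meta tag (which it did at the time, but Instagram changes its markup regularly, so verify this before relying on it):

import re

import requests
from bs4 import BeautifulSoup

def get_follower_count(user_name):
    # Assumption: the profile HTML contains a tag like
    # <meta property="og:description" content="123 Followers, 45 Following, ...">
    r = requests.get("https://www.instagram.com/" + user_name)
    soup = BeautifulSoup(r.text, 'html.parser')
    meta = soup.find('meta', property='og:description')
    if meta is None:
        return None
    match = re.search(r'([\d.,km]+)\s+Followers', meta['content'], re.IGNORECASE)
    return match.group(1) if match else None

print(get_follower_count("wernergruener"))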
With API/python-instagram:
Other people have already done a lot of the heavy lifting for you. I think python-instagram should offer you easier access to the information you want.

Related

Trouble web scraping streaming sites

I'm trying to make a web scraper using Python and I'm facing one issue. Every streaming site I try to inspect doesn't let me inspect its HTML code when I'm on the episodes page. It sends me back to the home page whenever I open Element Inspector. Any help?
I'm new to web scraping, so I don't know another way of doing this.
import requests
from bs4 import BeautifulSoup

# THIS PROJECT IS JUST TO SCRAPE ZORO.TO AND DOWNLOAD ANIME USING IT

# Make a request and return the HTML content of a webpage
def getWebpage(url):
    r = requests.get(url)
    return BeautifulSoup(r.content, 'html5lib')

# Search for the title and get its page URL
def getWatchUrl(titleName):
    keyword = ""
    # Transform the anime title into a URL as on the website
    for word in titleName.split(sep=" "):
        keyword += '+' + word
    keyword = keyword[1:]
    SearchURL = f'https://zoro.to/search?keyword={keyword}'

    # Get the HTML contents of the website
    try:
        soup = getWebpage(SearchURL)
    except Exception as e:
        print(f"Unexpected Error {e}. Please try again!")
        return None

    # Separate the useful content (anime titles and links)
    try:
        titles = soup.findAll('div', {'class': 'film-poster'}, limit=None)
    except Exception as e:
        print(e)
        print("Couldn't find title. Check spellings and please try again!")
        return None

    # Search the content for the anime title and extract its link
    for title in titles:
        for content in title:
            try:
                if titleName in content.get('title').lower():
                    path = content.get('href').replace('?ref=search', '')
                    watchURL = f'https://zoro.to/watch{path}'
                    return watchURL
            except Exception:
                continue

    print("Couldn't find title. Check spellings and please try again!")
    return None

# Get the direct download links from the web page
def getDownloadUrl(watchUrl):
    try:
        soup = getWebpage(watchUrl)
        print(soup.prettify())
    except Exception as e:
        print(f"Unexpected Error {e}. Please try again!")
        return None

def main():
    animeName = input("Enter Anime Name: ").lower()
    watchURL = getWatchUrl(animeName)
    if watchURL is not None:
        getDownloadUrl(watchURL)

if __name__ == "__main__":
    main()

GET request to an API

I use this API:
https://www.blockchain.com/api/q
I'm trying to make a GET request:
url = 'https://www.blockchain.info/api/q/getreceivedbyaddress/' + strpod + '?confirmations=6'
zapros = requests.get(url)
But it returns the entire page, and I only need the balance value. Please help me.
import requests
address = "17LREmmnmTxCoFZ59wfhg4S639GsPqjTRT"
URL = "https://blockchain.info/q/getreceivedbyaddress/"+address+"?confirmations=6"
r = requests.get(url = URL)
# extract the balance (in satoshi)
bt_balance = r.json()
The API link is not wrong; please check it with your blockchain address.
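If r.json() ever trips over the response, note that the /q/ endpoints return the amount as a plain-text number of satoshis, so a slightly more defensive sketch can convert from r.text instead (the 1e8 satoshi-per-BTC conversion is standard):

import requests

address = "17LREmmnmTxCoFZ59wfhg4S639GsPqjTRT"
url = "https://blockchain.info/q/getreceivedbyaddress/" + address + "?confirmations=6"
r = requests.get(url)
satoshi = int(r.text)  # the /q/ endpoints return a bare number
print(satoshi / 1e8)   # 1 BTC = 100,000,000 satoshi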

Extract images from an HTML file using Python standard libraries

So, I'm trying to write a script that parses through an HTML file, finds all the images, and saves those images into another folder. How would one accomplish this using only the libraries that come with Python 3 when you install it on your computer? I currently have this script that I would like to build on.
import datetime
import os

import requests

# zendesk, language and credentials are defined earlier in the original script
date = datetime.date.today()
backup_path = os.path.join(str(date), language)
if not os.path.exists(backup_path):
    os.makedirs(backup_path)

log = []
endpoint = zendesk + '/api/v2/help_center/en-us/articles.json'
while endpoint:
    response = requests.get(endpoint, auth=credentials)
    if response.status_code != 200:
        print('Failed to retrieve articles with error {}'.format(response.status_code))
        exit()
    data = response.json()
    for article in data['articles']:
        if article['body'] is None:
            continue
        title = '<h1>' + article['title'] + '</h1>'
        filename = '{id}.html'.format(id=article['id'])
        with open(os.path.join(backup_path, filename), mode='w', encoding='utf-8') as f:
            f.write(title + '\n' + article['body'])
        print('{id} copied!'.format(id=article['id']))
        log.append((filename, article['title'], article['author_id']))
    endpoint = data['next_page']
This is a script I found on a zendesk forum that basically backs up our articles on Zendesk.
Try using Beautiful Soup to retrieve all the img nodes, and for each node use urllib to download the picture.
from bs4 import BeautifulSoup
import urllib.request

# note: use response.text here to get the raw html
soup = BeautifulSoup(response.text, 'html.parser')
# get the src of all images
img_source = [x.get('src') for x in soup.find_all('img')]
# download the images (urlretrieve saves each one to a local file)
images = [urllib.request.urlretrieve(x) for x in img_source]
And you probably need to add some error handling and change it a bit to fit your page, but the idea remains the same.
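Since the question asks for the standard library only, here is a rough sketch using html.parser and urllib.request instead of Beautiful Soup (the output folder name and the assumption that the src attributes are absolute URLs are mine):

# A minimal standard-library-only sketch (Python 3).
import os
import urllib.request
from html.parser import HTMLParser

class ImgParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.sources = []

    def handle_starttag(self, tag, attrs):
        # collect the src attribute of every <img> tag
        if tag == 'img':
            src = dict(attrs).get('src')
            if src:
                self.sources.append(src)

def save_images(html_text, out_dir='images'):
    os.makedirs(out_dir, exist_ok=True)
    parser = ImgParser()
    parser.feed(html_text)
    for i, src in enumerate(parser.sources):
        # assumes src is an absolute URL; relative src values would need
        # urllib.parse.urljoin(page_url, src) first
        urllib.request.urlretrieve(src, os.path.join(out_dir, 'image_{}.jpg'.format(i)))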

Python POST request for USPTO site scraping

I'm trying to scrape data from the http://portal.uspto.gov/EmployeeSearch/ web site.
I open the site in a browser, click on the Search button inside the Search by Organisation part of the site, and look at the request being sent to the server.
When I post the same request using the Python requests library in my program, I don't get the result page I'm expecting; I get the same Search page back, with no employee data on it.
I've tried all variants, and nothing seems to work.
My question is: what URL should I use in my request? Do I need to specify headers (I tried that too, copying the headers shown in Firefox developer tools for the request) or something else?
Below is the code that sends the request:
import requests
from bs4 import BeautifulSoup

def scrape_employees():
    URL = 'http://portal.uspto.gov/EmployeeSearch/searchEm.do;jsessionid=98BC24BA630AA0AEB87F8109E2F95638.prod_portaljboss4_jvm1?action=displayResultPageByOrgShortNm&currentPage=1'
    response = requests.post(URL)
    site_data = response.content
    soup = BeautifulSoup(site_data, "html.parser")
    print(soup.prettify())

if __name__ == '__main__':
    scrape_employees()
All the data you need is in a form tag:
the action attribute is the URL you POST to;
the input elements are the data you need to post to the server, as {name: value} pairs.
import re
import urllib.parse

import bs4
import requests

def make_soup(url):
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    return soup

def get_form(soup):
    form = soup.find(name='form', action=re.compile(r'OrgShortNm'))
    return form

def get_action(form, base_url):
    action = form['action']
    # action is a relative url, convert it to an absolute url
    abs_action = urllib.parse.urljoin(base_url, action)
    return abs_action

def get_form_data(form, org_code):
    data = {}
    for inp in form('input'):
        # if the value is empty, put the org_code in this field
        data[inp['name']] = inp['value'] or org_code
    return data

if __name__ == '__main__':
    url = 'http://portal.uspto.gov/EmployeeSearch/'
    soup = make_soup(url)
    form = get_form(soup)
    action = get_action(form, url)
    data = get_form_data(form, '1634')
    # make the post request to the action url using the form data
    r = requests.post(action, data=data)
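To see whether the POST actually returned employee rows, you could parse r.text the same way. A small follow-up sketch (dumping every table is my own debugging idea, not something taken from the site):

import bs4

def dump_tables(html_text):
    # Print every table row found in the response; which table actually
    # holds the employee data is an assumption to verify by hand.
    soup = bs4.BeautifulSoup(html_text, 'lxml')
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if cells:
                print(cells)

# e.g. after the post above:
# dump_tables(r.text)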

Web scraping and 403 Forbidden: my web scraper is blocked by a website, what should I do to make requests?

I wrote a script to pull data from a website, but after several runs it starts returning 403 Forbidden when I make a request.
What should I do about this issue?
My code is below:
import requests, bs4
import csv

links = []
with open('1-432.csv', 'rb') as urls:
    reader = csv.reader(urls)
    for i in reader:
        links.append(i[0])

info = []
nbr = 1
for url in links:
    # Problem is here.
    sub = []
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    start = soup.find('em')
    forname = soup.find_all('b')
    name = []
    for b in forname:
        name.append(b.text)
    name = name[7]
    sub.append(name.encode('utf-8'))
    for b in start.find_next_siblings('b'):
        if b.text in ('Category:', 'Website:', 'Email:', 'Phone'):
            sub.append(b.next_sibling.strip().encode('utf-8'))
    info.append(sub)
    print('Page ' + str(nbr) + ' is saved')
    with open('Canada_info_4.csv', 'wb') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for u in info:
            wr.writerow(u)
    nbr += 1
What should I do to keep making requests to the website?
An example URL is http://www.worldhospitaldirectory.com/dr-bhandare-hospital/info/43225
Thanks.
There are a bunch of different things that could be the problem, and depending on what their blacklisting policy is, it might be too late to fix.
At the very least, scraping like this is generally considered to be dick behavior. You're hammering their server. Try putting a time.sleep(10) inside your main loop.
Secondly, try setting your user agent; there is a small sketch of both ideas below.
A better solution though would be to see if they have an API you can use.
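A minimal sketch of those two suggestions applied to the loop above (the User-Agent string and the 10-second delay are just illustrative values):

import time

import requests

# Illustrative headers; any realistic browser User-Agent string will do.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

for url in links:
    r = requests.get(url, headers=headers)
    # ... parse the page as before ...
    time.sleep(10)  # pause between requests so you don't hammer the server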
