Python: How to stay logged in while scraping?

Just to clarify from the beginning: I'm a total beginner (I wrote something in Python for the first time today). This was more a case of following a guide and trying to remember what I did 7 years ago when I tried learning Java than anything else.
I wanted to scrape the image tags from a website (to plot them later), but I have to stay logged in to view all images. After I got the scraping down I noticed that some tags were blocked, which is how the login issue came up. I now manage to log in, but it doesn't work outside of the session itself, which makes the rest of my code useless. Can I get this to work, or do I have to give up?
This is the working login:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

login_data = {
    'user': 'theusername',
    'pass': 'thepassword',
    'op': 'Log in'
}

with requests.Session() as s:
    url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
    r = s.get(url)
    r = s.post(url, data=login_data)
And this is what I had working before to scrape the website, but with the login missing:
filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)
pid = 0
actual_page = 1
while pid < 150:
url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
print(url)
client = urlopen(url)
page_html = client.read()
client.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div",{"class":"thumbnail-preview"})
print("Current pid: " + str(pid))
for container in containers:
tags = container.span.a.img["title"]
f.write(tags.replace(" ", "\n") + "\n")
pid = pid + 42
print("Current page: " + str(actual_page))
actual_page += 1
print("Done.")
f.close()
Out comes a list of every tag used by high res images.
I hope I don't offend anyone with this.
Edit: The code is working now; I had a typo in the cookies:
import requests
from bs4 import BeautifulSoup as soup

login_data = {
    'user': 'myusername',
    'pass': 'mypassword',
    'op': 'Log in'
}

s = requests.Session()

print("\n\n\n\n\n")

filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)

pid = 0
actual_page = 1

while pid < 42:
    url2 = "https://thiswebsite.com/index.php?page=post&s=list&tags=rating:questionable&pid=" + str(pid)
    r = s.get(url2, cookies={'duid': 'somehash', 'user_id': 'my userid', 'pass_hash': 'somehash'})
    page_html = str(r.content)
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    print("\nCurrent page: " + str(actual_page) + " Current pid: " + str(pid) + "\nDone.")
    actual_page += 1
    pid = pid + 42

f.close()

You are currently using two different libraries for making web requests: requests and urllib. I would opt for using only requests.
Also, don't use Session() as a context manager here. A context manager performs cleanup after you leave the indented block; that is what the with ... as x syntax you use on the requests.Session() object is for. In the context of requests, that cleanup closes the session, so your cookies are gone as soon as you leave the with block. (I assume login is managed by cookies on this site.)
Instead, keep the session in a variable that you can reuse, since the session stores the cookies you receive at login and you need those for every subsequent request.
s = requests.Session()
url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
r = s.get(url) # do you need this request?
r = s.post(url, data=login_data)
Also make the subsequent calls in the loop through the same session:
client = s.get(url)
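Putting the two pieces together, a minimal sketch could look like the following. The URLs, form fields, and the thumbnail-preview class are taken from the question; whether the site accepts a plain form post like this is an assumption.

import requests
from bs4 import BeautifulSoup

# Form fields copied from the question; adjust to what the login form actually expects.
login_data = {
    'user': 'theusername',
    'pass': 'thepassword',
    'op': 'Log in'
}

s = requests.Session()
login_url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
s.post(login_url, data=login_data)  # cookies from the login are now stored on s

with open("taglist.txt", "w", encoding="utf-8") as f:
    f.write("tags\n")
    for pid in range(0, 150, 42):
        url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
        r = s.get(url)  # same session, so the login cookies are sent automatically
        page_soup = BeautifulSoup(r.text, "html.parser")
        for container in page_soup.find_all("div", {"class": "thumbnail-preview"}):
            tags = container.span.a.img["title"]
            f.write(tags.replace(" ", "\n") + "\n")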

Related

Web scraping keeps getting blocked even after specifying a proxy server

I am scraping the website craigslist.com, but after a certain number of requests it keeps blocking my device. I tried the solution in Proxies with Python 'Requests' module, but didn't understand how to specify the headers every time. Here's the code:
from bs4 import BeautifulSoup
import requests, json

list_of_tuples_with_given_zipcodes = []
id_of_apartments = []

params = {
    'sort': 'dd',
    'filter': 'reviews-dd',
    'res_id': 18439027
}

http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"

proxies = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

for i in range(1, 30):
    content = requests.get('https://losangeles.craigslist.org/search/apa?s=' + str(i), params=params)  # https://losangeles.craigslist.org/search/apa?s=120
    # content = requests.get('https://www.zillow.com/homes/for_rent/')
    soup = BeautifulSoup(content.content, 'html.parser')
    my_anchors = list(soup.find_all("a", {"class": "result-image gallery"}))
    for index, each_anchor_tag in enumerate(my_anchors):
        URL_to_look_for_zipcode = soup.find_all("a", {"class": "result-title"})  # taking set so that a page is not visited twice.
        for each_href in URL_to_look_for_zipcode:
            content_href = requests.get(each_href['href'])  # script id="ld_posting_data" type="application/ld+json">
            # print(each_href['href'])
            soup_href = BeautifulSoup(content_href.content, 'html.parser')
            my_script_tags = soup_href.find("script", {"id": "ld_posting_data"})
            # for each_tag in my_script_tags:
            if my_script_tags:
                res = json.loads(str(list(my_script_tags)[0]))
                if res and 'address' in list(res.keys()):
                    if res['address']['postalCode'] == "90012":  # use the input zipcode entered by the user.
                        list_of_tuples_with_given_zipcodes.append(each_href['href'])
I am still not sure about the value of the http_proxy variable. I specified it as given in that answer, but should it be the IP address of my device mapped to the localhost port number? My requests still keep getting blocked.
Please help.
requests' get method lets you specify the proxies to use for a single call:
r = requests.get(url, headers=headers, proxies=proxies)
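Applied to the loop from the question, that could look roughly like this. The proxy addresses are placeholders copied from the question and must point at proxy servers you actually have access to, and the User-Agent value is only an example.

import requests

# Placeholder proxies: replace with real proxy servers you control or rent.
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "https://10.10.1.11:1080",
}

# Example User-Agent header; any reasonably current browser string can be used.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}

for i in range(1, 30):
    url = 'https://losangeles.craigslist.org/search/apa?s=' + str(i)
    # Every request in the loop is routed through the proxy with the custom headers.
    content = requests.get(url, headers=headers, proxies=proxies)
    print(content.status_code)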

python requests-html Chromium process leaking

My program cannot run through the entire loop because a leak crashes it before it gets to the end.
I have the following script:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import requests

for x in range(9376, 23534):
    session = HTMLSession()
    r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
    r.html.render()  # this call executes the js in the page
    soup = BeautifulSoup(r.html.html, features="lxml")
    r.close()
    print(x)

    name = "\n".join([img['alt'] for img in soup.find_all('img', alt=True)])
    name = name[1:]
    name = name[:-1]

    url = "\n".join([img['src'] for img in soup.find_all('img', alt=True)])

    def solve_fast(s):
        ind1 = s.find('\n')
        ind2 = s.rfind('\n')
        return s[ind1+1:ind2]

    url = solve_fast(url)
    url = url[0:41] + "1" + url[41+1:]
    url = url[0:42] + "2" + url[42+1:]
    url = url[0:43] + "8" + url[43+1:]

    img_data = requests.get(url)
    with open('local_database1/{}{}.avif'.format(x, name), 'wb') as handler:
        handler.write(img_data.content)
    img_data.close()
When run in a loop, the Chromium processes stack up until the program crashes. I can't see where I am failing to close the connection.
In my case, calling session.close() works for me:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://xxxxxxxx')
r.html.render()
...
session.close()
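Applied to the loop from the question (URL and range taken from there), a minimal sketch would close the session at the end of every iteration, so each Chromium instance is torn down before the next one starts:

from requests_html import HTMLSession
from bs4 import BeautifulSoup

for x in range(9376, 23534):
    session = HTMLSession()
    try:
        r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
        r.html.render()  # spawns a Chromium process to execute the page's JS
        soup = BeautifulSoup(r.html.html, features="lxml")
        # ... process soup and download the image as in the question ...
    finally:
        session.close()  # shuts down the Chromium process started by render()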

Scraping multiple pages with beautifulsoup4 using python 3.6.3

I am trying to loop through multiple pages and my code doesn't extract anything. I am kind of new to scraping so bear with me. I made a container so I can target each listing. I also made a variable to target the anchor tag that you would press to go to the next page. I would really appreciate any help I could get. Thanks.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

for page in range(0, 25):
    file = "breakfeast_chicago.csv"
    f = open(file, "w")
    Headers = "Nambusiness_name, business_address, business_city, business_region, business_phone_number\n"
    f.write(Headers)

    my_url = 'https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}'.format(page)
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs each listing
    containers = page_soup.findAll("div", {"class": "result"})

    new = page_soup.findAll("a", {"class": "next ajax-page"})

    for i in new:
        try:
            for container in containers:
                b_name = i.find("container.h2.span.text").get_text()
                b_addr = i.find("container.p.span.text").get_text()
                city_container = container.findAll("span", {"class": "locality"})
                b_city = i.find("city_container[0].text ").get_text()
                region_container = container.findAll("span", {"itemprop": "postalCode"})
                b_reg = i.find("region_container[0].text").get_text()
                phone_container = container.findAll("div", {"itemprop": "telephone"})
                b_phone = i.find("phone_container[0].text").get_text()
                print(b_name, b_addr, b_city, b_reg, b_phone)
                f.write(b_name + "," + b_addr + "," + b_city.replace(",", "|") + "," + b_reg + "," + b_phone + "\n")
        except: AttributeError

f.close()
If using BS4, try find_all.
Try dropping into a trace using import pdb; pdb.set_trace() and debug what is actually being selected in the for loop.
Also, some content may be hidden if it is loaded via JavaScript.
Each anchor tag or href you would "click" is just another network request, and if you plan to follow those links, consider slowing down the rate of requests so you don't get blocked.
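As a rough sketch of both suggestions, reusing the URL pattern from the question:

import time
import pdb
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

for page in range(0, 25):
    my_url = 'https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}'.format(page)
    page_soup = soup(uReq(my_url).read(), "html.parser")
    containers = page_soup.findAll("div", {"class": "result"})
    pdb.set_trace()  # inspect `containers` interactively to see what was actually selected
    time.sleep(2)    # pause between page requests so you don't get blocked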
You can try the script below. It will traverse the different pages through pagination and collect the name and phone number from each container.
import requests
from bs4 import BeautifulSoup

my_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

for link in [my_url.format(page) for page in range(1, 5)]:
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".info"):
        try:
            name = item.select(".business-name [itemprop='name']")[0].text
        except Exception:
            name = ""
        try:
            phone = item.select("[itemprop='telephone']")[0].text
        except Exception:
            phone = ""
        print(name, phone)

Google news crawler flip pages

Continuing on previous work to crawl all news results about a query and return the title and URL, I am refining the crawler to get all results from all pages in Google News. The current code seems to return only the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
My code is below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd

query2Google = input("What do you want from Google News?\n")

def QGN(query2Google):
    s = '"' + query2Google + '"'  # Keywords for query
    s = s.replace(" ", "+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'  # csv filename
    f = open(filename, "wb")
    url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y"  # URL for query of news results within one year, sorted by date
    # htmlpage = urllib2.urlopen(url).read()
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: " + str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text, 'lxml')
    df = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        # print("-----Title----\n" + str(a_click.renderContents()))  # Title
        # print("----URL----\n" + str(a_click.get("href")))  # URL
        # print("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))  # Brief
        # print("Done")
        df = np.append(df, [str(a_click.renderContents()).strip("b'"), str(a_click.get("href")).strip('/url?q='), str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])
    df = np.reshape(df, (-1, 3))
    df1 = pd.DataFrame(df, columns=['Title', 'URL', 'Brief'])
    print("Search Crawl Done!")
    df1.to_csv(filename, index=False, encoding='utf-8')
    f.close()
    return

QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still, you can modify your script with a for loop if you want to get a fixed number of pages, or a while loop if you want to get all pages.
Example:
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
for next in range(0, pages*10, 10) :
page = url + str(next)
time.sleep(randint(1, 5)) # you may need longer than that #
htmlpage = requests.get(page) # you should add User-Agent and Referer #
print("Status code: " + str(htmlpage.status_code))
if htmlpage.status_code != 200 :
break # something went wrong #
soup = BeautifulSoup(htmlpage.text, 'lxml')
... process response here ...
next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
if next_page is None or next_page.a is None :
break # there are no more pages #
Keep in mind that Google doesn't like bots; you might get banned.
You could add 'User-Agent' and 'Referer' headers to simulate a web browser, and use time.sleep(random.uniform(2, 6)) to simulate a human... or use Selenium.
You can also add &num=25 to the end of your query and you'll get back a page with that number of results; in this example you'll get back 25 Google results per page.
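A minimal sketch combining these suggestions; the header values are only examples, and Google may still block automated requests or change its markup at any time:

import time
import random
import requests

# Example headers; any reasonably current browser User-Agent string can be used.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "http://www.google.com.sg/",
}

s = '"python"'  # example query, built the same way as in the question
url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y&num=25&start="

for start in range(0, 100, 25):  # four pages of 25 results each
    time.sleep(random.uniform(2, 6))  # pause like a human between requests
    htmlpage = requests.get(url + str(start), headers=headers)
    if htmlpage.status_code != 200:
        break  # blocked or no more results
    # ... parse htmlpage.text with BeautifulSoup as before ...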

Python - save requests or BeautifulSoup object locally

I have some code that is quite long, so it takes a long time to run. I want to simply save either the requests object (in this case "name") or the BeautifulSoup object (in this case "soup") locally so that next time I can save time. Here is the code:
from bs4 import BeautifulSoup
import requests
url = 'SOMEURL'
name = requests.get(url)
soup = BeautifulSoup(name.content)
Since name.content is just HTML, you can just dump this to a file and read it back later.
Usually the bottleneck is not the parsing, but instead the network latency of making requests.
from bs4 import BeautifulSoup
import requests

url = 'https://google.com'
name = requests.get(url)

# name.content is bytes, so write it out in binary mode
with open("/tmp/A.html", "wb") as f:
    f.write(name.content)

# read it back in
with open("/tmp/A.html") as f:
    soup = BeautifulSoup(f, "html.parser")

# do something with soup
Here is some anecdotal evidence that the bottleneck is in the network.
from bs4 import BeautifulSoup
import requests
import time

url = 'https://google.com'

t1 = time.perf_counter()
name = requests.get(url)
t2 = time.perf_counter()
soup = BeautifulSoup(name.content, "html.parser")
t3 = time.perf_counter()

print(t2 - t1, t3 - t2)
Output from running on a ThinkPad X1 Carbon with a fast campus network:
0.11 0.02
Storing requests locally and restoring them as a BeautifulSoup object later on
If you are iterating through the pages of a web site, you can store each page with requests as explained here.
Create a folder soupCategory in the same folder as your script.
Use any recent user agent for the headers:
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15'}
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def getCategorySoup():
    session = requests.Session()
    retry = Retry(connect=7, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    basic_url = "https://www.somescrappingdomain.com/apartments?adsWithImages=1&page="
    t0 = time.time()
    j = 0
    totalPages = 1525  # put your number of pages here
    for i in range(1, totalPages):
        url = basic_url + str(i)
        r = requests.get(url, headers=headers)
        pageName = "./soupCategory/" + str(i) + ".html"
        with open(pageName, mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
            f.write(r.text)
        print(pageName, end=" ")
    t1 = time.time()
    total = t1 - t0
    print("Total time for getting ", totalPages, " category pages is ", round(total), " seconds")
    return
Later on, you can create the BeautifulSoup object as @merlin2011 mentioned, with:
with open("./soupCategory/1.html") as f:
    soup = BeautifulSoup(f, "html.parser")
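To restore all of the stored pages later, a short loop over the saved files could look like this (a sketch, assuming the file naming used by getCategorySoup above):

from bs4 import BeautifulSoup

def readCategorySoups(totalPages=1525):
    soups = []
    for i in range(1, totalPages):
        # Each file was written by getCategorySoup as ./soupCategory/<page>.html
        with open("./soupCategory/" + str(i) + ".html", encoding='UTF-8') as f:
            soups.append(BeautifulSoup(f, "html.parser"))
    return soups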
