I am working on my first project with APIs and I am having trouble accessing the data. I am working off an example that calls its data with this loop:
for item in data['objects']:
    print item['name'], item['phone']
This works great for data stored as nested dictionaries (the outside being called objects, and the inside containing the data).
The issue I am having is that my data is formatted as dictionaries inside of a list:
[
{
"key":"2014cama",
"website":"http://www.cvrobotics.org/frc/regional.html",
"official":true,
"end_date":"2014-03-09",
"name":"Central Valley Regional",
"short_name":"Central Valley",
"facebook_eid":null,
"event_district_string":null,
"venue_address":"Madera South High School\n705 W. Pecan Avenue\nMadera, CA 93637\nUSA",
"event_district":0,
"location":"Madera, CA, USA",
"event_code":"cama",
"year":2014,
"webcast":[],
"timezone":"America/Los_Angeles",
"alliances":[],
"event_type_string":"Regional",
"start_date":"2014-03-07",
"event_type":0
},'more data...']
so calling
for item in data['objects']:
    print item['name']
won't work to pull the value stored in name.
Any help would be much appreciated.
Edit: The full Dataset I'm pulling (http://www.thebluealliance.com/api/v2/team/frc254/2014/events?X-TBA-App-Id=Peter_Hartnett:Scouting:v1)
And the code I am running:
import json,urllib2, TBA
team ='frc254'
year = '2014'
Url = 'http://www.thebluealliance.com/api/v2/team/'+team+'/'+year+'/events?X- TBA-App-Id=Peter_Hartnett:Scouting:v1'
data = TBA.GetData(Url)
for item in data:
    print item['name']
The TBA class just fetches the data and returns it.
Edit2:
Here is the TBA class that pulls the data; I can assure you it is identical to the one found at the link above.
import urllib2, cookielib

content = 'none'

def GetData(Url):
    site = Url
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    req = urllib2.Request(site, headers=hdr)
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print e.fp.read()
    content = page.read()
    return content
If I understood it correctly, your data is now a plain list of entries rather than a dict keyed by 'objects', right?
So just iterate over the list and your logic will remain the same:
for item in objects:
    print item['name'], item['phone']
with objects being:
objects = [
{
"key":"2014cama",
"website":"http://www.cvrobotics.org/frc/regional.html",
"official":true,
"end_date":"2014-03-09",
"name":"Central Valley Regional",
"short_name":"Central Valley",
"facebook_eid":null,
"event_district_string":null,
"venue_address":"Madera South High School\n705 W. Pecan Avenue\nMadera, CA 93637\nUSA",
"event_district":0,
"location":"Madera, CA, USA",
"event_code":"cama",
"year":2014,
"webcast":[],
"timezone":"America/Los_Angeles",
"alliances":[],
"event_type_string":"Regional",
"start_date":"2014-03-07",
"event_type":0
},'more data...']
Edit
I get your problem now. Your data is a string that represents a JSON array. You should load it before iterating, so you can work with it as a real list, like so:
data = GetData(Url)
loaded_array = json.loads(data)
for item in loaded_array:
    print item['name']
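Putting the pieces together for the original TBA example, that means decoding the string that GetData returns before looping over it. A minimal sketch (keeping the question's Python 2 style; TBA is the helper module from Edit2 and the URL is the one from the question):
import json
import TBA  # the GetData helper from Edit2

url = 'http://www.thebluealliance.com/api/v2/team/frc254/2014/events?X-TBA-App-Id=Peter_Hartnett:Scouting:v1'
raw = TBA.GetData(url)      # GetData returns the response body as a string
events = json.loads(raw)    # now a real Python list of dicts
for event in events:
    print event['name']     # e.g. Central Valley Regional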
Related
I have a web scraping script written in Python, and when I run it against a website it blocks me and says "you getting the page very fast. you might be a bot".
I tried adding time.sleep() to delay the code, but it always gets blocked. Is there any way to make this code a little slower?
I'm not sure why it says so. Isn't it the same as viewing the page in a browser? What does a browser load that keeps it from being labelled a bot, while my script is?
from bs4 import BeautifulSoup
import re
import requests
import time
import sys
import csv

FIXED_WEB = "web.net"

def load_car_pages(seq, limit, i):
    time.sleep(10)
    html_web = requests.get(
        f"web.net/homepage",
        headers={
            'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0',
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            'Accept-Language': "en-US,en;q=0.5",
            'Accept-Encoding': "gzip, deflate",
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Te': 'trailers'
        }).text

    time.sleep(10)
    sup_me_patate = BeautifulSoup(html_web, 'lxml')
    headers = sup_me_patate.find_all('div', class_='sui-AtomCard-info')  # find headers
    print(f"{headers}")

    for a in headers:
        string = str(a)
        href_pos = [m.start() for m in re.finditer('href=', string)]
        for pos in href_pos:
            slicing = string[pos + 6: string.find('"', pos + 6)]
            print(f"For Link: {slicing}")
            web_link = FIXED_WEB + slicing
            print(f"LINK: {web_link}")

            # limit = 25
            # i = 0
            time.sleep(10)
            try:
                car_web = requests.get(web_link, headers={
                    'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0',
                    'Origin': FIXED_WEB,
                    "Access-Control-Request-Method": "GET",
                    'Accept-Language': "en-US,en;q=0.5",
                    'Accept-Encoding': "gzip, deflate",
                    'Request-Domain': 'web.net',
                    'Site': 'car',
                    'Referer': web_link,
                    "Sec-Fetch-Dest": "empty",
                    "Sec-Fetch-Mode": "cors",
                    "Sec-Fetch-Site": "same-origin",
                    "Te": "trailers",
                    'Connection': 'close'}).text

                soup = BeautifulSoup(web_link, "lxml")
                # with open(soup.title.string + ".html", 'w') as coolhtml:
                #     string = str(soup)
                #     coolhtml.write(string)
                # sys.exit(0)

                name = soup.find_all('h2',
                                     class_="mt-TitleBasic-title mt-TitleBasic-title--xs mt-TitleBasic-title--black")
                address = soup.find('p', class_="mt-CardUser-location").text
                phone_number = soup.find('span', class_='mt-LeadPhoneCall-linkText mt-LeadPhoneCall-linkText--small').text

                j = 0
                for b in name:
                    if j == 8:
                        real_name = b.text
                        print(b.text)
                    j += 1

                # some constants
                NAME = real_name
                ADDRESS = address
                PHONE_NUMBER = phone_number

                header = ['Name', 'Address', 'Phone Number']
                data = [ADDRESS, PHONE_NUMBER, NAME]
                with open("info.csv", 'a', encoding='UTF8') as csv_numbers:
                    writer = csv.writer(csv_numbers)
                    writer.writerow(data)

                i += 1
                print(i)
                if i == limit:
                    print("it prints...")
                    limit += 35
                    seq += 1
                    load_car_pages(seq, limit, i)
            except Exception as ACX:
                print(f"Bro Exception occurred::{ACX}...")
                # continue

def main():
    # get_car_links()
    load_car_pages(0, 35, 0)

main()
You're asking too many overloaded questions all at once (even though they're somewhat related in your particular context). I'll only answer the one in your title: How to make a web scraper more human-like?
That question is too open-ended to be definitively answered. New methods of bot detection will continue to be developed, as well as ways to bypass them.
That being said, here are a couple of highlights off the top of my head:
Browsers send & receive a lot of metadata, like user agent, headers, cookies, runtime JavaScript, etc. Bare HTTP requests look very different from that.
Browser automation systems behave very differently from humans by default: they don't really use the mouse, they click buttons instantly at their exact centers, etc.
Browser automation detection and detection bypass is a rabbit hole: Can a website detect when you are using Selenium with chromedriver?
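To make the first two points a bit more concrete, here is a minimal sketch (not a guaranteed bypass; the example.com URLs and header values are placeholders) that reuses one session so cookies persist the way they would in a real browser, and randomizes the delay instead of sleeping a fixed 10 seconds:
import random
import time
import requests

# Reuse one session so cookies set by the site persist across requests,
# as they would in a real browser visit.
session = requests.Session()
session.headers.update({
    'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0',
    'Accept-Language': 'en-US,en;q=0.5',
})

urls = ['https://example.com/page/1', 'https://example.com/page/2']  # placeholders
for url in urls:
    response = session.get(url, timeout=30)
    print(url, response.status_code)
    # Humans do not pause exactly 10 seconds between every click; add jitter.
    time.sleep(random.uniform(5, 15))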
I wrote a parser that should parse exchange rates, but there is one final touch missing.
Code:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.google.com/search?sxsrf=ALeKk02hYi-HCGXbHdPuek-VJRu_8qsUVg%3A1587054998453&ei=lomYXvaSG7zAmwWP_LHQBA&q=%D0%B4%D0%BE%D0%BB%D0%BB%D0%B0%D1%80+%D0%B3%D1%80%D0%B8%D0%B2%D0%BD%D0%B0&oq=&gs_lcp=CgZwc3ktYWIQARgBMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnOgQIABBHSgkIFxIFMTAtMjRKCAgYEgQxMC0yUPFtWPFtYKt8aAFwAngAgAEAiAEAkgEAmAEAoAEBqgEHZ3dzLXdperABCg&sclient=psy-ab'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/80.0.3987.163 Safari/537.36', 'accept': '*/*'}
def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="VgAgW")
    currency = []
    for item in items:
        currency.append({
            'uah': item.find('span', class_='SwHCTb').get_text(strip=True),
        })
    print(f"'Now the course:' + {currency}")
    return currency

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print('Error')

parse()
I don’t know how to remove this: [{'uah':}]
Here is what comes out:
'Now the course:' + [{'uah': '27,22'}]
Process finished with exit code 0
currency is a list (currency = []), so when you print a list it always prints like this: [].
It is a list of dicts ({'uah': ...}), so when you print a dict it always prints like this: {key: value}.
Looks like you need print(f"Now the course: {currency[0]['uah']}"), where [0] takes the first element of the list (which is a dict) and ['uah'] then gets the value from that dict by its key.
You can add an additional variable course to make it easier to access the value:
course = item.find('span', class_='SwHCTb').get_text(strip=True)
currency.append({'uah': course})
print(f"Now the course: {course}")
I wrote this code but got the error "IndexError: list index out of range" after running the last line. How do I fix this?
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
content = response.content
soup = BeautifulSoup(content,"html.parser")
top_rest = soup.find_all("div",attrs={"class": "sc-bblaLu dOXFUL"})
list_tr = top_rest[0].find_all("div",attrs={"class": "sc-gTAwTn cKXlHE"})
list_rest =[]
for tr in list_tr:
    dataframe = {}
    dataframe["rest_name"] = (tr.find("div", attrs={"class": "res_title zblack bold nowrap"})).text.replace('\n', ' ')
    dataframe["rest_address"] = (tr.find("div", attrs={"class": "nowrap grey-text fontsize5 ttupper"})).text.replace('\n', ' ')
    dataframe["cuisine_type"] = (tr.find("div", attrs={"class": "nowrap grey-text"})).text.replace('\n', ' ')
    list_rest.append(dataframe)
list_rest
You are receiving this error because top_rest is empty when you attempt to get its first element with top_rest[0]. The reason is that the first class you're attempting to reference is dynamically named. You will notice that if you refresh the page, the div at the same location will not have the same name, so when you attempt to scrape you get empty results.
An alternative would be to scrape ALL divs and then narrow in on the elements you want. Be mindful of the dynamic div naming scheme: from one request to another you may get different results:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/bangalore/top-restaurants",headers=headers)
content = response.content
soup = BeautifulSoup(content,"html.parser")
top_rest = soup.find_all("div")
list_tr = top_rest[0].find_all("div",attrs={"class": "bke1zw-1 eMsYsc"})
list_tr
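If you want the script to fail with a clear message instead of an IndexError the next time the class names change, a small guard before indexing can help. This is just a sketch built on the class names and soup object from the question:
top_rest = soup.find_all("div", attrs={"class": "sc-bblaLu dOXFUL"})
# find_all returns an empty list when nothing matches, so indexing [0]
# would raise IndexError; check first and report instead.
if not top_rest:
    print("No matching container found - the dynamic class names have probably changed.")
else:
    list_tr = top_rest[0].find_all("div", attrs={"class": "sc-gTAwTn cKXlHE"})
    print("Found {} restaurant blocks".format(len(list_tr)))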
I recently did a project that required scraping Zomato's website for Manila, Philippines. I used the geopy library to get the longitude and latitude values of Manila City, then scraped the restaurants' details using this information.
ADD: You can get your own API key on the Zomato website to make up to 1000 calls a day.
# Use geopy library to get the latitude and longitude values of Manila City.
from geopy.geocoders import Nominatim

address = 'Manila City, Philippines'
geolocator = Nominatim(user_agent='Makati_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Makati City are {}, {}.'.format(latitude, longitude))

# Use Zomato's API to make calls
import requests
import numpy as np
import pandas as pd

headers = {'user-key': '617e6e315c6ec2ad5234e884957bfa4d'}
venues_information = []

# foursquare_venues: a DataFrame of venues prepared earlier in the original project (not shown here)
for index, row in foursquare_venues.iterrows():
    print("Fetching data for venue: {}".format(index + 1))
    venue = []
    url = ('https://developers.zomato.com/api/v2.1/search?q={}' +
           '&start=0&count=1&lat={}&lon={}&sort=real_distance').format(row['name'], row['lat'], row['lng'])
    try:
        result = requests.get(url, headers=headers).json()
    except:
        print("There was an error...")
    try:
        if len(result['restaurants']) > 0:
            venue.append(result['restaurants'][0]['restaurant']['name'])
            venue.append(result['restaurants'][0]['restaurant']['location']['latitude'])
            venue.append(result['restaurants'][0]['restaurant']['location']['longitude'])
            venue.append(result['restaurants'][0]['restaurant']['average_cost_for_two'])
            venue.append(result['restaurants'][0]['restaurant']['price_range'])
            venue.append(result['restaurants'][0]['restaurant']['user_rating']['aggregate_rating'])
            venue.append(result['restaurants'][0]['restaurant']['location']['address'])
            venues_information.append(venue)
        else:
            venues_information.append(np.zeros(6))
    except:
        pass

ZomatoVenues = pd.DataFrame(venues_information,
                            columns=['venue', 'latitude',
                                     'longitude', 'price_for_two',
                                     'price_range', 'rating', 'address'])
Using Web Scraping Language I was able to write this:
GOTO https://www.zomato.com/bangalore/top-restaurants
EXTRACT {'rest_name': '//div[@class="res_title zblack bold nowrap"]',
         'rest_address': '//div[@class="nowrap grey-text fontsize5 ttupper"]',
         'cusine_type': '//div[@class="nowrap grey-text"]'} IN //div[@class="bke1zw-1 eMsYsc"]
This will iterate over each record element with class bke1zw-1 eMsYsc and pull each restaurant's information.
I have these beginnings of a Python pandas script that searches for values on Google and grabs any PDF links it can find on the first page.
I have two questions, listed below.
import pandas as pd
from bs4 import BeautifulSoup
import urllib2
import re
df = pd.DataFrame(["Shakespeare", "Beowulf"], columns=["Search"])
print "Searching for PDFs ..."
hdr = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive"}
def crawl(search):
    google = "http://www.google.com/search?q="
    url = google + search + "+" + "PDF"
    req = urllib2.Request(url, headers=hdr)
    pdf_links = None
    placeholder = None  # just a column placeholder
    try:
        page = urllib2.urlopen(req).read()
        soup = BeautifulSoup(page)
        cite = soup.find_all("cite", attrs={"class": "_Rm"})
        for link in cite:
            all_links = re.search(r".+", link.text).group().encode("utf-8")
            if all_links.endswith(".pdf"):
                pdf_links = re.search(r"(.+)pdf$", all_links).group()
                print pdf_links
    except urllib2.HTTPError, e:
        print e.fp.read()
    return pd.Series([pdf_links, placeholder])

df[["PDF links", "Placeholder"]] = df["Search"].apply(crawl)
df.to_csv(FileName, index=False, delimiter=",")
The results from print pdf_links will be:
davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
sparks.eserver.org/books/shakespeare-tempest.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
www.w3.org/People/maxf/.../hamlet.pdf
calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
www.yorku.ca/inpar/Beowulf_Child.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
https://is.muni.cz/el/1441/.../2._Beowulf.pdf
www.penguin.com/static/pdf/.../beowulf.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
www.neshaminy.org/cms/lib6/.../380/text.pdf
sparks.eserver.org/books/beowulf.pdf
And the csv output will look like:
Search PDF Links
Shakespeare calhoun.k12.il.us/teachers/wdeffenbaugh/.../Shakespeare%20Sonnets.pdf
Beowulf sparks.eserver.org/books/beowulf.pdf
Questions:
1. Is there a way to write all of the results as rows to the csv instead of just the bottom one? And if possible, include the value in Search for each row that corresponds to "Shakespeare" or "Beowulf"?
2. How can I write out the full pdf links without long links automatically abbreviating with "..."?
This will get you all the proper pdf links using soup.find_all("a",href=True) and save them in a Dataframe and to a csv:
hdr = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "Accept-Encoding": "none",
    "Accept-Language": "en-US,en;q=0.8",
    "Connection": "keep-alive"}

def crawl(columns=None, *search):
    df = pd.DataFrame(columns=columns)
    for term in search:
        google = "http://www.google.com/search?q="
        url = google + term + "+" + "PDF"
        req = urllib2.Request(url, headers=hdr)
        try:
            page = urllib2.urlopen(req).read()
            soup = BeautifulSoup(page)
            pdfs = []
            links = soup.find_all("a", href=True)
            for link in links:
                lk = link["href"]
                if lk.endswith(".pdf"):
                    pdfs.append((term, lk))
            df2 = pd.DataFrame(pdfs, columns=columns)
            df = df.append(df2, ignore_index=True)
        except urllib2.HTTPError, e:
            print e.fp.read()
    return df

df = crawl(["Search", "PDF link"], "Shakespeare", "Beowulf")
df.to_csv("out.csv", index=False)
out.csv:
Search,PDF link
Shakespeare,http://davidlucking.com/documents/Shakespeare-Complete%20Works.pdf
Shakespeare,http://www.w3.org/People/maxf/XSLideMaker/hamlet.pdf
Shakespeare,http://sparks.eserver.org/books/shakespeare-tempest.pdf
Shakespeare,https://phillipkay.files.wordpress.com/2011/07/william-shakespeare-plays.pdf
Shakespeare,http://www.artsvivants.ca/pdf/eth/activities/shakespeare_overview.pdf
Shakespeare,http://triggs.djvu.org/djvu-editions.com/SHAKESPEARE/SONNETS/Download.pdf
Beowulf,http://www.yorku.ca/inpar/Beowulf_Child.pdf
Beowulf,https://is.muni.cz/el/1441/podzim2013/AJ2RC_STAL/2._Beowulf.pdf
Beowulf,http://teacherweb.com/IL/Steinmetz/MottramM/Beowulf---Seamus-Heaney.pdf
Beowulf,http://www.penguin.com/static/pdf/teachersguides/beowulf.pdf
Beowulf,http://www.neshaminy.org/cms/lib6/PA01000466/Centricity/Domain/380/text.pdf
Beowulf,http://www.sparknotes.com/free-pdfs/uscellular/download/beowulf.pdf
To get PDF links, you're looking for these selectors:
for result in soup.select('.tF2Cxc'):
    # check if PDF is present via according CSS class OR use try/except instead
    if result.select_one('.ZGwO7'):
        pdf_file = result.select_one('.yuRUbf a')['href']
CSS selectors reference. Have a look at SelectorGadget Chrome extension to grab CSS selectors by clicking on the desired element in your browser.
To save them to CSV, you're looking for this:
# store all links from a for loop
pdfs = []
# create PDF Link column and append PDF links from a pdfs list()
df = pd.DataFrame({'PDF Link': pdfs})
# save to csv and delete default pandas index column. Done!
df.to_csv('PDFs.csv', index=False)
Code and example in the online IDE (also shows how to save locally):
from bs4 import BeautifulSoup
import requests, lxml
import pandas as pd
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "best lasagna recipe:pdf"
}
html = requests.get('https://www.google.com/search', headers=headers, params=params)
soup = BeautifulSoup(html.text, 'lxml')
pdfs = []
for result in soup.select('.tF2Cxc'):
    # check if PDF is present via according CSS class
    if result.select_one('.ZGwO7'):
        pdf_file = result.select_one('.yuRUbf a')['href']
        pdfs.append(pdf_file)
# creates PDF Link column and appends PDF links from a pdfs list()
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('Bs4_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Alternatively, you can achieve the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that rather than creating everything from scratch, figuring out why certain things don't work as expected, and then maintaining it over time, all you need to do is iterate over structured JSON and get the data you want. It can also be more readable and quicker to understand what's going on inside the code.
Code to integrate with your example:
from serpapi import GoogleSearch
import os
import pandas as pd
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "best lasagna recipe:pdf",
"hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
pdfs = []
# iterate over organic results and check if .pdf file type exists in link
for result in results['organic_results']:
    if '.pdf' in result['link']:
        pdf_file = result['link']
        pdfs.append(pdf_file)
df = pd.DataFrame({'PDF Link': pdfs})
df.to_csv('SerpApi_PDFs.csv', index=False)
-----------
# from CSV
'''
PDF Link
http://www.bakersedge.com/PDF/Lasagna.pdf
http://greatgreens.ca/recipes/Recipe%20-%20Worlds%20Best%20Lasagna.pdf
https://liparifoods.com/wp-content/uploads/2015/10/lipari-foods-holiday-recipes.pdf
...
'''
Disclaimer: I work for SerpApi.
I have Python code that sends a POST request to a website, reads the response, and filters it. For the POST data I used ('number', '11111') and it works perfectly. However, I want to create a txt file that contains 100 different numbers such as 1111,2222,3333,4444... and then send a POST request for each of them. Can you help me do this in Python?
import urllib
from bs4 import BeautifulSoup
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Origin': 'http://mahmutesat.com/python.aspx',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17',
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': 'http://mahmutesat.com/python.aspx',
'Accept-Encoding': 'gzip,deflate,sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
}
class MyOpener(urllib.FancyURLopener):
    version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
myopener = MyOpener()
url = 'http://mahmutesat.com/python.aspx'
# first HTTP request without form data
f = myopener.open(url)
soup = BeautifulSoup(f)
# parse and retrieve the vital hidden form values
viewstate = soup.select("#__VIEWSTATE")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
viewstategenerator = soup.select("#__VIEWSTATEGENERATOR")[0]['value']
formData = (
('__EVENTVALIDATION', eventvalidation),
('__VIEWSTATE', viewstate),
('__VIEWSTATEGENERATOR',viewstategenerator),
('number', '11111'),
('Button', 'Sorgula'),
)
encodedFields = urllib.urlencode(formData)
# second HTTP request with form data
f = myopener.open(url, encodedFields)
soup = BeautifulSoup(f.read())
name=soup.findAll('input',{'id':'name_field'})
for eachname in name:
    print eachname['value']
If your file has data:
"sample.txt"
1111,2222,3333,4444,5555,6666,7777,8888,......(and so on)
To read the file contents, you can use the file open operation:
import itertools
#open the file for read
with open("sample.txt", "r") as fp:
values = fp.readlines()
#Get the values split with ","
data = [map(int, line.split(",")) for line in values]
numbers = list(itertools.chain(*data)) #Ensuring if its having many lines then concatenate
Now, use it as:
for number in numbers:
    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('__VIEWSTATEGENERATOR', viewstategenerator),
        ('number', str(number)),  # Here you use the number obtained
        ('Button', 'Sorgula'),
    )
    encodedFields = urllib.urlencode(formData)

    # second HTTP request with form data
    f = myopener.open(url, encodedFields)
    soup = BeautifulSoup(f.read())

    name = soup.findAll('input', {'id': 'name_field'})
    for eachname in name:
        print eachname['value']
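As a quick sanity check (assuming sample.txt contains the comma-separated numbers shown above), you can confirm what ended up in numbers before sending any requests:
print numbers[:4]      # [1111, 2222, 3333, 4444]
print len(numbers)     # how many POST requests will be sent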
1 - Here is an example of how to create a file:
f = open('test.txt','w')
This will open the test.txt file for writing ('w'), or create it if it does not exist yet. If it already has data, it will be erased; if you want to append instead, write f = open('test.txt','a'). Note that this happens in your current working directory; if you want the file in a specific directory, include the full path with the file name, for example:
f = open('C:\\Python\\test.txt','w')
2 - Then write/append to this file the data you want, example:
for i in range(1,101):
    f.write(str(i*1111)+'\n')
This will write 100 numbers as strings, from 1111 to 111100.
3 - You should always close the file at the end:
f.close()
4 - Now if you want to read from this file 'test.txt':
f = open('C:\\Python\\test.txt','r')
for i in f:
    print i,
f.close()
This is as simple as it can be. You should read more about file I/O in Python here:
https://docs.python.org/2.7/tutorial/inputoutput.html#reading-and-writing-files
Make sure you select the right Python version in these docs.
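Putting steps 1-4 together, here is a minimal sketch that writes the 100 numbers and reads them back, using with blocks so the files are closed automatically (test.txt is just the example name from above):
# Steps 1-3: create the file and write the numbers 1111, 2222, ..., 111100
with open('test.txt', 'w') as f:
    for i in range(1, 101):
        f.write(str(i * 1111) + '\n')

# Step 4: read the numbers back, one per line
with open('test.txt', 'r') as f:
    numbers = [int(line) for line in f if line.strip()]

print numbers[:3]   # [1111, 2222, 3333]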
Using a dictionary, you can deal with multiple requests very easily.
import requests
values = {
'__EVENTVALIDATION': event_validation,
'__LASTFOCUS': '',
'__VIEWSTATE': view_state,
'__VIEWSTATEGENERATOR': '6264FB8D',
'ctl00$ContentPlaceHolder1$ButGet': 'Get Report',
'ctl00$ContentPlaceHolder1$Ddl_Circles': 'All Circles',
'ctl00$ContentPlaceHolder1$Ddl_Divisions': '-- Select --',
'ctl00$ContentPlaceHolder1$TxtTin': tin_num,
'ctl00$ContentPlaceHolder1$dropact': 'all'
}
headers_1 = {
'Origin': 'https://www.apct.gov.in',
'User-Agent': user_agent,
'Cookie': cookie_1,
'Accept-Encoding': 'gzip, deflate, br',
'Referer': url_1,
'Content-Type': 'application/x-www-form-urlencoded',
'Upgrade-Insecure-Requests': '1'
}
try:
    req = requests.post(url_1, data=values, headers=headers_1)