How can I parse every single page for ETH addresses from https://etherscan.io/token/generic-tokenholders2?a=0x6425c6be902d692ae2db752b3c268afadb099d3b&s=0&p=1 and then write them to a .txt file?
Okay, possibly off-topic, but I had a play around with this. (Mainly because I thought I might need to use something similar to grab stuff in future that Etherscan's APIs don't return... )
The following Python 2 code will grab what you're after. There's a hacky sleep in there to get around what I think is either something to do with how quickly the pages load, or some rate limiting imposed by Etherscan. I'm not sure which.
Data gets written to a .csv file - a text file wouldn't be much fun.
#!/usr/bin/env python
from __future__ import print_function
import os
import requests
from bs4 import BeautifulSoup
import csv
import time
RESULTS = "results.csv"
URL = "https://etherscan.io/token/generic-tokenholders2?a=0x6425c6be902d692ae2db752b3c268afadb099d3b&s=0&p="
def getData(sess, page):
    url = URL + page
    print("Retrieving page", page)
    return BeautifulSoup(sess.get(url).text, 'html.parser')

def getPage(sess, page):
    table = getData(sess, str(int(page))).find('table')
    return [[X.text.strip() for X in row.find_all('td')] for row in table.find_all('tr')]

def main():
    resp = requests.get(URL)
    sess = requests.Session()

    with open(RESULTS, 'wb') as f:
        wr = csv.writer(f, quoting=csv.QUOTE_ALL)
        wr.writerow(map(str, "Rank Address Quantity Percentage".split()))
        page = 0

        while True:
            page += 1
            data = getPage(sess, page)
            # Even pages that don't contain the data we're
            # after still contain a table.
            if len(data) < 4:
                break
            else:
                for row in data:
                    wr.writerow(row)
            time.sleep(1)

if __name__ == "__main__":
    main()
I'm sure it's not the best Python in the world.
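For anyone on Python 3, here is a rough sketch of the same approach. It is untested and assumes the generic-tokenholders2 endpoint still serves an HTML table on every page and still returns an almost-empty table once you run past the last page of holders:

#!/usr/bin/env python3
# Rough Python 3 sketch of the approach above (assumptions noted in the text).
import csv
import time

import requests
from bs4 import BeautifulSoup

URL = ("https://etherscan.io/token/generic-tokenholders2"
       "?a=0x6425c6be902d692ae2db752b3c268afadb099d3b&s=0&p={}")

def get_rows(sess, page):
    soup = BeautifulSoup(sess.get(URL.format(page)).text, "html.parser")
    table = soup.find("table")
    if table is None:  # no table at all -> nothing left to scrape
        return []
    return [[td.get_text(strip=True) for td in tr.find_all("td")]
            for tr in table.find_all("tr")]

def main():
    with requests.Session() as sess, open("results.csv", "w", newline="") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["Rank", "Address", "Quantity", "Percentage"])
        page = 1
        while True:
            rows = get_rows(sess, page)
            if len(rows) < 4:   # pages past the last one still contain a small table
                break
            writer.writerows(rows)
            page += 1
            time.sleep(1)       # same hacky politeness delay as above

if __name__ == "__main__":
    main()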
When I write to the csv file all of my data is printed in only the first column. Using my loop, how do I iterate along the columns to write the data?
import csv
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
#For sites that can't be opened due to Urllib blocker, use a Mozilla User agent to get access
pageRequest = Request('https://coronavirusbellcurve.com/', headers = {'User-Agent': 'Mozilla/5.0'})
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
specificDiv = page_soup.find("div", {"class": "table-responsive-xl"})
TbodyStats = specificDiv.table.tbody.tr.contents
TbodyDates = specificDiv.table.thead.tr.contents
with open('CovidHTML.csv','w', newline= '') as file:
    theWriter = csv.writer(file)
    theWriter.writerow(['5/4', ' 5/5', ' 5/6',' 5/7',' 5/8',' 5/9'])
    for i in range(3,len(TbodyStats)):
        if i%2 != 0:
            theWriter.writerow([TbodyStats[i].text])
Here is another method, for reference only.
from simplified_scrapy import SimplifiedDoc,utils,req
html = req.get('https://coronavirusbellcurve.com/')
doc = SimplifiedDoc(html)
specificDiv = doc.select('div.table-responsive-xl') # Get first div. If you want to get all divs, use this method: doc.selects('div.table-responsive-xl')
# TbodyStats = specificDiv.tbody.trs.selects('td|th').text # Get data
# TbodyDates = specificDiv.thead.trs.selects('td|th').text # Get date
data = specificDiv.table.trs.selects('td|th').text # Get all
rows = []
for row in data:
    rows.append(row[1:])
utils.save2csv('test.csv',rows)
Result:
5/5,5/6,5/7,5/8,5/9
1213260,1237960,1266822,1294664,1314610
24423,24700,28862,27842,19946
2.05%,2.04%,2.33%,2.20%,1.54%
I think you may be able to do this (I can't test for sure because I don't have your exact data on hand):
row = []
for i in range(3, len(TbodyStats), 2):
    row.append(TbodyStats[i].text)
    if len(row) == 6:
        theWriter.writerow(row)
        row = []
I added the step to your range so you don't have to use % to find the odd-numbered indices. Each row is built up until it has 6 members, then flushed to the CSV file and emptied so the process can repeat.
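The same chunking can also be done with a slice step instead of a counter. A small sketch, assuming TbodyStats and theWriter are the objects from the question (run inside the same with block) and that each row has six columns:

# Take every other cell starting at index 3, then write six columns at a time.
cells = [cell.text for cell in TbodyStats[3::2]]
for start in range(0, len(cells), 6):
    theWriter.writerow(cells[start:start + 6])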
I am trying to extract data from a website and have the following code, which extracts all URLs from the main category and its sub-category links.
I am now stuck on saving the extracted output with a line separator (so that each URL goes on a separate line) in a file, Medical.tsv.
I need help with this.
The code is given below:
from bs4 import BeautifulSoup
import requests
import time
import random
def write_to_file(file, mode, data, newline=None, with_tab=None): #**
    with open(file, mode, encoding='utf-8') as l:
        if with_tab == True:
            data = ''.join(data)
        if newline == True:
            data = data+'\n'
        l.write(data)

def get_soup(url):
    return BeautifulSoup(requests.get(url).content, "lxml")
url = 'http://www.medicalexpo.com/'
soup = get_soup(url)
raw_categories = soup.select('div.univers-main li.category-group-item a')
category_links = {}
for cat in raw_categories:
    t0 = time.time()
    response_delay = time.time() - t0  # Wait 10x longer than the site took to respond.
    time.sleep(10 * response_delay)    # If the site gets overwhelmed and slows down, the code automatically backs off.
    time.sleep(random.randint(2, 5))   # A random pause of 2 to 5 seconds, crawling more like a human than a bot.
    soup = get_soup(cat['href'])
    links = soup.select('#category-group li a')
    category_links[cat.text] = [link['href'] for link in links]
print(category_links)
You have the write_to_file function, but you never call it. The mode has to be w or w+ (if you want to overwrite in case the file already exists).
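For example, here is a minimal sketch of how it could be called once category_links has been filled, writing one category and URL per line to Medical.tsv (the tab between them is just an assumption about the format you want):

# Truncate the file once, then append one line per URL using the
# write_to_file helper defined in the question.
write_to_file('Medical.tsv', 'w', '')
for category, links in category_links.items():
    for link in links:
        write_to_file('Medical.tsv', 'a', [category, '\t', link],
                      newline=True, with_tab=True)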
I've been using Beautiful Soup to iterate through pages, but for whatever reason I can't get the loop to advance beyond the first page. It seems like it should be easy because it's just a text string, but it seems to loop back; maybe it's my structure rather than my text string?
Here's what I have:
import csv
import urllib2
from bs4 import BeautifulSoup
f = open('nhlstats.csv', "w")
groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015","2014","2013","2012"]
for yr in year:
for gr in groups:
url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
#www.espn.com/nhl/statistics/player/_/stat/points/year/2014/
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])
for i in range(0,pageliteral):
number = int(((i*40) + 1))
URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")
for tr in soup.select("#my-players-table tr[class*=player]"):
row =[]
for ob in range(1,15):
player_info = tr('td')[ob].get_text(strip=True)
row.append(player_info)
f.write(str(yr) +","+",".join(row) + "\n")
f.close()
This gets the same first 40 records over and over.
I tried using this solution as an if, and did find that doing
prevLink = soup.select('a[rel="nofollow"]')[0]
newurl = "http:" + prevLink.get('href')
did work better, but I'm not sure how to write the loop in such a way that it advances. I'm possibly just tired, but my loop still only goes to the next set of records and then gets stuck there. Please help me fix my loop.
UPDATE
My formatting was lost in the copy-paste; my actual code looks like:
import csv
import urllib2
from bs4 import BeautifulSoup

f = open('nhlstats.csv', "w")

groups=['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']

year = ["2016", "2015","2014","2013","2012"]

for yr in year:
    for gr in groups:
        url = "http://www.espn.com/nhl/statistics/player/_/stat/points/year/"+str(yr)
        #www.espn.com/nhl/statistics/player/_/stat/points/year/2014/

page = urllib2.urlopen(url)
soup=BeautifulSoup(page, "html.parser")

pagecount = soup.findAll(attrs= {"class":"page-numbers"})[0].string
pageliteral = int(pagecount[5:])

for i in range(0,pageliteral):
    number = int(((i*40) + 1))
    URL = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/"+str(yr) + "/count/"+str(number)
    page = urllib2.urlopen(url)
    soup=BeautifulSoup(page, "html.parser")
    for tr in soup.select("#my-players-table tr[class*=player]"):
        row =[]
        for ob in range(1,15):
            player_info = tr('td')[ob].get_text(strip=True)
            row.append(player_info)
        f.write(str(yr) +","+",".join(row) + "\n")

f.close()
Your code indenting was mostly at fault. Also, it would be wise to actually use the csv library you imported; it will automatically wrap the player names in quotes so that any commas inside them don't break the CSV structure.
This works by looking for the link to the next page and extracting the starting count, which is then used to build the next page request. If no next page can be found, it moves on to the next year group. Note that the count is not a page count but a starting entry count.
import csv
import urllib2
from bs4 import BeautifulSoup
groups= ['points', 'shooting', 'goaltending', 'defensive', 'timeonice', 'faceoffs', 'minor-penalties', 'major-penalties']
year = ["2016", "2015", "2014", "2013", "2012"]
with open('nhlstats.csv', "wb") as f_output:
    csv_output = csv.writer(f_output)

    for yr in year:
        for gr in groups:
            start_count = 1

            while True:
                #print "{}, {}, {}".format(yr, gr, start_count) # show progress
                url = "http://www.espn.com/nhl/statistics/player/_/stat/points/sort/points/year/{}/count/{}".format(yr, start_count)
                page = urllib2.urlopen(url)
                soup = BeautifulSoup(page, "html.parser")

                for tr in soup.select("#my-players-table tr[class*=player]"):
                    row = [yr]
                    for ob in range(1, 15):
                        player_info = tr('td')[ob].get_text(strip=True)
                        row.append(player_info)
                    csv_output.writerow(row)

                try:
                    start_count = int(soup.find(attrs= {"class":"page-numbers"}).find_next('a')['href'].rsplit('/', 1)[1])
                except:
                    break
Using with will also automatically close your file at the end.
This would give you a csv file starting as follows:
2016,"Patrick Kane, RW",CHI,82,46,60,106,17,30,1.29,287,16.0,9,17,20
2016,"Jamie Benn, LW",DAL,82,41,48,89,7,64,1.09,247,16.6,5,17,13
2016,"Sidney Crosby, C",PIT,80,36,49,85,19,42,1.06,248,14.5,9,10,14
2016,"Joe Thornton, C",SJ,82,19,63,82,25,54,1.00,121,15.7,6,8,21
You are changing the URL many times before you open it for the first time, due to an indentation error. Try this:
for gr in groups:
    url = "...some_url..."
    page = urllib2.urlopen(url)
    ...everything else should be indented....
My goal is to scrape data from the PGA website to extract all the golf course locations in the USA. I aim to scrape from the 907 pages the name, address, ownership, phone number, and website.
I have created the script below, but when the CSV is created it has problems: it contains repeated data from the first few pages of the website, and it does not contain the data from all 907 pages.
How can I fix my script so that it will scrape all 907 pages and produce a CSV with all the golf courses listed on the PGA website?
Below is my script:
import csv
import requests
from bs4 import BeautifulSoup
for i in range(907):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    g_data2 = soup.find_all("div", {"class": "views-field-nothing"})
    courses_list = []

    for item in g_data2:
        try:
            name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
        except:
            name = ''
        try:
            address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
        except:
            address1 = ''
        try:
            address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
        except:
            address2 = ''
        try:
            website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
        except:
            website = ''
        try:
            Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
        except:
            Phonenumber = ''

        course = [name, address1, address2, website, Phonenumber]
        courses_list.append(course)

    with open('PGA_Data.csv', 'a') as file:
        writer = csv.writer(file)
        for row in courses_list:
            writer.writerow(row)
Here is the code that you want. It first parses the current page before going on to the next one. (There are some blank rows; I hope you can fix that yourself.)
import csv
import requests
from bs4 import BeautifulSoup
def encode(l):
    out = []
    for i in l:
        text = str(i).encode('utf-8')
        out.append(''.join([i if ord(i) < 128 else ' ' for i in text])) # taken from Martijn Pieters' answer
        # http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space/20078869#20078869
    return out

courses_list = []
for i in range(5):  # Number of pages plus one
    url = "http://www.pga.com/golf-courses/search?page={}&searchbox=Course+Name&searchbox_zip=ZIP&distance=50&price_range=0&course_type=both&has_events=0".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    g_data2 = soup.find_all("div", {"class": "views-field-nothing"})

    for item in g_data2:
        try:
            name = item.contents[1].find_all("div", {"class": "views-field-title"})[0].text
        except:
            name = ''
        try:
            address1 = item.contents[1].find_all("div", {"class": "views-field-address"})[0].text
        except:
            address1 = ''
        try:
            address2 = item.contents[1].find_all("div", {"class": "views-field-city-state-zip"})[0].text
        except:
            address2 = ''
        try:
            website = item.contents[1].find_all("div", {"class": "views-field-website"})[0].text
        except:
            website = ''
        try:
            Phonenumber = item.contents[1].find_all("div", {"class": "views-field-work-phone"})[0].text
        except:
            Phonenumber = ''

        course = [name, address1, address2, website, Phonenumber]
        courses_list.append(encode(course))

with open('PGA_Data.csv', 'a') as file:
    writer = csv.writer(file)
    for row in courses_list:
        writer.writerow(row)
EDIT: After the inevitable problems of unicode encoding/decoding, I have modified the answer and it will (hopefully) work now. But see http://nedbatchelder.com/text/unipain.html.
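For what it's worth, on Python 3 the encode() workaround shouldn't be needed at all. A sketch of just the writing step, assuming courses_list is built the same way as above:

import csv

# Python 3: open the file in text mode with an explicit encoding and let the
# csv module handle unicode; newline='' avoids blank rows on Windows.
with open('PGA_Data.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(courses_list)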
I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs; they consistently sit between the same starting and ending tags. If anyone out there would like to help an amateur Python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information so it can be easily copied and pasted into an Excel document for subsequent data analysis!
For example, let's say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be supplied to the program, or built in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the output, I think you can store it in a CSV file.
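For example, here is a minimal sketch that reuses the regex above (Python 2, like the rest of this answer; the output filename and URL list are just placeholders):

import csv
import re
import urllib2

urls = ['http://www.youtube.com/watch?v=QOdW1OuZ1U0']  # add more URLs here
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)

with open('view_counts.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'view_count'])
    for url in urls:
        data = urllib2.urlopen(url).read()
        match = pattern.search(data)
        writer.writerow([url, match.group(1) if match else ''])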
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use module-level objects to store the data... I'm sure this is the wrong way of doing it, but it has worked for me in several projects in the past.
import urllib2
from HTMLParser import HTMLParser
import csv

position = []
results = [""]

class hp(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'watch-view-count ') in attrs:
            position.append('bingo')
    def handle_endtag(self, tag):
        if tag == 'span' and 'bingo' in position:
            position.remove('bingo')
    def handle_data(self, data):
        if 'bingo' in position:
            results[0] += " " + data.strip() + " "

my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]

data = []
for url in my_pages:
    response = urllib2.urlopen(url)
    page = str(response.read())
    parser = hp()
    parser.feed(page)
    data.append(results[0])
    # reinitialize the module-level lists for the next page
    position = []
    results = [""]

index = 0
with open('/path/to/test.csv', 'wb') as f:
    writer = csv.writer(f)
    header = ['url', 'output']
    writer.writerow(header)
    for d in data:
        row = [my_pages[index], data[index]]
        writer.writerow(row)
        index += 1
Then just open /path/to/test.csv in Excel