I have this Python code, but when I run it, it prints out only the first target. Here is my code:
def get_next_target(S):
    start_link = S.find('<a href=')
    start_quote = S.find('"', start_link)
    end_quote = S.find('"', start_quote + 1)
    url = S[start_quote + 1:end_quote]
    print url
    return url, end_quote

get_next_target(S)
where variable S = '<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href="www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'
What I want is to print all three targets, but it only prints the first one. Why is that?
I think you should use BeautifulSoup to extract info from html/xml.
In [1]: from bs4 import BeautifulSoup
In [2]: html = '''<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href=
...: "www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'''
In [3]: soup = BeautifulSoup(html, 'lxml')
In [4]: for a in soup.find_all('a'):
...: print(a['href'])
...:
www.target1.com
www.target2.com
www.target3.com
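For what it's worth, the reason your original code prints only one URL is that get_next_target is called exactly once; it returns end_quote precisely so the caller can feed it the rest of the string. Below is a minimal sketch of such a driver loop (the helper name is arbitrary), reusing your function unchanged and written in the same Python 2 style as your snippet:

def print_all_targets(page):
    # Keep handing get_next_target the remainder of the string until no link is left.
    while page.find('<a href=') != -1:
        url, end_quote = get_next_target(page)
        page = page[end_quote + 1:]

print_all_targets(S)

Since your get_next_target already prints each url, this loop prints all three targets.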
If you want to achieve this without using any external module, the following code will do it:
import re
import sys

S = '<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href="www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'
abc = []

def get_next_target(S):
    search_index = [i.start() for i in re.finditer('<a href=', S)]
    for j in range(len(search_index)):
        if j == len(search_index) - 1:
            A = S[search_index[j]:len(S)]
            search_start_index = A.find('"')
            search_end_index = A.rfind('"')
            start_final = search_index[j] + search_start_index + 1
            start_end = search_index[j] + search_end_index
            final_result = S[start_final:start_end]
            abc.append(final_result)
            print abc
        else:
            A = S[search_index[j]:search_index[j + 1]]
            search_start_index = A.find('"')
            search_end_index = A.rfind('"')
            start_final = search_index[j] + search_start_index + 1
            start_end = search_index[j] + search_end_index
            final_result = S[start_final:start_end]
            abc.append(final_result)

get_next_target(S)
Note: If you don't want to append the results to a list, replace the abc.append(final_result) (and print abc) lines in the if and else branches with print final_result.
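If all you need are the quoted URLs and a regex is acceptable, here is a much shorter sketch, assuming every target sits inside <a href="..."> exactly as in your sample string:

import re

S = '<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href="www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'
# The non-greedy group captures everything between the quotes after each <a href=
targets = re.findall(r'<a href="(.*?)"', S)
print targets  # ['www.target1.com', 'www.target2.com', 'www.target3.com']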
So what I want to do is basically this: I have a list of URLs with multiple parameters, such as:
https://www.somesite.com/path/path2/path3?param1=value1&param2=value2
and what I would want to get is something like this:
https://www.somesite.com/path/path2/path3?param1=PAYLOAD&param2=value2
https://www.somesite.com/path/path2/path3?param1=value1&param2=PAYLOAD
I want to iterate through every parameter (basically every match of "=" and "&") and replace each value one at a time. Thank you in advance.
from urllib.parse import urlparse
import re

urls = ["https://www.somesite.com/path/path2/path3?param1=value1&param2=value2&param3=value3",
        "https://www.anothersite.com/path/path2/path3?param1=value1&param2=value2&param3=value3"]

parseds = [urlparse(url) for url in urls]
newurls = []
for parsed in parseds:
    params = parsed[4].split("&")
    for i, param in enumerate(params):
        newparam = re.sub("=.+", "=PAYLOAD", param)
        newurls.append(
            parsed[0] +
            "://" +
            parsed[1] +
            parsed[2] +
            "?" +
            parsed[4].replace(param, newparam)
        )
newurls is then:
['https://www.somesite.com/path/path2/path3?param1=PAYLOAD&param2=value2&param3=value3',
 'https://www.somesite.com/path/path2/path3?param1=value1&param2=PAYLOAD&param3=value3',
 'https://www.somesite.com/path/path2/path3?param1=value1&param2=value2&param3=PAYLOAD',
 'https://www.anothersite.com/path/path2/path3?param1=PAYLOAD&param2=value2&param3=value3',
 'https://www.anothersite.com/path/path2/path3?param1=value1&param2=PAYLOAD&param3=value3',
 'https://www.anothersite.com/path/path2/path3?param1=value1&param2=value2&param3=PAYLOAD']
I've solved it:
from urllib.parse import urlparse

url = "https://github.com/search?p=2&q=user&type=Code&name=djalel"
parsed = urlparse(url)
query = parsed.query
params = query.split("&")
new_query = []
for param in params:
    l = params.index(param)
    param = str(param.split("=")[0]) + "=" + "PAYLOAD"
    params[l] = param
    new_query.append("&".join(params))
    params = query.split("&")

for query in new_query:
    print(str(parsed.scheme) + '://' + str(parsed.netloc) + str(parsed.path) + '?' + query)
Output:
https://github.com/search?p=PAYLOAD&q=user&type=Code&name=djalel
https://github.com/search?p=2&q=PAYLOAD&type=Code&name=djalel
https://github.com/search?p=2&q=user&type=PAYLOAD&name=djalel
https://github.com/search?p=2&q=user&type=Code&name=PAYLOAD
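A slightly more robust variant, offered only as a sketch (it assumes Python 3 and plain key=value pairs), is to let urllib.parse split and re-encode the query string instead of splitting on "&" by hand:

from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse

def payload_variants(url, payload="PAYLOAD"):
    # One output URL per parameter, with that parameter's value replaced by the payload.
    parsed = urlparse(url)
    params = parse_qsl(parsed.query, keep_blank_values=True)
    variants = []
    for i, (key, _) in enumerate(params):
        mutated = params[:i] + [(key, payload)] + params[i + 1:]
        variants.append(urlunparse(parsed._replace(query=urlencode(mutated))))
    return variants

for u in payload_variants("https://github.com/search?p=2&q=user&type=Code&name=djalel"):
    print(u)

One thing to keep in mind is that urlencode percent-encodes the payload, so a plain marker like PAYLOAD survives unchanged but special characters would not.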
So I am using append to extend my list of scraped apartments. In this code I run into a problem, because I have created a second for loop to change the page on the website. The first for loop gives a new page to the next for loop to scrape, but when it's done with one page it just overwrites the last list. What am I doing wrong?
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list)  # Tried to make this work... Think one list should be enough.
And the functions:
def get_content_value(info_list_data):
    if info_list_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
                for li in info_list_data.find_all("li")]
    else:
        return info_list_data.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
Last:
def get_apartment_data(url):
    r = requests.get(url)
    soup = bs(r.content)
    all_info_list = soup.find_all(class_="CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy")
    for info_list in all_info_list:
        info_list.prettify()
    info = {}
    for index, info_list in enumerate(all_info_list):
        content_key = info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-4__3RH7g ItemHeader__itemHeader__32xAv").get_text(" ", strip=True)
        content_value = get_content_value(info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-8__2jfMv CompactInfoRow__content__3jGt4"))
        info[content_key] = content_value
    return info
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list.copy())
You need to use the copy() method to make an independent copy. Otherwise x only stores a reference to the same apartment_list object, so whenever apartment_list changes, the entry inside x changes with it, like twin lists.
More generally:
x = []
lst = [1,2,3]
x.append(lst)
print (x)
lst[0] = 0
x.append(lst)
print (x)
Output:
[[1,2,3]]
[[0,2,3],[0,2,3]]
The right way is:
x = []
lst = [1,2,3]
x.append(lst.copy())
print (x)
lst[0] = 0
x.append(lst.copy())
print (x)
Output:
[[1,2,3]]
[[1,2,3],[0,2,3]]
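One more thing worth checking, purely as a guess from the indentation in your question: if x = [] and apartment_list = [] sit inside the page loop, they are re-created for every page, so only the last page's data can survive no matter what you append. A sketch with the accumulator hoisted out of the loop (assuming url, requests, bs and get_apartment_data are set up as in your snippet):

x = []  # results for all pages accumulate here
for page in range(1, 4):
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    apartment_list = []  # fresh list for this page only
    for index, apartment in enumerate(apartments):
        if index == 2:  # just to not scrape every item
            break
        full_path = "https://www.etuovi.com" + apartment.a['href']
        apartment_list.append(get_apartment_data(full_path))
    x.append(apartment_list)

Because apartment_list is a brand-new list on each pass, no copy() is needed here.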
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2

uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data, rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/', start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/', start)
            name_start = data.find('>', main) + 1
            name_end = data.find('<', name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">', name_end) + 2
            team_end = data.find('<', team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >', team_end) + 2
            gp_end = data.find('<', gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >', gp_end) + 2
            mpg_end = data.find('<', mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">', mpg_end) + 2
            pts_end = data.find('<', pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >', pts_end) + 2
            mf_end = data.find('<', mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >', mf_end) + 2
            fg_end = data.find('<', fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >', fg_end) + 2
            m3_end = data.find('<', m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >', m3_end) + 2
            p3_end = data.find('<', p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >', p3_end) + 2
            ft_end = data.find('<', ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >', ft_end) + 2
            ftp_end = data.find('<', ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
    fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK", "PLAYER", "TEAM", "GP", "MPG", "PTS", "FGM-FGA", "FG%", "3PM-3PA", "3P%", "FTM-FTA", "FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data, rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines, like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%, so that the script doesn't need to be changed beyond things like pts or mpg in pts_start = data.find('">',mpg_end) + 2.
I don't understand why I can't just use the headline name as it is shown in the table for certain ones: for example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example should give you the idea; it is not a complete solution to your problem, but you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
              }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run and see the result ;).
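If you want the keys to come from whatever headlines the page actually shows (TEAM, GP, MPG, FTM-FTA, and so on) instead of hard-coding them, one possible extension, sketched under the assumption that the colhead row holds the column titles in order, is to read that header row first and zip it with every data row:

def extract_rows_by_headline(dom):
    headers = []
    for tr in dom.select('.mod-content tbody tr'):
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            # Remember the column titles exactly as the table shows them.
            headers = [td.get_text(strip=True) for td in tr.select('td')]
            continue
        cells = [td.get_text(" ", strip=True) for td in tr.select('td')]
        if headers and len(cells) == len(headers):
            yield dict(zip(headers, cells))

Each yielded dict is then keyed by the table's own headlines, so pointing the script at the rebounds page should not require renaming anything by hand.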
On a personal whim I have written some code to search for the shortest series of links between any two Wikipedia articles. It turned out to be very brute force and takes a long, long time to find the goal if it's more than a link or two deep, but it works! I will eventually keep track of and make use of the link paths, but I wanted to get the search working optimally first. Is there a faster way to do this, or a good way to cut some major corners here?
import urllib2
from bs4 import BeautifulSoup

Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'

#Using BeautifulSoup, this grabs the page
def soup_request(target):
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    page = urllib2.urlopen(target)
    soup = BeautifulSoup(page)
    return soup

#This will grab all Wiki links off a given page
def get_links(Start):
    soup = soup_request(Start)
    Wiki_links = []
    #Finds all links
    for url in soup.findAll('a'):
        result = url.get('href')
        try:
            if str(result)[:5] == '/wiki':
                Wiki_links.append(result)
        except:
            pass
    for q in range(len(Wiki_links)):
        Wiki_links[q] = 'http://en.wikipedia.org' + str(Wiki_links[q])
    print "Got new links from", Start
    return Wiki_links

#This will check all the given links to see if the title matches the goal webpage
def check_links(Links, End):
    goalsoup = soup_request(End)
    goaltitle = goalsoup.html.title
    Found = False
    count = 0
    for q in Links:
        if Found:
            break
        length = len(Links)
        #Runs through all the given links and checks their titles for correct one
        if q is not None:
            count += 1
            soup = soup_request(q)
            print "Checked", count, "links out of", length
            try:
                title = soup.html.head.title
                if title == goaltitle:
                    Found = True
                    print "Found it!"
                    break
            except:
                print 'doh'
                pass
    return Found

#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links
def wiki_crawl(Start, End, depth):
    Old_Links = [Start]
    count = depth
    while count > 0:
        New_Links = []
        for q in range(len(Old_Links)):
            New_Links.extend(get_links(Old_Links[q]))
        Found = check_links(New_Links, End)
        if Found:
            print "All done."
            break
        Old_Links = New_Links
        count -= 1
        print "_______________________________________________________________ROUND DONE"
    if not Found:
        print "Did not find the page, you must go deeper!"

wiki_crawl(Start, End, 2)
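One corner you can cut, sketched here rather than offered as a drop-in replacement, is to remember which articles you have already fetched so no page is downloaded twice, and to expand the links breadth-first from a queue, checking each candidate URL against the goal before requesting it. The sketch below is in the same Python 2 style as the question and reuses the get_links function from above:

from collections import deque

def wiki_bfs(start, end, max_depth=3):
    visited = set([start])        # pages already seen, fetched at most once
    queue = deque([(start, 0)])
    while queue:
        page, depth = queue.popleft()
        if depth >= max_depth:
            continue
        for link in get_links(page):
            if link == end:
                print "Found it at depth", depth + 1
                return True
            if link not in visited:
                visited.add(link)
                queue.append((link, depth + 1))
    print "Did not find the page, you must go deeper!"
    return False

wiki_bfs(Start, End) would then behave like wiki_crawl, but without re-downloading pages it has already expanded. Note that it compares URLs rather than page titles, which is cheaper but slightly less forgiving of redirects.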
Here are some functions to take info from a wiki page. The only problem with them is that sometimes they take out a space from the info on the webpage.
def take_out_parenthesis(st):
    string = list(st)
    for a in string:
        if a == '(':
            del string[st.find(a)]
        if a == ')':
            del string[st.find(a) - 1]
    return ''.join(string)

def take_out_tags(string):
    st = list(string)
    odd = ['<', '>']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('<') - 1
        end = string.find('>')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string

def take_out_brackets(string):
    st = list(string)
    odd = ['[', ']']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('[') - 1
        end = string.find(']')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string

def take_from_web_page(text):
    n = 0
    url = text.replace(" ", "_")
    search = "http://en.wikipedia.org/wiki/%s" % url
    page = urllib2.urlopen(search).read()
    start = page.find('<p><b>') + 6
    end = page.find('</a>.', start) + 5
    new_page = page[start:end]
    for a in new_page:
        if a == '<':
            if new_page[n - 1] != ' ':
                lst = list(new_page)
                lst.insert(n, ' ')
                new_page = ''.join(lst)
                n += 1
        n += 1
    return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))
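If the missing spaces are the main annoyance, a sketch of the same idea with BeautifulSoup (which the question already imports) may help, since get_text(" ", ...) inserts a separator between tags for you. Note that this only grabs the first paragraph and does not strip parentheses or brackets the way the helpers above do:

import urllib2
from bs4 import BeautifulSoup

def take_from_web_page_bs(text):
    url = "http://en.wikipedia.org/wiki/%s" % text.replace(" ", "_")
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    first_paragraph = soup.find('p')
    # Join the text pieces with a single space so words never run together.
    return first_paragraph.get_text(" ", strip=True) if first_paragraph else ""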
As of now I am trying to scrape Good.is. The code below gives me the regular image (turn the if statement to True), but I want the higher-resolution picture. I was wondering how I would replace a certain piece of text so that I could download the high-res picture. I want to change the URL http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html to http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html (only the end is different). My code is:
import os, urllib, urllib2
from BeautifulSoup import BeautifulSoup
import HTMLParser

parser = HTMLParser.HTMLParser()

# make folder.
folderName = 'Good.is'
if not os.path.exists(folderName):
    os.makedirs(folderName)

list = []
# Python ranges start from the first argument and iterate up to one
# less than the second argument, so we need 36 + 1 = 37
for i in range(1, 37):
    list.append("http://www.good.is/infographics/page:" + str(i) + "/sort:recent/range:all")

listIterator1 = []
listIterator1[:] = range(0, 37)

counter = 0
for x in listIterator1:
    soup = BeautifulSoup(urllib2.urlopen(list[x]).read())
    body = soup.findAll("ul", attrs={'id': 'gallery_list_elements'})
    number = len(body[0].findAll("p"))
    listIterator = []
    listIterator[:] = range(0, number)
    for i in listIterator:
        paragraphs = body[0].findAll("p")
        nextArticle = body[0].findAll("a")[2]
        text = body[0].findAll("p")[i]
        if len(paragraphs) > 0:
            #print image['src']
            counter += 1
            print counter
            print parser.unescape(text.getText())
            print "http://www.good.is" + nextArticle['href']
            originalArticle = "http://www.good.is" + nextArticle['href']
            article = BeautifulSoup(urllib2.urlopen(originalArticle).read())
            title = article.findAll("div", attrs={'class': 'title_and_image'})
            getTitle = title[0].findAll("h1")
            article1 = article.findAll("div", attrs={'class': 'body'})
            articleImage = article1[0].find("p")
            betterImage = articleImage.find("a")
            articleImage1 = articleImage.find("img")
            paragraphsWithinSection = article1[0].findAll("p")
            print betterImage['href']
            if len(paragraphsWithinSection) > 1:
                articleText = article1[0].findAll("p")[1]
            else:
                articleText = article1[0].findAll("p")[0]
            print articleImage1['src']
            print parser.unescape(getTitle)
            if not articleText is None:
                print parser.unescape(articleText.getText())
            print '\n'
            link = articleImage1['src']
            x += 1
            actually_download = False
            if actually_download:
                filename = link.split('/')[-1]
                urllib.urlretrieve(link, filename)
Have a look at str.replace. If that isn't general enough to get the job done, you'll need to use a regular expression (the re module, probably re.sub).
>>> str1="http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html"
>>> str1.replace("flash","flat")
'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html'
I think the safest and easiest way is to use a regular expression:
import re
url = 'http://www.google.com/this/is/sample/url/flash.html'
newUrl = re.sub('flash\.html$','flat.html',url)
The "$" means only match the end of the string. This solution will behave correctly even in the (admittedly unlikely) event that your url includes the substring "flash.html" somewhere other than the end, and also leaves the string unchanged (which I assume is the correct behavior) if it does not end with 'flash.html'.
See: http://docs.python.org/library/re.html#re.sub
@mgilson has a good solution, but the problem is that it will replace all occurrences of the string; so if you have the word "flash" elsewhere in the URL (and not just in the trailing file name), you'll get multiple replacements:
>>> str = 'hello there hello'
>>> str.replace('hello','world')
'world there world'
An alternate solution is to replace the last part after / with flat.html:
>>> url = 'http://www.google.com/this/is/sample/url/flash.html'
>>> url[:url.rfind('/')+1]+'flat.html'
'http://www.google.com/this/is/sample/url/flat.html'
Using urlparse you can do a few bits and bobs:
from urlparse import urlsplit, urlunsplit, urljoin
s = 'http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flash.html'
url = urlsplit(s)
head, tail = url.path.rsplit('/', 1)
new_path = head + '/', 'flat.html'  # keep the trailing slash so urljoin appends flat.html instead of replacing the last path segment
print urlunsplit(url._replace(path=urljoin(*new_path)))
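This should print http://awesome.good.is/transparency/web/1207/invasion-of-the-drones/flat.html. Note that urlparse is the Python 2 module name; on Python 3 the same functions live in urllib.parse.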