How to use append in a second for loop - python

I am using append to extend my list of scraped apartments. In this code I run into a problem because I have created a second for loop to change the page on the website. The first for loop gives a new page to the next for loop to scrape, but when it's done with one page it just overwrites the last list. What am I doing wrong?
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list)  # Tried to make this work.. Think one list should be enough.
And the functions:
def get_content_value(info_list_data):
    if info_list_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
                for li in info_list_data.find_all("li")]
    else:
        return info_list_data.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
Last:
def get_apartment_data(url):
    r = requests.get(url)
    soup = bs(r.content)
    all_info_list = soup.find_all(class_="CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy")
    for info_list in all_info_list:
        info_list.prettify()
    info = {}
    for index, info_list in enumerate(all_info_list):
        content_key = info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-4__3RH7g ItemHeader__itemHeader__32xAv").get_text(" ", strip=True)
        content_value = get_content_value(info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-8__2jfMv CompactInfoRow__content__3jGt4"))
        info[content_key] = content_value
    return info

for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list.copy())
You need to use the copy() method to make an independent copy. Otherwise, every time apartment_list changes, the version already stored in x changes too, because both names refer to the same list object, like twin lists.
More generally:
x = []
lst = [1, 2, 3]
x.append(lst)
print(x)
lst[0] = 0
x.append(lst)
print(x)
Output:
[[1, 2, 3]]
[[0, 2, 3], [0, 2, 3]]
The right way is:
x = []
lst = [1, 2, 3]
x.append(lst.copy())
print(x)
lst[0] = 0
x.append(lst.copy())
print(x)
Output:
[[1, 2, 3]]
[[1, 2, 3], [0, 2, 3]]
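Applied to the scraper itself, here is a minimal sketch of how results could be accumulated across pages (assuming the same get_apartment_data helper, selectors, and url variable as in the question; all_apartments is a hypothetical name). Initializing the outer list before the page loop keeps earlier pages from being thrown away:

import requests
from bs4 import BeautifulSoup as bs

base_path = "https://www.etuovi.com"
all_apartments = []  # collects one sub-list per page

for page in range(1, 4):
    r = requests.get(url + str(page))  # url is assumed to be defined as in the question
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    page_results = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # limit per page, as in the question
            break
        full_path = base_path + apartment.a['href']
        page_results.append(get_apartment_data(full_path))
    all_apartments.append(page_results)  # never overwritten between pages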


possibilities for except (python, crawler)

I'm an absolute beginner, but with YouTube and some websites I've written a crawler for the German website Immoscout24.
My problem: the crawler works fine if all attributes exist. But if a page doesn't have an attribute (e.g. "pre" in "beschreibung_container"), I get "NameError: name 'beschreibung' is not defined". How can I make it write an empty string ("") into my result list (CSV) when the attribute doesn't exist, and continue crawling?
for number in numbers:
    my_url = "https://www.immobilienscout24.de/expose/%s#/" % number
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.find_all("div", {"id": "is24-content"})
    filename = "results_" + current_datetime + ".csv"
    f = open(filename, "a")
    headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
    f.write(headers)
    for container in containers:
        try:
            objektid_container = container.find_all("div", {"class": "is24-scoutid__content padding-top-s"})
            objektid = objektid_container[0].get_text().strip()
            titel_container = container.find_all("h1", {"class": "font-semibold font-xl margin-bottom margin-top-m palm-font-l"})
            titel = titel_container[0].get_text().strip()
            adresse_container = container.find_all("div", {"class": "address-block"})
            adresse = adresse_container[0].get_text().strip()
            criteria_container = container.find_all("div", {"class": "criteriagroup criteria-group--two-columns"})
            criteria = criteria_container[0].get_text().strip()
            preis_container = container.find_all("div", {"class": "grid-item lap-one-half desk-one-half padding-right-s"})
            preis = preis_container[0].get_text().strip()
            energie_container = container.find_all("div", {"class": "criteriagroup criteria-group--border criteria-group--two-columns criteria-group--spacing"})
            energie = energie_container[0].get_text().strip()
            beschreibung_container = container.find_all("pre", {"class": "is24qa-objektbeschreibung text-content short-text"})
            beschreibung = beschreibung_container[0].get_text().strip()
            ausstattung_container = container.find_all("pre", {"class": "is24qa-ausstattung text-content short-text"})
            ausstattung = ausstattung_container[0].get_text().strip()
            lage_container = container.find_all("pre", {"class": "is24qa-lage text-content short-text"})
            lage = lage_container[0].get_text().strip()
        except:
            print("some mistake")
            pass
        f.write(objektid + "##" + titel + "##" + adresse + "##" + criteria.replace(" ", ";") + "##" + preis.replace(" ", ";") + "##" + energie.replace(" ", ";") + "##" + beschreibung.replace("\n", " ") + "##" + ausstattung.replace("\n", " ") + "##" + lage.replace("\n", " ") + "\n")
    f.close()
EDIT
First problem is solved. Another problem: my result list looks like this in each column:
look here
How can I make "Objekt-ID" and the other headlines appear only in row no. 1?
For each variable, you can simply do the following:
obj = container.find_all("div", {"class": "xxxxx"}) or ""
objid = obj[0].get_text().strip() if obj else ""
The first line defaults the value to an empty string "" if find_all returns an empty list or None. The second line does the same thing, but checks for the existence of a value first and then applies the if/else condition.
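For example, applying the same pattern to the beschreibung field mentioned in the question (same class name as in the original code) might look like this:

beschreibung_container = container.find_all("pre", {"class": "is24qa-objektbeschreibung text-content short-text"}) or ""
beschreibung = beschreibung_container[0].get_text().strip() if beschreibung_container else ""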
I think you need to wrap each variable in its own try-except block.
E.g.:
try:
    objektid_container = container.find_all("div", {"class": "is24-scoutid__content padding-top-s"})
    objektid = objektid_container[0].get_text().strip()
except:
    objektid = ""
Do this for all variables.
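To avoid repeating that block for all nine fields, one possible sketch is a small helper (safe_text is a hypothetical name, not part of the original code) that wraps the lookup and falls back to an empty string:

def safe_text(container, tag, css_class):
    # Return the stripped text of the first match, or "" if nothing is found.
    try:
        return container.find_all(tag, {"class": css_class})[0].get_text().strip()
    except (IndexError, AttributeError):
        return ""

objektid = safe_text(container, "div", "is24-scoutid__content padding-top-s")
beschreibung = safe_text(container, "pre", "is24qa-objektbeschreibung text-content short-text")
# ...and so on for the remaining fields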
For the second issue, move your header-writing code outside the loop.
Remove this code:
filename = "results_"+current_datetime+".csv"
f = open(filename, "a")
headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
f.write(headers)
And add it before:
for number in numbers:
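Put together, the file setup could look roughly like this (a sketch only; current_datetime, numbers and the per-container scraping body are assumed to stay the same as in the question):

filename = "results_" + current_datetime + ".csv"
f = open(filename, "a")
headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
f.write(headers)  # written exactly once

for number in numbers:
    # ... fetch the page and write one row per container, as before ...
    pass

f.close()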

How to get the URL from local files?

I used Wget to download webpages.
I want to ask: is it possible to get the URL from local HTML files?
I used Python to analyse the HTML file content, and I want to print every file's URL.
I am trying to add more functions to this program, so I think if I can print the URL next to each result, the user can easily click the link to open the webpage.
Here is my code:
def search(self):
    keyword = self.entry.get()
    mypath = "/Users/Tsu-AngChou/MasterProject/Practice/try_test/"
    files = listdir(mypath)
    translator = str.maketrans("", "", string.punctuation)
    count1 = 0
    test_list = []
    test_list2 = []
    for f in files:
        fullpath = join(mypath, f)
        if f == '.DS_Store':
            os.remove(f)
        elif isfile(fullpath):
            # print(f)
            for html_cont in range(1):
                response = open(f, 'r', encoding='utf-8')
                html_cont = response.read()
                soup = bs(html_cont, 'html.parser')
                regular_string = soup.get_text()
                new_string = regular_string.translate(translator).split()
                new_list = [item[:14] for item in new_string]
                a = dict.fromkeys(new_list, f)
                wordfreq = []
                c = new_list
                for w in c:
                    wordfreq.append(c.count(w))
                fre = dict(zip(c, wordfreq))
                sentence = new_list
                keyword1 = keyword
                words = sentence
                if keyword in fre:
                    test_list.append(a[keyword])
                    test_list2.append(fre[keyword])
                    count1 = count1 + 1
                    for (i, subword) in enumerate(words):
                        if (subword == keyword1):
                            test_list3 = i + 1
    for i in range(0, count1 - 1):
        for j in range(0, count1 - 1 - i):
            if (test_list2[j] < test_list2[j + 1]):
                temp = test_list[j]
                temp2 = test_list2[j]
                test_list[j] = test_list[j + 1]
                test_list2[j] = test_list2[j + 1]
                test_list[j + 1] = temp
                test_list2[j + 1] = temp2
    for i in range(0, count1):
        print(keyword, "Filename:", test_list[i], "Frequency:", test_list2[i])
    return a
This is my output, and I want to have the link follow every result.
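One possible approach (not from the code above, just a sketch): if the pages were saved by Wget, the original URL can often be recovered from the document itself, for example from a <link rel="canonical"> or <meta property="og:url"> tag, when the site provides one:

from bs4 import BeautifulSoup

def guess_source_url(html_text):
    # Look in common places where a page records its own URL; return None if absent.
    soup = BeautifulSoup(html_text, 'html.parser')
    canonical = soup.find('link', rel='canonical')
    if canonical and canonical.get('href'):
        return canonical['href']
    og_url = soup.find('meta', property='og:url')
    if og_url and og_url.get('content'):
        return og_url['content']
    return None

Calling guess_source_url(html_cont) inside the existing file loop and printing it next to each result would give a clickable link, assuming the saved pages contain one of those tags.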

Function invoking issue

I have this Python code, but when I run it, it prints out just the first target. Here is my code:
def get_next_target(S):
    start_link = S.find('<a href=')
    start_quote = S.find('"', start_link)
    end_quote = S.find('"', start_quote + 1)
    url = S[start_quote + 1:end_quote]
    print url
    return url, end_quote

get_next_target(S)
where variable S = '<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href="www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'
What I want is to print out all three targets, but instead it just prints the first one. Why is that?
I think you should use BeautifulSoup to extract info from html/xml.
In [1]: from bs4 import BeautifulSoup

In [2]: html = '''<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href=
   ...: "www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'''

In [3]: soup = BeautifulSoup(html, 'lxml')

In [4]: for a in soup.find_all('a'):
   ...:     print(a['href'])
   ...:
www.target1.com
www.target2.com
www.target3.com
If you want to achieve this without using any special module, then the following code will do it.
import re
import sys

S = '<susuds><a href="www.target1.com"/><ahsahsh><saudahsd><a href="www.target2.com"/><p>sa</h1><a href="www.target3.com"/>'
abc = []

def get_next_target(S):
    search_index = [i.start() for i in re.finditer('<a href=', S)]
    for j in range(len(search_index)):
        if (j == len(search_index) - 1):
            A = S[search_index[j]:len(S)]
            search_start_index = A.find('"')
            search_end_index = A.rfind('"')
            start_final = search_index[j] + search_start_index + 1
            start_end = search_index[j] + search_end_index
            final_result = S[start_final:start_end]
            abc.append(final_result)
            print abc
        else:
            A = S[search_index[j]:search_index[j + 1]]
            search_start_index = A.find('"')
            search_end_index = A.rfind('"')
            start_final = search_index[j] + search_start_index + 1
            start_end = search_index[j] + search_end_index
            final_result = S[start_final:start_end]
            abc.append(final_result)

get_next_target(S)
Note: if you don't want to append the results to a list, replace the last line of the if and else branches with print final_result.
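As for why the original code prints only the first target: get_next_target is called once, and find only locates the first <a href=. A minimal sketch that reuses the original function as-is, slicing off the part of the string that has already been scanned (Python 2, as in the question):

def print_all_targets(page):
    # Keep calling get_next_target on the unscanned remainder of the string.
    while '<a href=' in page:
        url, end_quote = get_next_target(page)  # prints one URL per call
        page = page[end_quote:]                 # continue after the closing quote

print_all_targets(S)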

Extract data from web page

I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines from this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table, but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2

uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data, rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/', start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/', start)
            name_start = data.find('>', main) + 1
            name_end = data.find('<', name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">', name_end) + 2
            team_end = data.find('<', team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >', team_end) + 2
            gp_end = data.find('<', gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >', gp_end) + 2
            mpg_end = data.find('<', mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">', mpg_end) + 2
            pts_end = data.find('<', pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >', pts_end) + 2
            mf_end = data.find('<', mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >', mf_end) + 2
            fg_end = data.find('<', fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >', fg_end) + 2
            m3_end = data.find('<', m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >', m3_end) + 2
            p3_end = data.find('<', p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >', p3_end) + 2
            ft_end = data.find('<', ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >', ft_end) + 2
            ftp_end = data.find('<', ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
    fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK", "PLAYER", "TEAM", "GP", "MPG", "PTS", "FGM-FGA", "FG%", "3PM-3PA", "3P%", "FTM-FTA", "FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data, rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines, like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%.
That way the script wouldn't need to be changed beyond things like pts or mpg in pts_start = data.find('">', mpg_end) + 2.
I don't understand why I can't just input the name of the headline as it is shown in the table for certain ones, e.g. why the script uses ft instead of FTM-FTA.
Extracting HTML data is rather easy with BeautifulSoup. The following example gives you the idea but is not a complete solution to your problem; however, you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run it and see the result ;).
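On the question of grabbing info based on the headlines: a possible extension of the example above (same assumed page structure; the selectors may need adjusting for other stat pages) is to read the column names from the colhead row itself and zip them with each data row, so abbreviations like ft never need to be hard-coded:

def extract_rows_by_header(dom):
    headers = None
    for tr in dom.select('.mod-content tbody tr'):
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            headers = cells  # e.g. ['RK', 'PLAYER', 'TEAM', 'GP', 'MPG', 'PTS', ...]
            continue
        if headers and cells:
            yield dict(zip(headers, cells))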

Python Wiki Path Searching

On a personal whim I have written some code to search for the shortest series of links between any two Wikipedia articles. It turned out to be very brute force and takes a long, long time to find the goal if it's more than a link or two deep, but it works! I will eventually keep track of and make use of the link paths and such, but I wanted to get the search working optimally first. Is there a faster way to do this or a good way to cut some major corners here?
import urllib2
from bs4 import BeautifulSoup

Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29'
End = 'http://en.wikipedia.org/wiki/Ayr'

#Using BeautifulSoup, this grabs the page
def soup_request(target):
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    page = urllib2.urlopen(target)
    soup = BeautifulSoup(page)
    return soup

#This will grab all Wiki links off a given page
def get_links(Start):
    soup = soup_request(Start)
    Wiki_links = []
    #Finds all links
    for url in soup.findAll('a'):
        result = url.get('href')
        try:
            if str(result)[:5] == '/wiki':
                Wiki_links.append(result)
        except:
            pass
    for q in range(len(Wiki_links)):
        Wiki_links[q] = 'http://en.wikipedia.org' + str(Wiki_links[q])
    print "Got new links from", Start
    return Wiki_links

#This will check all the given links to see if the title matches the goal webpage
def check_links(Links, End):
    goalsoup = soup_request(End)
    goaltitle = goalsoup.html.title
    Found = False
    count = 0
    for q in Links:
        if Found:
            break
        length = len(Links)
        #Runs through all the given links and checks their titles for the correct one
        if q is not None:
            count += 1
            soup = soup_request(q)
            print "Checked", count, "links out of", length
            try:
                title = soup.html.head.title
                if title == goaltitle:
                    Found = True
                    print "Found it!"
                    break
            except:
                print 'doh'
                pass
    return Found

#Top function to do all the stuff in the right order, applying a maximum depth of how deep into the links
def wiki_crawl(Start, End, depth):
    Old_Links = [Start]
    count = depth
    while count > 0:
        New_Links = []
        for q in range(len(Old_Links)):
            New_Links.extend(get_links(Old_Links[q]))
        Found = check_links(New_Links, End)
        if Found:
            print "All done."
            break
        Old_Links = New_Links
        count -= 1
        print "_______________________________________________________________ROUND DONE"
    if not Found:
        print "Did not find the page, you must go deeper!"

wiki_crawl(Start, End, 2)
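On the "is there a faster way" part, one common improvement (a sketch only, matching pages by URL rather than by title and reusing get_links from the code above) is a breadth-first search with a visited set, so no page is fetched twice and the actual path can be reconstructed:

from collections import deque

def bfs_path(start, end, max_depth=3):
    queue = deque([(start, 0)])
    parent = {start: None}  # also serves as the visited set
    while queue:
        page, depth = queue.popleft()
        if page == end:
            path = []  # walk the parent chain back to the start
            while page is not None:
                path.append(page)
                page = parent[page]
            return list(reversed(path))
        if depth >= max_depth:
            continue
        for link in get_links(page):
            if link not in parent:
                parent[link] = page
                queue.append((link, depth + 1))
    return None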
Here are some functions to take info from wiki. The only problem with them is that sometimes they take out a space from the info on the webpage.
def take_out_parenthesis(st):
    string = list(st)
    for a in string:
        if a == '(':
            del string[st.find(a)]
        if a == ')':
            del string[st.find(a) - 1]
    return ''.join(string)

def take_out_tags(string):
    st = list(string)
    odd = ['<', '>']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('<') - 1
        end = string.find('>')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string

def take_out_brackets(string):
    st = list(string)
    odd = ['[', ']']
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    for b in range(times):
        start = string.find('[') - 1
        end = string.find(']')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string

def take_from_web_page(text):
    n = 0
    url = text.replace(" ", "_")
    search = "http://en.wikipedia.org/wiki/%s" % url
    page = urllib2.urlopen(search).read()
    start = page.find('<p><b>') + 6
    end = page.find('</a>.', start) + 5
    new_page = page[start:end]
    for a in new_page:
        if a == '<':
            if new_page[n - 1] != ' ':
                lst = list(new_page)
                lst.insert(n, ' ')
                new_page = ''.join(lst)
                n += 1
        n += 1
    return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))
