I have this code below, requesting an api. Every ID in listID loops 4 times in the url. So if the listID = [1,2,3,4,5], the url will be :
url = "https://xxxxapi/v1/implantacao/projeto/1/tarefa?start=0&limit=50" then
url = "https://xxxxapi/v1/implantacao/projeto/1/tarefa?start=1&limit=50" and etc, start goes 0 to 3 for every id in the list
Then, im saving every data request i get into and xls file and that's working fine. For example, every 4 loops at the id, normally returns 120 tasks. I want to print the ID for every task that the code returns, in the line :
#sheet.cell(row=i+1, column=1).value = listID
def teste(id):
listID = (id)
headers = {
"xxxxxxxxx",
"Content-Type":"application/json;charset=UTF-8"
}
length = len(listID)
nome = []
codigoTarefa = []
situacaoTarefa = []
faseNome = []
for li in range(length):
for count in range(4):
url = "https://xxxxapi/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count)
response = requests.get(url, headers=headers)
data = response.json()
unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome)
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(nome)):
#sheet.cell(row=i+1, column=1).value = listID
sheet.cell(row=i+1, column=2).value = nome[i]
sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
wb.save("dados11.xlsx")
def unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome):
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("BACKOFFICE")
coluna = 1
for i in data['data']:
nome.append(i['nome'])
codigoTarefa.append(i['codigo'])
situacaoTarefa.append(i['situacao'])
coluna +=1
if __name__ == '__main__':
Sults()
To be more clear : One output example from one task at project ID 1 :
INAUGURAÇÃO
T98
4
I want this output (1 is the first item in listID for example) :
1
INAUGURAÇÃO
T98
4
How can i get it ? Thanks for the help btw
def teste(id):
listID = (id)# this is list of ids. Yes, that list you want this id go to the file xml. Yes, but the file today is a big one
#print (listID)
headers = {
"Authorization":"xxxxxxxxxx",
"Content-Type":"application/json;charset=UTF-8"
}
#A contagem do counter começa negativa pois eu quero que a primeira busca de página seja no valor 0 > que busca as primeiras 50 tarefas
length = len(listID)
nome = []
codigoTarefa = []
situacaoTarefa = []
faseNome = []
global ID_List
ID_List = [] #but how did u get this list became an [254,254,254,254,255,255,255 etc]
for li in range(length):
for count in range(4):
#o start dentro de url é o counter. Ou seja, ele vai até 4, a partir do 0
url = "https://xxxxxxxxx.com/api/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count) #here i got the id from the code above, and count is the pagination about api
#print(url)
response = requests.get(url, headers=headers)
data = response.json()
unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome, listID[li])#li is legnth of id list ok? Ok, i get it now. So )
#print(nome)
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(nome)):
sheet.cell(row=i+1, column=1).value = ID_List[i]
sheet.cell(row=i+1, column=2).value = nome[i]
sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
wb.save("dados12.xlsx")
#print(codigoTarefa)
#print(situacaoTarefa)
#print(faseNome)
def unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome, ID): # ok just wait secounds this ID here is the parameter to the above listID[li]
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("BACKOFFICE")
coluna = 1
for i in data['data']:
nome.append(i['nome'])
codigoTarefa.append(i['codigo'])
situacaoTarefa.append(i['situacao'])
ID_List.append(ID)# append here
coluna +=1
if __name__ == '__main__':
Sults()
This will probably work because every four name elements you have, you have one listID.
sheet.cell(row=i+1, column=1).value = listID[i/4]
Edit:
I realize now that your listID is actually a tuple. I changed the code above to this. This should resolve the float error.
sheet.cell(row=i+1, column=1).value = listID[i//4]
Related
Im struggling here with a problem that probably will be simple to solve. The code below perform 3 requests to an api
To get all the project's ID's (data)
To return the excluded id's (data2)
To return the concluded projects(data3)
After that, i have created a new function call Listas() that the result is the unique elements from list 1, and did not appear in list 2 or 3
How can I use this result as parameter to the teste() function?
Now I'm passing teste(ids), but ids is the first data request (all the project's ID's), and I need to use Listas() result as argument to teste
import requests
import xlwt
import openpyxl
def Sults():
headers = {
"Authorization":"xxxxx",
"Content-Type":"application/json;charset=UTF-8"
}
global id
global id_exclude
global id_conclude
global nomeUnidade
global dataInicio
global dataFinal
global responsavel
global dtConclusao
id = []
id_exclude = []
id_conclude = []
nomeUnidade = []
dataInicio = []
dataFinal = []
responsavel = []
dtConclusao = []
for count1 in range(3):
url = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z".format(count1)
response = requests.get(url, headers=headers)
data = response.json()
url2 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=false".format(count1)
response2 = requests.get(url2, headers=headers)
data2 = response2.json()
url3 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=true".format(count1)
response3 = requests.get(url3, headers=headers)
data3 = response3.json()
# print(data)
ids = unidades(data)
Excel()
unidades2(data2)
unidades3(data3)
Listas()
teste(ids)
def unidades(data):
#sheet.write(0, 5, "Modelo")
for i in data['data']:
id.append(i['id']) #append id list
nomeUnidade.append(i['nome']) #append id list
dataInicio.append(i['dtInicio']) #append id list
dataFinal.append(i['dtFim']) #append id list
responsavel.append(i['responsavel']['nome']) #append id list
return id
return nomeUnidade
return dataInicio
return dataFinal
return responsavel
def Excel():
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(id)):
sheet.cell(row=i+1, column=1).value = id[i]
sheet.cell(row=i+1, column=2).value = nomeUnidade[i]
sheet.cell(row=i+1, column=3).value = dataInicio[i]
sheet.cell(row=i+1, column=4).value = dataFinal[i]
sheet.cell(row=i+1, column=5).value = responsavel[i]
wb.save("NewUNIDADES6.xlsx")
def unidades2(data2):
for j in data2['data']:
id_exclude.append(j['id'])
return id_exclude
def unidades3(data3):
for k in data3['data']:
id_conclude.append(k['id'])
return id_conclude
def Listas():
result = list(set(id) - set(id_exclude) - set(id_conclude))
print("Implantação:")
print(result)
def teste(result):
listID = (result)
print("List ID:")
print (listID)
headers = {
"Authorization":"O3BldGxhbmQ7MTU5NTAxNTI1NTI1OA==",
"Content-Type":"application/json;charset=UTF-8"
}
length = len(listID)
nome = []
codigoTarefa = []
situacaoTarefa = []
faseNome = []
global ID_List
ID_List = [] #but how did u get this list became an [254,254,254,254,255,255,255 etc]
for li in range(length):
for count in range(4):
#o start dentro de url é o counter. Ou seja, ele vai até 4, a partir do 0
url = "https://api/api/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count) #here i got the id from the code above, and count is the pagination about api
print("Print")
print(url)
response = requests.get(url, headers=headers)
data = response.json()
unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, listID[li])#li is legnth of id list ok? Ok, i get it now. So )
#print(nome)
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(nome)):
sheet.cell(row=i+1, column=1).value = ID_List[i]
sheet.cell(row=i+1, column=2).value = nome[i]
sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
sheet.cell(row=i+1, column=5).value = faseNome[i]
wb.save("TarefasNEW.xlsx")
#print(codigoTarefa)
#print(situacaoTarefa)
#print(faseNome)
def unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, ID): # ok just wait secounds this ID here is the parameter to the above listID[li]
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("BACKOFFICE")
coluna = 1
for i in data['data']:
nome.append(i['nome'])
codigoTarefa.append(i['codigo'])
situacaoTarefa.append(i['situacao'])
faseNome.append(i['fase']['nome'])
ID_List.append(ID)# append here
coluna +=1
if __name__ == '__main__':
Sults()
My code does the following:
1st enter the site
2nd collects the links and saves in a dictionary
3rd the code enters the links saved in the dictionary to extract the elements and also saves in a dictionary
4th finally it saves the information of the elements that are in the dictionary in excel pandas
Problem:
Some pages do not contain information to be extracted, it is probably a bug on the site, so pandas does not save the information already collected.
Here's the error:
valueerror: all arrays must be of the same length
Here's part of my code, I didn't put it in full so it wouldn't get long.
I'm using selenium.
links = []
imagem = []
pacote = []
counter = 1
for linkAtual in links:
driver.get(linkAtual)
try:
driver.find_element(
By.XPATH, "//button[normalize-space()='Ir para a oferta']").click()
sleep(2)
except:
print("proxima pagina")
try:
titulo = driver.find_element(By.TAG_NAME, "h1")
print(titulo.text)
pacote.append(titulo.text.replace("Pacote de Viagem - ", "").replace("+", " e ").replace("2022", "").replace(
"2023", "").replace("2024", "").replace("2025", "").replace("(", "").replace(")", "").replace("-", ""))
print("baixar imagem ")
primeiro_caminho = driver.find_element(By. XPATH, "(//img)[2]")
atributoSrc = primeiro_caminho.get_attribute("src")
# file_name = f"{titulo}{counter:02d}.jpg"
file_name = f"image{counter:02d}.jpg"
imagem.append(atributoSrc)
urllib.request.urlretrieve(
atributoSrc, f"C:\\__Imagens e Planilhas Python\\Afiliacoes\\Fotos\\{file_name}")
counter += 1
except:
print("não tem conteudo")
data = {'Pacote': pacote, 'Link Afiliado': links}
#df = pd.DataFrame.from_dict(data, orient="index")
df = pd.DataFrame(data)
df.to_excel(r"C:\__Imagens e Planilhas Python\Afiliacoes\pacotes.xlsx",engine='xlsxwriter')
print(df)
Page that doesn't work I found one, it has 300 links and about 4 that don't have information:
Here is an example of an error link:
https://www.hurb.com/br/packages/la-romana-passagem-aerea-hospedagem/1416419?utm_source=Felipe-F-clubehurb&utm_medium=clubehu-promotion&utm_campaign=689696&cmp=689696
Here are examples of valid links:
https://www.hurb.com/br/packages/costa-do-sauipe-passagem-aerea-hospedagem-all-inclusive/1421105?utm_source=Felipe-F-clubehurb&utm_medium=clubehu-product&utm_campaign=689696&cmp=689696
https://www.hurb.com/br/packages/rio-de-janeiro-passagem-aerea-hospedagem/1407451?utm_source=Felipe-F-clubehurb&utm_medium=clubehu-product&utm_campaign=689696&cmp=689696
https://www.hurb.com/br/packages/pacote-aereo-hospedagem-dubrovnik/1419049?utm_source=Felipe-F-clubehurb&utm_medium=clubehu-product&utm_campaign=689696&cmp=689696
you could edit your 2nd try...except block to
titlTxt, atributoSrc = None, None # initiate/clear as null
try:
titulo = driver.find_element(By.TAG_NAME, "h1")
print(titulo.text)
# pacote.append... # outside block
titlTxt = titulo.text.replace("Pacote de Viagem - ", "").replace("+", " e ").replace("2022", "").replace(
"2023", "").replace("2024", "").replace("2025", "").replace("(", "").replace(")", "").replace("-", "")
print("baixar imagem ")
primeiro_caminho = driver.find_element(By. XPATH, "(//img)[2]")
atributoSrc = primeiro_caminho.get_attribute("src")
# file_name = f"{titulo}{counter:02d}.jpg"
file_name = f"image{counter:02d}.jpg"
# imagem.append(atributoSrc) # outside block
urllib.request.urlretrieve(
atributoSrc, f"C:\\__Imagens e Planilhas Python\\Afiliacoes\\Fotos\\{file_name}")
counter += 1
except:
print("não tem conteudo")
pacote.append(titlTxt)
imagem.append(atributoSrc)
so now pacote and imagem should have exactly one item [each] corresponding to each item in links, so all 3 lists can be expected to be of the same length.
(If you had separate try...except blocks for each list, you could append in both try and in except; but when extracting for multiple lists in the same try, some of them might already be appended to before raising an exception, so appending in except as well risks adding to the same list twince in a single loop.)
import requests
from bs4 import BeautifulSoup
import csv
import time
def fin_car(url):
x = {}
y = []
page = ''
while page == '':
try:
page = requests.get(url)
except:
print("Connection refused by the server..")
print("Let me sleep for 5 seconds")
print("ZZzzzz...")
time.sleep(5)
print("Was a nice sleep, now let me continue...")
continue
#page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
Precio = []
price = soup.find('div' , 'price').text
Precio = (str(price).strip())
print (Precio)
#-------------------------------------------------------------------------------------------------------------#
# #Tipo Propiedad #Tipo de oferta #
#-------------------------------------------------------------------------------------------------------------#
Tipo_Propiedad = []
Tipo_de_oferta = []
T_1 = soup.find('div' , 'box').h1.text
text = (str(T_1).strip())
l = text.find(' ')
m = text.find(' ', l+1)
n = text.find(' ', m+1)
Tipo_Propiedad = text[0 : l]
Tipo_de_oferta = text[m+1 : n]
print (Tipo_Propiedad)
print (Tipo_de_oferta)
#-------------------------------------------------------------------------------------------------------------#
# #Departamento #Ciudad #Zona #Barrio #
#-------------------------------------------------------------------------------------------------------------#
Departamento = []
Ciudad = []
Zona = []
Barrio = []
first = soup.find('div' , 'breadcrumb left')
link = first.find('div')
a_link = link.findAll('a')
box1 = []
for row in a_link:
box1.append(row.text)
Departamento = (box1[1:2].pop())
Ciudad = (box1[2:3].pop())
Zona = (box1[3:4].pop())
Barrio = (box1[4:5])
print (Departamento)
print (Ciudad)
print (Zona)
print (Barrio)
#-------------------------------------------------------------------------------------------------------------#
# #Área #Habitaciones #Baños #Parqueaderos #
#-------------------------------------------------------------------------------------------------------------#
box_2 = soup.find('div' ,'features clearfix')
box_2_1 = box_2.findAll('span')
box2 = []
Área=[]
Habitaciones = []
Baños = []
Parqueaderos = []
for row2 in box_2_1:
box2.append(str(row2.text).strip())
for i in box_2_1:
a = box2[0:1].pop()
b = box2[1:2].pop()
c = box2[2:3].pop()
d = box2[3:4].pop()
a1 = a[0 : a.find(' ')]
Área = (a1)
Habitaciones = (b.rstrip()[-1])
Baños = (c.rstrip()[-1])
Parqueaderos =(d)
print (Área)
print (Habitaciones)
print (Baños)
print (Parqueaderos)
#-------------------------------------------------------------------------------------------------------------#
# #Área_Privada #Área_Const #Antigüedad #Admón #Estrato #Estado #Piso_No #
#-------------------------------------------------------------------------------------------------------------#
box_3 = soup.find('div' ,'row features_2 ')
box_3_1 = box_3.findAll('li') #
Área_Privada = []
Área_Const = []
Antigüedad = []
Admón = []
Estrato = []
Estado = []
Piso_No = []
for li in box_3_1:
heading_words = li.b.text.split() #
target_content = str(li.br.next_sibling).strip() #
if "privada:" in heading_words:
Área_Privada = (target_content) #
elif "Const.:" in heading_words:
Área_Const = (target_content)
elif "Antigüedad:" in heading_words:
Antigüedad = (target_content)
elif "Admón:" in heading_words:
Admón = (target_content)
elif "Estrato:" in heading_words:
Estrato = (target_content)
elif "Estado:" in heading_words:
Estado = (target_content)
elif "Piso" in heading_words:
Piso_No = (target_content)
print (Área_Privada) #
print (Área_Const)
print (Antigüedad)
print (Admón)
print (Estrato) #
print (Estado)
print (Piso_No[0:1])
#-------------------------------------------------------------------------------------------------------------#
# #Actualizado #Visitas #Código_FincaRaiz # #
#-------------------------------------------------------------------------------------------------------------#
box4 = soup.find('div' , 'box_content row')
box4_1 = box4.findAll('span')
vis = []
Actualizado = []
Visitas = []
Código_FincaRaiz = []
for i in box4_1:
vis.append((str(i.text).strip()))
for j in box4_1:
e = vis[0:1].pop()
f = vis[2:3].pop()
Actualizado = e
Código_FincaRaiz = f
url="https://www.fincaraiz.com.co/WebServices/Statistics.asmx/GetAdvertVisits?idAdvert={}&idASource=40&idType=1001".format(Código_FincaRaiz) #
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content , 'lxml')
visit1 = soup1.find('double').text
Visitas = (visit1)
print (Actualizado)
print (Visitas)
print (Código_FincaRaiz)
#-------------------------------------------------------------------------------------------------------------#
x['Código FincaRaiz'] = Código_FincaRaiz
x['Departamento'] = Departamento
x['Ciudad'] = Ciudad
x['Zona'] = Zona
x['Barrio'] = Barrio
x['Tipo Propiedad'] = Tipo_Propiedad
x['Tipo de oferta'] = Tipo_de_oferta
x['Precio'] = Precio
x['Área'] = Área
x['Área Privada'] = Área_Privada
x['Área Const.'] = Área_Const
x['Antigüedad'] = Antigüedad
x['Baños'] = Baños
x['Habitaciones'] = Habitaciones
x['Parqueaderos'] = Parqueaderos
x['Admón'] = Admón
x['Estrato'] = Estrato
x['Estado'] = Estado
x['Piso No.'] = Piso_No
x['Actualizado'] = Actualizado
x['Visitas'] = Visitas
y.append(x)
x = {}
y = []
filename = 'Fincar.csv'
with open(filename, 'w', newline='') as f:
w = csv.DictWriter(f,['Código FincaRaiz','Departamento','Ciudad','Zona','Barrio', 'Tipo Propiedad', 'Tipo de oferta',
'Precio' , 'Área' , 'Área Privada' , 'Área Const.', 'Antigüedad', 'Baños' , 'Habitaciones',
'Parqueaderos' , 'Admón', 'Estrato' , 'Estado' , 'Piso No.' , 'Actualizado', 'Visitas'])
w.writeheader()
for x in y:
w.writerow(x)a
tab = []
xen = []
key_value = 'https://www.fincaraiz.com.co'
for i in range(2,6):
tab.append('https://www.fincaraiz.com.co/finca-raiz/?ad=30|{}||||1||||||||||||||||||||||1|||1||||||'.format(i))
for j in tab:
page = requests.get(j)
soup = BeautifulSoup(page.content , 'lxml')
index = soup.findAll('div' , 'span-title')
for i in index:
xen.append(i.find('a').get('href'))
for j in xen:
url = (key_value + j)
fin_car(url)
I've tried to fetch values from list of pages and trying to save the file to csv document, however csv document only storing the last value inside the csv document.
Tried multiple ways but it always giving the same output.
also the column with the blank values need to be filled with nil value, however it posting only [] symbol.
New to python and find it difficult to saving it to python. Need your support to achieve this task.
How should I proceed further
def fin_car(url):
x = {}
y = []
...
x = {}
y = []
These values are in different scopes. Assigning x inside fin_car doesn't affect it outside. You could change that using a global statement, but much better is to return from the function.
Even if you were changing the outside values of x and y, you only call fin_car long after writing to the CSV. The order of events in your code matters.
I suggest:
def fin_car(url):
x = {}
...
return x
with open...:
w = csv.DictWriter(...)
...
for j in tab:
...
for j in xen:
url = ...
w.writerow(fin_car(url))
You don't need y at all.
import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
reader = csv.reader(f)
matches = list(reader)
for id in matches:
id = str(id)
id = re.sub("[^0-9]","",id)
url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
print (url)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
#GET TEAMS AND SCORES
score = soup.findAll("div",{"class":"col-md-5 center"})
team_home = score[0]
team_home = str(team_home)
team_home = re.search('title="(.*)" />',team_home)
team_home = team_home.group(1)
team_away = score[1]
team_away = str(team_away)
team_away = re.search('title="(.*)" />',team_away)
team_away = team_away.group(1)
goals_home = score[2]
goals_home = str(goals_home)
goals_home = re.sub('</h2></div>','',goals_home)
goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
goals_away = score[3]
goals_away = str(goals_away)
goals_away = re.sub('</h2></div>','',goals_away)
goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
#GET HOME STATS
tables = soup.findChildren('table')
stats_home = tables[0]
list_of_rows_home = []
for row in stats_home.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_home.append(list_of_cells)
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
cell = list_of_rows_home[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_home:
del row[2]
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
row.append(team_home)
row.append(goals_home)
row.append(team_away)
row.append(goals_away)
#GET AWAY STATS
stats_away = tables[1]
list_of_rows_away = []
for row in stats_away.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_away.append(list_of_cells)
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
cell = list_of_rows_away[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_away:
del row[2]
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
row.append(team_away)
row.append(goals_away)
row.append(team_home)
row.append(goals_home)
#COMPILE INTO ONE TABLE
list_of_rows = list_of_rows_home + list_of_rows_away
#WRITE TO CSV
writer.writerows(list_of_rows)
My input file is a basic excel file with the match id's all lined up in column one of the excel file. When it creates the output file, it's blank. I am not getting any error messages either.
The issue is in your regex search, so perhaps change it to:
team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)
Alternative:
team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)
The /> is not needed, and this essentially makes title="" not match for group(1), which in turn creates an Attribute Error, and the script stops. If you want to include /> then remove the space in your regex pattern, since that is ultimately what kills it.
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code :
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
try:
req = urllib2.Request(uri)
response = urllib2.urlopen(req, timeout=600)
content = response.read()
return content
except Exception, e:
print "\n[!] Error: " + str(e)
print ''
return False
def extract(data,rk):
print '\n[+] Extracting data.'
start = 0
while True:
store = [rk]
if data.find('nba/player/',start) == -1:
break
with open("data.csv", "ab") as fcsv:
main = data.find('nba/player/',start)
name_start = data.find('>',main) + 1
name_end = data.find('<',name_start)
store.append(data[name_start:name_end])
team_start = data.find('">',name_end) + 2
team_end = data.find('<',team_start)
store.append(data[team_start:team_end])
gp_start = data.find(' >',team_end) + 2
gp_end = data.find('<',gp_start)
store.append(data[gp_start:gp_end])
mpg_start = data.find(' >',gp_end) + 2
mpg_end = data.find('<',mpg_start)
store.append(data[mpg_start:mpg_end])
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
m3_start = data.find(' >',fg_end) + 2
m3_end = data.find('<',m3_start)
store.append(data[m3_start:m3_end])
p3_start = data.find(' >',m3_end) + 2
p3_end = data.find('<',p3_start)
store.append(data[p3_start:p3_end])
ft_start = data.find(' >',p3_end) + 2
ft_end = data.find('<',ft_start)
store.append(data[ft_start:ft_end])
ftp_start = data.find(' >',ft_end) + 2
ftp_end = data.find('<',ftp_start)
store.append(data[ftp_start:ftp_end])
start = name_end
rk = rk + 1
csv.writer(fcsv).writerow(store)
fcsv.close()
def main():
print "\n[+] Initializing..."
if not os.path.exists("data.csv"):
with open("data.csv", "ab") as fcsv:
csv.writer(fcsv).writerow(["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"])
fcsv.close()
rk = 1
global uri
while True:
time.sleep(1)
start = 0
print "\n[+] Getting data, please wait."
data = get_data()
if not data:
break
extract(data,rk)
print "\n[+] Preparing for next page."
time.sleep(1.5)
rk = rk + 40
if rk > 300:
print "\n[+] All Done !\n"
break
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)
if __name__ == '__main__':
main()
I specifically want to know how to grab info based on the headlines. Like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%
So the script doesn't need to be changed besides things like pts or mpg in pts_start = data.find('">',mpg_end) + 2
I don't understand why I can't just input the name of the headline in the table has shown for certain ones. Like instead of FTM-FTA, the script puts ft.
Extracting html data rather easy with BeautifulSoup. Following example is you to get the idea but not a complete solution to your problem. However you can easily extend.
from bs4 import BeautifulSoup
import urllib2
def get_html_page_dom(url):
response = urllib2.urlopen(url)
html_doc = response.read()
return BeautifulSoup(html_doc, 'html5lib')
def extract_rows(dom):
table_rows = dom.select('.mod-content tbody tr')
for tr in table_rows:
# skip headers
klass = tr.get('class')
if klass is not None and 'colhead' in klass:
continue
tds = tr.select('td')
yield {'RK': tds[0].string,
'PLAYER': tds[1].select('a')[0].string,
'TEAM': tds[2].string,
'GP': tds[3].string
# you can fetch rest of the indexs for corresponding headers
}
if __name__ == '__main__':
dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
for data in extract_rows(dom):
print(data)
You can simply run and see the result ;).