I'm struggling with a problem that is probably simple to solve. The code below performs three requests to an API:
To get all of the project IDs (data)
To return the excluded IDs (data2)
To return the concluded projects (data3)
After that, I created a new function called Listas() whose result is the set of IDs that appear in list 1 but not in list 2 or 3.
How can I use this result as the parameter to the teste() function? (A sketch of one way to do this is shown after the code.)
Right now I'm passing teste(ids), but ids comes from the first request (all of the project IDs); I need to pass the result of Listas() to teste() instead.
import requests
import xlwt
import openpyxl

def Sults():
    headers = {
        "Authorization": "xxxxx",
        "Content-Type": "application/json;charset=UTF-8"
    }
    global id
    global id_exclude
    global id_conclude
    global nomeUnidade
    global dataInicio
    global dataFinal
    global responsavel
    global dtConclusao
    id = []
    id_exclude = []
    id_conclude = []
    nomeUnidade = []
    dataInicio = []
    dataFinal = []
    responsavel = []
    dtConclusao = []
    for count1 in range(3):
        url = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z".format(count1)
        response = requests.get(url, headers=headers)
        data = response.json()
        url2 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=false".format(count1)
        response2 = requests.get(url2, headers=headers)
        data2 = response2.json()
        url3 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=true".format(count1)
        response3 = requests.get(url3, headers=headers)
        data3 = response3.json()
        # print(data)
        ids = unidades(data)
        Excel()
        unidades2(data2)
        unidades3(data3)
        Listas()
        teste(ids)

def unidades(data):
    # sheet.write(0, 5, "Modelo")
    for i in data['data']:
        id.append(i['id'])                             # project id
        nomeUnidade.append(i['nome'])                  # project name
        dataInicio.append(i['dtInicio'])               # start date
        dataFinal.append(i['dtFim'])                   # end date
        responsavel.append(i['responsavel']['nome'])   # responsible person
    return id   # the other lists are globals, so only the id list needs to be returned

def Excel():
    wb = openpyxl.Workbook()
    sheet = wb.active
    for i in range(len(id)):
        sheet.cell(row=i+1, column=1).value = id[i]
        sheet.cell(row=i+1, column=2).value = nomeUnidade[i]
        sheet.cell(row=i+1, column=3).value = dataInicio[i]
        sheet.cell(row=i+1, column=4).value = dataFinal[i]
        sheet.cell(row=i+1, column=5).value = responsavel[i]
    wb.save("NewUNIDADES6.xlsx")

def unidades2(data2):
    for j in data2['data']:
        id_exclude.append(j['id'])
    return id_exclude

def unidades3(data3):
    for k in data3['data']:
        id_conclude.append(k['id'])
    return id_conclude

def Listas():
    result = list(set(id) - set(id_exclude) - set(id_conclude))
    print("Implantação:")
    print(result)

def teste(result):
    listID = result
    print("List ID:")
    print(listID)
    headers = {
        "Authorization": "O3BldGxhbmQ7MTU5NTAxNTI1NTI1OA==",
        "Content-Type": "application/json;charset=UTF-8"
    }
    length = len(listID)
    nome = []
    codigoTarefa = []
    situacaoTarefa = []
    faseNome = []
    global ID_List
    ID_List = []   # ends up like [254, 254, 254, 254, 255, 255, 255, ...]: one project id per task row
    for li in range(length):
        for count in range(4):
            # start in the URL is the counter, i.e. the API pagination: it goes from 0 up to 3
            url = "https://api/api/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count)
            print("Print")
            print(url)
            response = requests.get(url, headers=headers)
            data = response.json()
            unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, listID[li])
    # print(nome)
    wb = openpyxl.Workbook()
    sheet = wb.active
    for i in range(len(nome)):
        sheet.cell(row=i+1, column=1).value = ID_List[i]
        sheet.cell(row=i+1, column=2).value = nome[i]
        sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
        sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
        sheet.cell(row=i+1, column=5).value = faseNome[i]
    wb.save("TarefasNEW.xlsx")
    # print(codigoTarefa)
    # print(situacaoTarefa)
    # print(faseNome)

def unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, ID):   # ID is the listID[li] value passed in from teste()
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("BACKOFFICE")
    coluna = 1
    for i in data['data']:
        nome.append(i['nome'])
        codigoTarefa.append(i['codigo'])
        situacaoTarefa.append(i['situacao'])
        faseNome.append(i['fase']['nome'])
        ID_List.append(ID)   # append the project id once per task row
        coluna += 1

if __name__ == '__main__':
    Sults()
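One way to make this work (a minimal sketch based on the code above, not a full rewrite): have Listas() return its result instead of only printing it, and pass that return value to teste() inside Sults().

def Listas():
    result = list(set(id) - set(id_exclude) - set(id_conclude))
    print("Implantação:")
    print(result)
    return result          # returning the list lets the caller reuse it

# inside Sults(), replace teste(ids) with:
teste(Listas())

With that change, teste() receives only the ids that are in the full list but in neither the excluded nor the concluded lists.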
Related
I have the code below, which requests an API. Every ID in listID loops four times in the URL, so if listID = [1,2,3,4,5] the URLs will be:
url = "https://xxxxapi/v1/implantacao/projeto/1/tarefa?start=0&limit=50" then
url = "https://xxxxapi/v1/implantacao/projeto/1/tarefa?start=1&limit=50" and so on; start goes from 0 to 3 for every id in the list.
I then save every response into an xls file, and that part works fine. For example, the four loops over one id normally return about 120 tasks. I want to write the ID for every task the code returns, in the line:
#sheet.cell(row=i+1, column=1).value = listID
def teste(id):
    listID = id
    headers = {
        "Authorization": "xxxxxxxxx",
        "Content-Type": "application/json;charset=UTF-8"
    }
    length = len(listID)
    nome = []
    codigoTarefa = []
    situacaoTarefa = []
    faseNome = []
    for li in range(length):
        for count in range(4):
            url = "https://xxxxapi/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count)
            response = requests.get(url, headers=headers)
            data = response.json()
            unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome)
    wb = openpyxl.Workbook()
    sheet = wb.active
    for i in range(len(nome)):
        #sheet.cell(row=i+1, column=1).value = listID
        sheet.cell(row=i+1, column=2).value = nome[i]
        sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
        sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
    wb.save("dados11.xlsx")

def unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome):
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("BACKOFFICE")
    coluna = 1
    for i in data['data']:
        nome.append(i['nome'])
        codigoTarefa.append(i['codigo'])
        situacaoTarefa.append(i['situacao'])
        coluna += 1

if __name__ == '__main__':
    Sults()
To be clearer, here is one example of the output for a single task at project ID 1:
INAUGURAÇÃO
T98
4
and this is the output I want (where 1 is the first item in listID, for example):
1
INAUGURAÇÃO
T98
4
How can I get it? Thanks for the help.
def teste(id):
    listID = id   # the list of project ids; each id should also end up in the output file
    #print (listID)
    headers = {
        "Authorization": "xxxxxxxxxx",
        "Content-Type": "application/json;charset=UTF-8"
    }
    # The counter starts low because I want the first page lookup to be at value 0, which fetches the first 50 tasks
    length = len(listID)
    nome = []
    codigoTarefa = []
    situacaoTarefa = []
    faseNome = []
    global ID_List
    ID_List = []   # ends up like [254, 254, 254, 254, 255, 255, 255, ...]: one project id per task row
    for li in range(length):
        for count in range(4):
            # start in the URL is the counter, i.e. the API pagination: it goes from 0 up to 3
            url = "https://xxxxxxxxx.com/api/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count)
            #print(url)
            response = requests.get(url, headers=headers)
            data = response.json()
            unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome, listID[li])
    #print(nome)
    wb = openpyxl.Workbook()
    sheet = wb.active
    for i in range(len(nome)):
        sheet.cell(row=i+1, column=1).value = ID_List[i]
        sheet.cell(row=i+1, column=2).value = nome[i]
        sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
        sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
    wb.save("dados12.xlsx")
    #print(codigoTarefa)
    #print(situacaoTarefa)
    #print(faseNome)

def unidades2(data, nome, codigoTarefa, situacaoTarefa, faseNome, ID):   # ID is the listID[li] value passed in from teste()
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("BACKOFFICE")
    coluna = 1
    for i in data['data']:
        nome.append(i['nome'])
        codigoTarefa.append(i['codigo'])
        situacaoTarefa.append(i['situacao'])
        ID_List.append(ID)   # append the project id once per task row
        coluna += 1

if __name__ == '__main__':
    Sults()
This will probably work, because for every four name elements you have one listID entry:
sheet.cell(row=i+1, column=1).value = listID[i/4]
Edit:
I realize now that your listID is actually a tuple and that i/4 produces a float, so I changed the code above to use integer division. This should resolve the float error.
sheet.cell(row=i+1, column=1).value = listID[i//4]
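As a quick illustration (a made-up example; this indexing only holds if every project id contributes exactly four rows to nome, otherwise the ID_List approach from the code above is the safer mapping):

listID = (254, 255, 256)                          # hypothetical ids, stored as a tuple
nome = ["task{}".format(n) for n in range(12)]    # pretend each id produced exactly 4 rows

for i in range(len(nome)):
    print(listID[i // 4], nome[i])                # rows 0-3 -> 254, rows 4-7 -> 255, rows 8-11 -> 256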
Since I am going to create a number of dataframes that I know won't fit inside a single Google worksheet (because of the column limit), I want to split the data across multiple worksheets. I'm using set_with_dataframe() and defining which worksheet each dataframe gets imported to, so my first thought was to create and define several worksheets and then use the same method. The problem is that I don't know how to "split" the data when there are no more columns left in the first worksheet (and then the second, the third, and so on); a sketch of one possible approach is shown after the code below.
I'm quite new to working with Python and I have been stuck on this for days, so any kind of help would be appreciated.
My code looks like this:
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
import traceback
import os

class DataScraper():
    def __init__(self, sheets):
        self.data_worksheet = sheets.data_worksheet
        self.total_urls = 0
        self.urls = self.getAllUrls(sheets.url_worksheet)

    def getAllUrls(self, urlWorkSheet):
        urls = urlWorkSheet.get_all_values()
        finalUrls = []
        for r in urls:
            # Get all urls
            modifiedUrls = [d for d in r[:14] if "https://" in d]
            if len(modifiedUrls) != 0:
                self.total_urls += len(modifiedUrls)
                finalUrls.append(modifiedUrls)
        return finalUrls

    def StartScrape(self):
        current_column_count = 1
        last_data_frame_max_width = 0
        current_element = 0
        for urlRow in self.urls:
            current_row_count = 1
            for url in urlRow:
                current_element += 1
                error = False
                page = requests.get(url)
                soup = BeautifulSoup(page.content, 'html.parser')
                try:
                    page = requests.get(url)
                    soup = BeautifulSoup(page.content, 'html.parser')
                    labels = []
                    results = []
                    tbl = soup.find('table')
                    for tr in tbl.findAll('tr'):
                        headers = [th.text.strip() for th in tr.findAll('th')]
                        data = [td.text.strip() for td in tr.findAll('td')]
                        labels.append(headers)
                        results.append(data)
                    final_results = []
                    for final_labels, final_data in zip(labels, results):
                        final_results.append({'Labels': final_labels, 'Data': final_data})
                    df = pd.DataFrame(final_results)
                    df['Labels'] = df['Labels'].str[0]
                    df['Data'] = df['Data'].str[0]
                    indexNames = df[df['Labels'] == 'Links'].index
                    df.drop(indexNames, inplace=True)
                    set_with_dataframe(self.data_worksheet, df, col=current_column_count, row=current_row_count, include_column_header=False)
                    current_row_count += df.shape[0]+2
                    if df.shape[1] > last_data_frame_max_width:
                        last_data_frame_max_width = df.shape[1]
                except Exception:
                    error = True
                finally:
                    print(f"Processed page {current_element}/{self.total_urls} with status: {'success' if not error else 'error'}")
            current_column_count += last_data_frame_max_width+5
            last_data_frame_max_width = 0

class Sheets():
    def __init__(self, filename, key):
        self.filename = filename
        self.key = key
        self.data_worksheet = None
        self.url_worksheet = None
        self.getSheets(self.getCredentials())

    def getCredentials(self):
        # sep = separator
        _ = os.path.normpath(__file__).split(os.sep)
        _.insert(1, "/")
        credentials = service_account.Credentials.from_service_account_file(os.path.join(os.path.join(*_[0:-1]), self.filename))
        return credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])

    def getSheets(self, scoped_credentials):
        gc = gspread.Client(auth=scoped_credentials)
        gc.session = AuthorizedSession(scoped_credentials)
        spreadsheet_key = gc.open_by_key(self.key)
        # Get sheet with data import
        self.data_worksheet = spreadsheet_key.worksheet("Data")
        # Get list with url's
        self.url_worksheet = url_worksheet = spreadsheet_key.worksheet("Felix Copy")

# Get sheets
sheets = Sheets("credentials.json", "key_id")
# Start scraping
scraper = DataScraper(sheets)
scraper.StartScrape()
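One possible way to handle the column limit described above (a hedged sketch with assumed names, not tested against the real spreadsheet): slice the dataframe into blocks of columns and write each block to its own worksheet with set_with_dataframe().

import pandas as pd
from gspread_dataframe import set_with_dataframe

MAX_COLS = 50   # assumed per-worksheet column budget; set this to the real limit

def write_in_chunks(spreadsheet, df, max_cols=MAX_COLS):
    # split df into blocks of at most max_cols columns, one worksheet per block
    for n, start in enumerate(range(0, df.shape[1], max_cols)):
        chunk = df.iloc[:, start:start + max_cols]
        ws = spreadsheet.add_worksheet(title="Data_{}".format(n + 1),
                                       rows=chunk.shape[0] + 1, cols=chunk.shape[1])
        set_with_dataframe(ws, chunk)

The worksheet titles (Data_1, Data_2, ...) and the add_worksheet sizing are assumptions; the key idea is the df.iloc column slicing.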
import requests
from bs4 import BeautifulSoup
import csv
import time

def fin_car(url):
    x = {}
    y = []
    page = ''
    while page == '':
        try:
            page = requests.get(url)
        except:
            print("Connection refused by the server..")
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
    #page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')

    Precio = []
    price = soup.find('div', 'price').text
    Precio = (str(price).strip())
    print(Precio)

    # ---------------- Tipo Propiedad / Tipo de oferta ----------------
    Tipo_Propiedad = []
    Tipo_de_oferta = []
    T_1 = soup.find('div', 'box').h1.text
    text = (str(T_1).strip())
    l = text.find(' ')
    m = text.find(' ', l+1)
    n = text.find(' ', m+1)
    Tipo_Propiedad = text[0:l]
    Tipo_de_oferta = text[m+1:n]
    print(Tipo_Propiedad)
    print(Tipo_de_oferta)

    # ---------------- Departamento / Ciudad / Zona / Barrio ----------------
    Departamento = []
    Ciudad = []
    Zona = []
    Barrio = []
    first = soup.find('div', 'breadcrumb left')
    link = first.find('div')
    a_link = link.findAll('a')
    box1 = []
    for row in a_link:
        box1.append(row.text)
    Departamento = (box1[1:2].pop())
    Ciudad = (box1[2:3].pop())
    Zona = (box1[3:4].pop())
    Barrio = (box1[4:5])
    print(Departamento)
    print(Ciudad)
    print(Zona)
    print(Barrio)

    # ---------------- Área / Habitaciones / Baños / Parqueaderos ----------------
    box_2 = soup.find('div', 'features clearfix')
    box_2_1 = box_2.findAll('span')
    box2 = []
    Área = []
    Habitaciones = []
    Baños = []
    Parqueaderos = []
    for row2 in box_2_1:
        box2.append(str(row2.text).strip())
    for i in box_2_1:
        a = box2[0:1].pop()
        b = box2[1:2].pop()
        c = box2[2:3].pop()
        d = box2[3:4].pop()
        a1 = a[0:a.find(' ')]
        Área = (a1)
        Habitaciones = (b.rstrip()[-1])
        Baños = (c.rstrip()[-1])
        Parqueaderos = (d)
    print(Área)
    print(Habitaciones)
    print(Baños)
    print(Parqueaderos)

    # ---------------- Área_Privada / Área_Const / Antigüedad / Admón / Estrato / Estado / Piso_No ----------------
    box_3 = soup.find('div', 'row features_2 ')
    box_3_1 = box_3.findAll('li')
    Área_Privada = []
    Área_Const = []
    Antigüedad = []
    Admón = []
    Estrato = []
    Estado = []
    Piso_No = []
    for li in box_3_1:
        heading_words = li.b.text.split()
        target_content = str(li.br.next_sibling).strip()
        if "privada:" in heading_words:
            Área_Privada = (target_content)
        elif "Const.:" in heading_words:
            Área_Const = (target_content)
        elif "Antigüedad:" in heading_words:
            Antigüedad = (target_content)
        elif "Admón:" in heading_words:
            Admón = (target_content)
        elif "Estrato:" in heading_words:
            Estrato = (target_content)
        elif "Estado:" in heading_words:
            Estado = (target_content)
        elif "Piso" in heading_words:
            Piso_No = (target_content)
    print(Área_Privada)
    print(Área_Const)
    print(Antigüedad)
    print(Admón)
    print(Estrato)
    print(Estado)
    print(Piso_No[0:1])

    # ---------------- Actualizado / Visitas / Código_FincaRaiz ----------------
    box4 = soup.find('div', 'box_content row')
    box4_1 = box4.findAll('span')
    vis = []
    Actualizado = []
    Visitas = []
    Código_FincaRaiz = []
    for i in box4_1:
        vis.append((str(i.text).strip()))
    for j in box4_1:
        e = vis[0:1].pop()
        f = vis[2:3].pop()
        Actualizado = e
        Código_FincaRaiz = f
    url = "https://www.fincaraiz.com.co/WebServices/Statistics.asmx/GetAdvertVisits?idAdvert={}&idASource=40&idType=1001".format(Código_FincaRaiz)
    page1 = requests.get(url)
    soup1 = BeautifulSoup(page1.content, 'lxml')
    visit1 = soup1.find('double').text
    Visitas = (visit1)
    print(Actualizado)
    print(Visitas)
    print(Código_FincaRaiz)

    # ---------------- build the row dictionary ----------------
    x['Código FincaRaiz'] = Código_FincaRaiz
    x['Departamento'] = Departamento
    x['Ciudad'] = Ciudad
    x['Zona'] = Zona
    x['Barrio'] = Barrio
    x['Tipo Propiedad'] = Tipo_Propiedad
    x['Tipo de oferta'] = Tipo_de_oferta
    x['Precio'] = Precio
    x['Área'] = Área
    x['Área Privada'] = Área_Privada
    x['Área Const.'] = Área_Const
    x['Antigüedad'] = Antigüedad
    x['Baños'] = Baños
    x['Habitaciones'] = Habitaciones
    x['Parqueaderos'] = Parqueaderos
    x['Admón'] = Admón
    x['Estrato'] = Estrato
    x['Estado'] = Estado
    x['Piso No.'] = Piso_No
    x['Actualizado'] = Actualizado
    x['Visitas'] = Visitas
    y.append(x)

x = {}
y = []
filename = 'Fincar.csv'
with open(filename, 'w', newline='') as f:
    w = csv.DictWriter(f, ['Código FincaRaiz', 'Departamento', 'Ciudad', 'Zona', 'Barrio', 'Tipo Propiedad', 'Tipo de oferta',
                           'Precio', 'Área', 'Área Privada', 'Área Const.', 'Antigüedad', 'Baños', 'Habitaciones',
                           'Parqueaderos', 'Admón', 'Estrato', 'Estado', 'Piso No.', 'Actualizado', 'Visitas'])
    w.writeheader()
    for x in y:
        w.writerow(x)

tab = []
xen = []
key_value = 'https://www.fincaraiz.com.co'
for i in range(2, 6):
    tab.append('https://www.fincaraiz.com.co/finca-raiz/?ad=30|{}||||1||||||||||||||||||||||1|||1||||||'.format(i))
for j in tab:
    page = requests.get(j)
    soup = BeautifulSoup(page.content, 'lxml')
    index = soup.findAll('div', 'span-title')
    for i in index:
        xen.append(i.find('a').get('href'))
for j in xen:
    url = (key_value + j)
    fin_car(url)
I'm trying to fetch values from a list of pages and save them to a CSV document, but the CSV ends up storing only the last value.
I've tried multiple approaches, but they all give the same output.
Also, columns with blank values should be filled with a nil value, but the file only shows the [] symbol.
I'm new to Python and finding it difficult to save the data. I'd appreciate your support with this task.
How should I proceed?
def fin_car(url):
    x = {}
    y = []
    ...

x = {}
y = []
These values are in different scopes. Assigning x inside fin_car doesn't affect it outside. You could change that using a global statement, but much better is to return from the function.
Even if you were changing the outside values of x and y, you only call fin_car long after writing to the CSV. The order of events in your code matters.
I suggest:
def fin_car(url):
    x = {}
    ...
    return x

with open...:
    w = csv.DictWriter(...)
    ...
    for j in tab:
        ...
    for j in xen:
        url = ...
        w.writerow(fin_car(url))
You don't need y at all.
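Putting that suggestion together (a hedged sketch, not the poster's exact code): return the dictionary from fin_car() and write one row per page as you go. csv.DictWriter's restval parameter also covers the second issue, filling any missing column with 'nil' instead of leaving it blank.

import csv

def fin_car(url):
    x = {}
    # ... scrape the page and fill x exactly as before ...
    return x

with open('Fincar.csv', 'w', newline='') as f:
    fields = ['Código FincaRaiz', 'Departamento', 'Ciudad']   # shortened here; use the full 21-column list from the question
    w = csv.DictWriter(f, fields, restval='nil')              # restval fills missing fields with 'nil'
    w.writeheader()
    for j in xen:                                             # xen is the list of links built earlier in the question code
        w.writerow(fin_car(key_value + j))                    # one row written per scraped page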
I'm trying to pull all of the 2016 NY Times articles that have the word "economy" in them using the Times' API. I get the following error message at the end of my code:
ValueError: dict contains fields not in fieldnames: 'abstract'
And here is my code:
from nytimesarticle import articleAPI
api = articleAPI('0282db2f333f4f4095edd19f0660c978')

articles = api.search(q = 'economy',
    fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
    begin_date = 20151231)

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date, query):
    all_articles = []
    for i in range(0, 100):
        articles = api.search(q = query,
            fq = {'source':['Reuters','AP', 'The New York Times']},
            begin_date = 20151231,
            end_date = 20160715,
            sort='oldest',
            page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

econ_all = []
for i in range(2015, 2016):
    print 'Processing' + str(i) + '...'
    econ_year = get_articles(str(i), 'economy')
    econ_all = econ_all + econ_year

import csv
keys = econ_all[0].keys()
with open('econ-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(econ_all)
It seems like my if statement should prevent the error. Also, if I use "writerow", as I've sometimes seen mentioned on here, I get the entire list of details without the CSV being created. Any help would be appreciated!
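For context on that error (a sketch, not from the original post): the fieldnames come from econ_all[0].keys(), so if the first article happens to lack 'abstract' while a later one has it, DictWriter raises exactly this ValueError. Building the fieldnames from every row avoids it; passing extrasaction='ignore' is the other option.

import csv

# build the fieldnames from every row, not just the first one
keys = sorted({k for row in econ_all for k in row})

with open('econ-mentions.csv', 'wb') as output_file:
    # or: csv.DictWriter(output_file, econ_all[0].keys(), extrasaction='ignore')
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(econ_all)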
I am not sure what your problem was, but this code creates a file econ-mentions.csv with content.
from nytimesarticle import articleAPI

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date, query):
    all_articles = []
    for i in range(0, 100):
        articles = api.search(q = query,
            fq = {'source':['Reuters','AP', 'The New York Times']},
            begin_date = 20151231,
            end_date = 20160715,
            sort='oldest',
            page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

if __name__ == "__main__":
    api = articleAPI('0282db2f333f4f4095edd19f0660c978')
    articles = api.search(q = 'economy',
        fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
        begin_date = 20151231)

    econ_all = []
    for i in range(2015, 2016):
        print 'Processing' + str(i) + '...'
        econ_year = get_articles(str(i), 'economy')
        econ_all = econ_all + econ_year

    import csv
    keys = econ_all[0].keys()
    with open('econ-mentions.csv', 'wb') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(econ_all)
import csv
import requests
import re
from bs4 import BeautifulSoup
import sys

reload(sys)
sys.setdefaultencoding('utf8')

#CREATE CSV FILE
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)

#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
    reader = csv.reader(f)
    matches = list(reader)

for id in matches:
    id = str(id)
    id = re.sub("[^0-9]", "", id)
    url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
    print (url)
    response = requests.get(url)
    html = response.content
    soup = BeautifulSoup(html)

    #GET TEAMS AND SCORES
    score = soup.findAll("div", {"class": "col-md-5 center"})
    team_home = score[0]
    team_home = str(team_home)
    team_home = re.search('title="(.*)" />', team_home)
    team_home = team_home.group(1)
    team_away = score[1]
    team_away = str(team_away)
    team_away = re.search('title="(.*)" />', team_away)
    team_away = team_away.group(1)
    goals_home = score[2]
    goals_home = str(goals_home)
    goals_home = re.sub('</h2></div>', '', goals_home)
    goals_home = re.sub('<div class="col-md-5 center"><h2>', '', goals_home)
    goals_away = score[3]
    goals_away = str(goals_away)
    goals_away = re.sub('</h2></div>', '', goals_away)
    goals_away = re.sub('<div class="col-md-5 center"><h2>', '', goals_away)

    #GET HOME STATS
    tables = soup.findChildren('table')
    stats_home = tables[0]
    list_of_rows_home = []
    for row in stats_home.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_home.append(list_of_cells)
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        cell = list_of_rows_home[i][2]
        cell = str(cell)
        goal = re.findall('goal', cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist', cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm', cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_home:
        del row[2]
    for i in range(len(list_of_rows_home)):
        row = list_of_rows_home[i]
        row.append(team_home)
        row.append(goals_home)
        row.append(team_away)
        row.append(goals_away)

    #GET AWAY STATS
    stats_away = tables[1]
    list_of_rows_away = []
    for row in stats_away.findChildren('tr')[1:]:
        list_of_cells = []
        for cell in row.findChildren('td')[0]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[1]:
            text = cell.text
            list_of_cells.append(text)
        for cell in row.findChildren('td')[2:]:
            list_of_cells.append(cell)
        list_of_rows_away.append(list_of_cells)
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        cell = list_of_rows_away[i][2]
        cell = str(cell)
        goal = re.findall('goal', cell)
        goal = goal.count('goal')
        goal = goal / 2
        assist = re.findall('assist', cell)
        assist = assist.count('assist')
        assist = assist / 2
        motm = re.findall('motm', cell)
        motm = motm.count('motm')
        row.append(goal)
        row.append(assist)
        row.append(motm)
    for row in list_of_rows_away:
        del row[2]
    for i in range(len(list_of_rows_away)):
        row = list_of_rows_away[i]
        row.append(team_away)
        row.append(goals_away)
        row.append(team_home)
        row.append(goals_home)

    #COMPILE INTO ONE TABLE
    list_of_rows = list_of_rows_home + list_of_rows_away

    #WRITE TO CSV
    writer.writerows(list_of_rows)
My input file is a basic spreadsheet with the match IDs lined up in column one. When the script creates the output file, it's blank, and I am not getting any error messages either.
The issue is in your regex search, so perhaps change it to:
team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)
Alternative:
team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)
The /> is not needed, and including it with the space makes the pattern fail to match, so group(1) is called on None, which raises an AttributeError and stops the script. If you want to keep the />, remove the space from your regex pattern, since the space is ultimately what kills it.
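A quick illustration of the difference (a hedged sketch with made-up markup; the real HTML from the site may differ):

import re

team_home = '<img src="logo.png" title="Arsenal"/>'      # no space before />

print(re.search('title="(.*)" />', team_home))            # None: the pattern demands a space before />
print(re.search('title="(.*)"', team_home).group(1))      # Arsenal
print(re.search('title="(.*)"/>', team_home).group(1))    # Arsenal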