I'm trying to pull all of the 2016 NY Times articles that have the word "economy" in them using the Times' API. I get the following error message at the end of my code:
ValueError: dict contains fields not in fieldnames: 'abstract'
And here is my code:
from nytimesarticle import articleAPI
api = articleAPI('0282db2f333f4f4095edd19f0660c978')
articles = api.search( q = 'economy',
fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
begin_date = 20151231)
def parse_articles(articles):
news = []
for i in articles['response']['docs']:
dic = {}
dic['id'] = i['_id']
if i['abstract'] is not None:
dic['abstract'] = i['abstract'].encode("utf8")
dic['headline'] = i['headline']['main'].encode("utf8")
dic['desk'] = i['news_desk']
dic['date'] = i['pub_date'][0:10] # cutting time of day.
dic['section'] = i['section_name']
if i['snippet'] is not None:
dic['snippet'] = i['snippet'].encode("utf8")
dic['source'] = i['source']
dic['type'] = i['type_of_material']
dic['url'] = i['web_url']
dic['word_count'] = i['word_count']
locations = []
for x in range(0,len(i['keywords'])):
if 'glocations' in i['keywords'][x]['name']:
locations.append(i['keywords'][x]['value'])
dic['locations'] = locations
subjects = []
for x in range(0,len(i['keywords'])):
if 'subject' in i['keywords'][x]['name']:
subjects.append(i['keywords'][x]['value'])
dic['subjects'] = subjects
news.append(dic)
return(news)
def get_articles(date,query):
all_articles = []
for i in range(0,100):
articles = api.search(q = query,
fq = {'source':['Reuters','AP', 'The New York Times']},
begin_date = 20151231,
end_date = 20160715,
sort='oldest',
page = str(i))
articles = parse_articles(articles)
all_articles = all_articles + articles
return(all_articles)
econ_all = []
for i in range(2015,2016):
print 'Processing' + str(i) + '...'
econ_year = get_articles(str(i),'economy')
econ_all = econ_all + econ_year
import csv
keys = econ_all[0].keys()
with open('econ-mentions.csv', 'wb') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(econ_all)
It seems like my if statement should prevent the error. Also, if I use writerow, as I've sometimes seen suggested on here, I get the entire list of details printed without a CSV being created. Any help would be appreciated!
I am not sure what your problem was, but this code creates a file econ-mentions.csv with content:
from nytimesarticle import articleAPI
def parse_articles(articles):
news = []
for i in articles['response']['docs']:
dic = {}
dic['id'] = i['_id']
if i['abstract'] is not None:
dic['abstract'] = i['abstract'].encode("utf8")
dic['headline'] = i['headline']['main'].encode("utf8")
dic['desk'] = i['news_desk']
dic['date'] = i['pub_date'][0:10] # cutting time of day.
dic['section'] = i['section_name']
if i['snippet'] is not None:
dic['snippet'] = i['snippet'].encode("utf8")
dic['source'] = i['source']
dic['type'] = i['type_of_material']
dic['url'] = i['web_url']
dic['word_count'] = i['word_count']
locations = []
for x in range(0,len(i['keywords'])):
if 'glocations' in i['keywords'][x]['name']:
locations.append(i['keywords'][x]['value'])
dic['locations'] = locations
subjects = []
for x in range(0,len(i['keywords'])):
if 'subject' in i['keywords'][x]['name']:
subjects.append(i['keywords'][x]['value'])
dic['subjects'] = subjects
news.append(dic)
return(news)
def get_articles(date,query):
all_articles = []
for i in range(0,100):
articles = api.search(q = query,
fq = {'source':['Reuters','AP', 'The New York Times']},
begin_date = 20151231,
end_date = 20160715,
sort='oldest',
page = str(i))
articles = parse_articles(articles)
all_articles = all_articles + articles
return(all_articles)
if __name__ == "__main__":
api = articleAPI('0282db2f333f4f4095edd19f0660c978')
articles = api.search( q = 'economy',
fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
begin_date = 20151231)
econ_all = []
for i in range(2015,2016):
print 'Processing' + str(i) + '...'
econ_year = get_articles(str(i),'economy')
econ_all = econ_all + econ_year
import csv
keys = econ_all[0].keys()
with open('econ-mentions.csv', 'wb') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(econ_all)
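If the original ValueError does come back, a likely cause (a guess, since it depends on which article the API returns first) is that keys is taken from econ_all[0].keys() alone: when the first parsed article has no abstract, 'abstract' never makes it into the fieldnames, and the first later row that does carry an abstract makes DictWriter raise exactly that error. A minimal sketch of a guard is to build the header from every row (and, optionally, tell DictWriter to ignore unexpected keys):

import csv

# Collect the union of keys across all parsed articles, so optional
# fields such as 'abstract' and 'snippet' are always in the header.
keys = sorted({k for row in econ_all for k in row})

with open('econ-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys, extrasaction='ignore')
    dict_writer.writeheader()
    dict_writer.writerows(econ_all)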
Related
I'm struggling with a problem that will probably be simple to solve. The code below performs 3 requests to an API:
To get all of the project IDs (data)
To return the excluded IDs (data2)
To return the concluded projects (data3)
After that, I created a new function called Listas() whose result is the elements of list 1 that do not appear in list 2 or list 3.
How can I use this result as a parameter to the teste() function?
Right now I'm passing teste(ids), but ids comes from the first request (all of the project IDs), and I need to use the result of Listas() as the argument to teste() (see the sketch after the code below).
import requests
import xlwt
import openpyxl
def Sults():
headers = {
"Authorization":"xxxxx",
"Content-Type":"application/json;charset=UTF-8"
}
global id
global id_exclude
global id_conclude
global nomeUnidade
global dataInicio
global dataFinal
global responsavel
global dtConclusao
id = []
id_exclude = []
id_conclude = []
nomeUnidade = []
dataInicio = []
dataFinal = []
responsavel = []
dtConclusao = []
for count1 in range(3):
url = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z".format(count1)
response = requests.get(url, headers=headers)
data = response.json()
url2 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=false".format(count1)
response2 = requests.get(url2, headers=headers)
data2 = response2.json()
url3 = "https://api/api/v1/implantacao/projeto?start={}&dtInicio=2022-01-01T18:02:55Z&concluido=true".format(count1)
response3 = requests.get(url3, headers=headers)
data3 = response3.json()
# print(data)
ids = unidades(data)
Excel()
unidades2(data2)
unidades3(data3)
Listas()
teste(ids)
def unidades(data):
#sheet.write(0, 5, "Modelo")
for i in data['data']:
id.append(i['id']) #append id list
nomeUnidade.append(i['nome']) #append id list
dataInicio.append(i['dtInicio']) #append id list
dataFinal.append(i['dtFim']) #append id list
responsavel.append(i['responsavel']['nome']) #append id list
return id
return nomeUnidade
return dataInicio
return dataFinal
return responsavel
def Excel():
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(id)):
sheet.cell(row=i+1, column=1).value = id[i]
sheet.cell(row=i+1, column=2).value = nomeUnidade[i]
sheet.cell(row=i+1, column=3).value = dataInicio[i]
sheet.cell(row=i+1, column=4).value = dataFinal[i]
sheet.cell(row=i+1, column=5).value = responsavel[i]
wb.save("NewUNIDADES6.xlsx")
def unidades2(data2):
for j in data2['data']:
id_exclude.append(j['id'])
return id_exclude
def unidades3(data3):
for k in data3['data']:
id_conclude.append(k['id'])
return id_conclude
def Listas():
result = list(set(id) - set(id_exclude) - set(id_conclude))
print("Implantação:")
print(result)
def teste(result):
listID = (result)
print("List ID:")
print (listID)
headers = {
"Authorization":"O3BldGxhbmQ7MTU5NTAxNTI1NTI1OA==",
"Content-Type":"application/json;charset=UTF-8"
}
length = len(listID)
nome = []
codigoTarefa = []
situacaoTarefa = []
faseNome = []
global ID_List
ID_List = [] # will hold the project id once per task row, e.g. [254, 254, 254, 255, 255, ...]
for li in range(length):
for count in range(4):
# the start parameter inside the url is the counter, i.e. it goes up to 4, starting from 0
url = "https://api/api/v1/implantacao/projeto/{}/tarefa?start={}&limit=50".format(listID[li], count) # here I take the id from the list built above; count is the API pagination
print("Print")
print(url)
response = requests.get(url, headers=headers)
data = response.json()
unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, listID[li]) # listID[li] is the current project id
#print(nome)
wb = openpyxl.Workbook()
sheet = wb.active
for i in range(len(nome)):
sheet.cell(row=i+1, column=1).value = ID_List[i]
sheet.cell(row=i+1, column=2).value = nome[i]
sheet.cell(row=i+1, column=3).value = codigoTarefa[i]
sheet.cell(row=i+1, column=4).value = situacaoTarefa[i]
sheet.cell(row=i+1, column=5).value = faseNome[i]
wb.save("TarefasNEW.xlsx")
#print(codigoTarefa)
#print(situacaoTarefa)
#print(faseNome)
def unidades10(data, nome, codigoTarefa, situacaoTarefa, faseNome, ID): # ID receives listID[li] from the call in teste()
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("BACKOFFICE")
coluna = 1
for i in data['data']:
nome.append(i['nome'])
codigoTarefa.append(i['codigo'])
situacaoTarefa.append(i['situacao'])
faseNome.append(i['fase']['nome'])
ID_List.append(ID)# append here
coluna +=1
if __name__ == '__main__':
Sults()
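A minimal sketch of the change being asked about (keeping the question's globals and names, and assuming Listas only needs to hand its filtered list to teste) is to return the result and pass it along:

def Listas():
    # ids that are in the full list but in neither the excluded nor the concluded lists
    result = list(set(id) - set(id_exclude) - set(id_conclude))
    print("Implantação:")
    print(result)
    return result

# inside Sults(), instead of teste(ids):
filtered_ids = Listas()
teste(filtered_ids)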
import requests
from bs4 import BeautifulSoup
import csv
import time
def fin_car(url):
x = {}
y = []
page = ''
while page == '':
try:
page = requests.get(url)
except:
print("Connection refused by the server..")
print("Let me sleep for 5 seconds")
print("ZZzzzz...")
time.sleep(5)
print("Was a nice sleep, now let me continue...")
continue
#page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
Precio = []
price = soup.find('div' , 'price').text
Precio = (str(price).strip())
print (Precio)
#-------------------------------------------------------------------------------------------------------------#
# #Tipo Propiedad #Tipo de oferta #
#-------------------------------------------------------------------------------------------------------------#
Tipo_Propiedad = []
Tipo_de_oferta = []
T_1 = soup.find('div' , 'box').h1.text
text = (str(T_1).strip())
l = text.find(' ')
m = text.find(' ', l+1)
n = text.find(' ', m+1)
Tipo_Propiedad = text[0 : l]
Tipo_de_oferta = text[m+1 : n]
print (Tipo_Propiedad)
print (Tipo_de_oferta)
#-------------------------------------------------------------------------------------------------------------#
# #Departamento #Ciudad #Zona #Barrio #
#-------------------------------------------------------------------------------------------------------------#
Departamento = []
Ciudad = []
Zona = []
Barrio = []
first = soup.find('div' , 'breadcrumb left')
link = first.find('div')
a_link = link.findAll('a')
box1 = []
for row in a_link:
box1.append(row.text)
Departamento = (box1[1:2].pop())
Ciudad = (box1[2:3].pop())
Zona = (box1[3:4].pop())
Barrio = (box1[4:5])
print (Departamento)
print (Ciudad)
print (Zona)
print (Barrio)
#-------------------------------------------------------------------------------------------------------------#
# #Área #Habitaciones #Baños #Parqueaderos #
#-------------------------------------------------------------------------------------------------------------#
box_2 = soup.find('div' ,'features clearfix')
box_2_1 = box_2.findAll('span')
box2 = []
Área=[]
Habitaciones = []
Baños = []
Parqueaderos = []
for row2 in box_2_1:
box2.append(str(row2.text).strip())
for i in box_2_1:
a = box2[0:1].pop()
b = box2[1:2].pop()
c = box2[2:3].pop()
d = box2[3:4].pop()
a1 = a[0 : a.find(' ')]
Área = (a1)
Habitaciones = (b.rstrip()[-1])
Baños = (c.rstrip()[-1])
Parqueaderos =(d)
print (Área)
print (Habitaciones)
print (Baños)
print (Parqueaderos)
#-------------------------------------------------------------------------------------------------------------#
# #Área_Privada #Área_Const #Antigüedad #Admón #Estrato #Estado #Piso_No #
#-------------------------------------------------------------------------------------------------------------#
box_3 = soup.find('div' ,'row features_2 ')
box_3_1 = box_3.findAll('li') #
Área_Privada = []
Área_Const = []
Antigüedad = []
Admón = []
Estrato = []
Estado = []
Piso_No = []
for li in box_3_1:
heading_words = li.b.text.split() #
target_content = str(li.br.next_sibling).strip() #
if "privada:" in heading_words:
Área_Privada = (target_content) #
elif "Const.:" in heading_words:
Área_Const = (target_content)
elif "Antigüedad:" in heading_words:
Antigüedad = (target_content)
elif "Admón:" in heading_words:
Admón = (target_content)
elif "Estrato:" in heading_words:
Estrato = (target_content)
elif "Estado:" in heading_words:
Estado = (target_content)
elif "Piso" in heading_words:
Piso_No = (target_content)
print (Área_Privada) #
print (Área_Const)
print (Antigüedad)
print (Admón)
print (Estrato) #
print (Estado)
print (Piso_No[0:1])
#-------------------------------------------------------------------------------------------------------------#
# #Actualizado #Visitas #Código_FincaRaiz # #
#-------------------------------------------------------------------------------------------------------------#
box4 = soup.find('div' , 'box_content row')
box4_1 = box4.findAll('span')
vis = []
Actualizado = []
Visitas = []
Código_FincaRaiz = []
for i in box4_1:
vis.append((str(i.text).strip()))
for j in box4_1:
e = vis[0:1].pop()
f = vis[2:3].pop()
Actualizado = e
Código_FincaRaiz = f
url="https://www.fincaraiz.com.co/WebServices/Statistics.asmx/GetAdvertVisits?idAdvert={}&idASource=40&idType=1001".format(Código_FincaRaiz) #
page1 = requests.get(url)
soup1 = BeautifulSoup(page1.content , 'lxml')
visit1 = soup1.find('double').text
Visitas = (visit1)
print (Actualizado)
print (Visitas)
print (Código_FincaRaiz)
#-------------------------------------------------------------------------------------------------------------#
x['Código FincaRaiz'] = Código_FincaRaiz
x['Departamento'] = Departamento
x['Ciudad'] = Ciudad
x['Zona'] = Zona
x['Barrio'] = Barrio
x['Tipo Propiedad'] = Tipo_Propiedad
x['Tipo de oferta'] = Tipo_de_oferta
x['Precio'] = Precio
x['Área'] = Área
x['Área Privada'] = Área_Privada
x['Área Const.'] = Área_Const
x['Antigüedad'] = Antigüedad
x['Baños'] = Baños
x['Habitaciones'] = Habitaciones
x['Parqueaderos'] = Parqueaderos
x['Admón'] = Admón
x['Estrato'] = Estrato
x['Estado'] = Estado
x['Piso No.'] = Piso_No
x['Actualizado'] = Actualizado
x['Visitas'] = Visitas
y.append(x)
x = {}
y = []
filename = 'Fincar.csv'
with open(filename, 'w', newline='') as f:
w = csv.DictWriter(f,['Código FincaRaiz','Departamento','Ciudad','Zona','Barrio', 'Tipo Propiedad', 'Tipo de oferta',
'Precio' , 'Área' , 'Área Privada' , 'Área Const.', 'Antigüedad', 'Baños' , 'Habitaciones',
'Parqueaderos' , 'Admón', 'Estrato' , 'Estado' , 'Piso No.' , 'Actualizado', 'Visitas'])
w.writeheader()
for x in y:
w.writerow(x)
tab = []
xen = []
key_value = 'https://www.fincaraiz.com.co'
for i in range(2,6):
tab.append('https://www.fincaraiz.com.co/finca-raiz/?ad=30|{}||||1||||||||||||||||||||||1|||1||||||'.format(i))
for j in tab:
page = requests.get(j)
soup = BeautifulSoup(page.content , 'lxml')
index = soup.findAll('div' , 'span-title')
for i in index:
xen.append(i.find('a').get('href'))
for j in xen:
url = (key_value + j)
fin_car(url)
I'm trying to fetch values from a list of pages and save them to a CSV file; however, the CSV only ends up storing the last value.
I've tried multiple ways, but it always gives the same output.
Also, the columns with blank values need to be filled with a nil value, but at the moment they only contain the [] symbol.
I'm new to Python and finding it difficult to get the data saved. I'd appreciate your support with this task.
How should I proceed?
def fin_car(url):
x = {}
y = []
...
x = {}
y = []
These values are in different scopes. Assigning x inside fin_car doesn't affect it outside. You could change that using a global statement, but it is much better to return the value from the function.
Even if you were changing the outside values of x and y, you only call fin_car long after writing to the CSV. The order of events in your code matters.
I suggest:
def fin_car(url):
x = {}
...
return x
with open...:
w = csv.DictWriter(...)
...
for j in tab:
...
for j in xen:
url = ...
w.writerow(fin_car(url))
You don't need y at all.
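For concreteness, a fuller sketch of that shape, keeping the names from the question (the field list and the body of fin_car are abbreviated here, and restval is one way to fill the blank columns the question mentions with a placeholder instead of []):

import csv
import requests
from bs4 import BeautifulSoup

FIELDNAMES = ['Código FincaRaiz', 'Departamento', 'Ciudad', 'Zona', 'Barrio',
              'Tipo Propiedad', 'Tipo de oferta', 'Precio']  # ...extend with the remaining columns

def fin_car(url):
    # Scrape one listing page and return its fields as a single dict.
    soup = BeautifulSoup(requests.get(url).content, 'lxml')
    row = {'Precio': soup.find('div', 'price').text.strip()}
    # ...populate the remaining keys exactly as in the original fin_car...
    return row

base = 'https://www.fincaraiz.com.co'
listing_urls = []
for i in range(2, 6):
    index_url = base + '/finca-raiz/?ad=30|{}||||1||||||||||||||||||||||1|||1||||||'.format(i)
    index_soup = BeautifulSoup(requests.get(index_url).content, 'lxml')
    for div in index_soup.findAll('div', 'span-title'):
        listing_urls.append(base + div.find('a').get('href'))

with open('Fincar.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, FIELDNAMES, restval='nil')  # missing keys are written as 'nil'
    writer.writeheader()
    for url in listing_urls:
        writer.writerow(fin_car(url))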
Here is code that I got from the web. When I execute it, it gives the following error. I am new to web scraping, so I'm utterly confused by it.
Can anyone tell me where my code went wrong?
Thank you for your help!
from nytimesarticle import articleAPI
api = articleAPI('a0de895aa110431eb2344303c7105a9f')
articles = api.search( q = 'Obama',
fq = {'headline':'Obama', 'source':['Reuters','AP', 'The New York Times']},
begin_date = 20111231 )
def parse_articles(articles):
news = []
for i in articles['response']['docs']:
dic = {}
dic['id'] = i['_id']
if i['abstract'] is not None:
dic['abstract'] = i['abstract'].encode("utf8")
dic['headline'] = i['headline']['main'].encode("utf8")
dic['desk'] = i['news_desk']
dic['date'] = i['pub_date'][0:10] # cutting time of day.
dic['section'] = i['section_name']
if i['snippet'] is not None:
dic['snippet'] = i['snippet'].encode("utf8")
dic['source'] = i['source']
dic['type'] = i['type_of_material']
dic['url'] = i['web_url']
dic['word_count'] = i['word_count']
# locations
locations = []
for x in range(0,len(i['keywords'])):
if 'glocations' in i['keywords'][x]['name']:
locations.append(i['keywords'][x]['value'])
dic['locations'] = locations
# subject
subjects = []
for x in range(0,len(i['keywords'])):
if 'subject' in i['keywords'][x]['name']:
subjects.append(i['keywords'][x]['value'])
dic['subjects'] = subjects
news.append(dic)
return(news)
def get_articles(date,query):
all_articles = []
for i in range(0,100): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
articles = api.search(q = query,
fq = {'source':['Reuters','AP', 'The New York Times']},
begin_date = date + '0101',
end_date = date + '1231',
sort='oldest',
page = str(i))
articles = parse_articles(articles)
all_articles = all_articles + articles
return(all_articles)
Amnesty_all = []
for i in range(1980,2014):
print ('Processing' + str(i) + '...')
Amnesty_year = get_articles(str(i),'Amnesty International')
Amnesty_all = Amnesty_all + Amnesty_year
import csv
keys = Amnesty_all[0].keys()
with open('amnesty-mentions.csv', 'wb') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(Amnesty_all)
This is the output when I run it on Python 3.4:
Traceback (most recent call last):
File "/Users/niharika/Documents/nyt.py", line 7, in <module>
begin_date = 20111231 )
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 111, in search
API_ROOT, response_format, self._options(**kwargs), key
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 84, in _options
v = _format_fq(v)
File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 69, in _format_fq
d[k] = '"' + v + '"'
TypeError: Can't convert 'bytes' object to str implicitly
source for code: http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
The error is telling you to convert v (the bytes object) to a string explicitly.
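For instance, a minimal illustration (outside the library; the variable name is just for the example) of what that explicit conversion looks like:

# Python 3 refuses to mix bytes and str implicitly
v = b'The New York Times'                # a bytes value, as produced by the old utf-8 encoding step
quoted = '"' + v.decode('utf-8') + '"'   # decode explicitly, then the concatenation works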
Basically, I copied the code from NYTimesArticleAPI/NYTimesArticleAPI/search_api.py and used it to replace my installed nytimesarticle file, nytimesarticle.py.
That removed
def _utf8_encode(self, d):
......
which was what prevented the nytimesarticle module from working with Python 3, throwing TypeError: must be str, not bytes on the API's search function.
I need to access the data added to data_list in the get_data_list function from within the second function, get_month_averages, but when I try to do that it says data_list is undefined. How do I do this?
import csv
def get_data_list(data_file):
data_file = open("table.csv", "r")
data_list = []
for line_str in data_file:
data_list.append(line_str.strip().split(','))
return data_list
def get_month_averages(data_list):
date_list = []
vol_list = []
adjclos_list = []
for row in data_list:
date_list.append(row[0])
vol_list.append(row[5])
adjclos_list.append(row[6])
all_list = [date_list, vol_list, adjclos_list]
return all_list
print (get_month_averages(data_list))
This one works: get_data_list builds the list and returns it, and the returned value is then passed explicitly to get_month_averages, rather than referring to a name that only exists inside the first function.
import csv
def get_data_list():
data_file = open("table.csv", "r")
data_list = []
for line_str in data_file:
data_list.append(line_str.strip().split(','))
return data_list
def get_month_averages(data_list):
date_list = []
vol_list = []
adjclos_list = []
for row in data_list:
date_list.append(row[0])
vol_list.append(row[5])
adjclos_list.append(row[6])
all_list = [date_list, vol_list, adjclos_list]
return all_list
dList = get_data_list()
mAvg = get_month_averages(dList)
print(mAvg)
output_rdpartition = mp.Queue()
def read_partition_zipfile(infile,stop_words,startline,endline):
# endline = startline + 100
chunk_user_d = defaultdict(lambda: defaultdict(list))
chunk_user_withoutstamp_d = defaultdict(list)
with gzip.open(in_file, "rb") as f:
for j, line in enumerate(f):
if j >= startline and j < endline:
if j%10000==0 : print "processed",j,"lines"
line = line[:-1].split("|:|")
time_stamp = int(line[0])
user_id = line[-1]
keywords=line[1].split(',')
keywords = [item.lower() for item in keywords if len(item)>=2]
keywords = [item for item in keywords if item not in stop_words]
# print 'user_id', user_id
# print 'time_stamp', time_stamp
# print 'keywords',keywords
chunk_user_d[user_id][time_stamp] += keywords
chunk_user_withoutstamp_d[user_id] +=keywords
# print chunk_user_withoutstamp_d,'chunk_user_withoutstamp_d'
# return chunk_user_d, chunk_user_withoutstamp_d
output_rdpartition.put((chunk_user_d,chunk_user_withoutstamp_d))
def main():
start_time = datetime.datetime.now()
print("at the start of main")
user_id ='1ss7fef4'
lenth = 0
tf_idf = defaultdict(int)
key_dic = defaultdict(float)
time_latest = 0
processes_rd = [mp.Process(target = read_partition_zipfile, args =(in_file, stop_words, p_index[j], p_index[j+1])) for j in range(0,3)]
for p in processes_rd:
p.start()
results_rd = [output_rdpartition.get() for p in processes_rd]
# results_rd[0]is the chunkuser ,results_rd[1]is the chunkuser_without stamp
print results_rd
if __name__ == '__main__':
stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
stop_words = stop_words.split(",")
in_file = 'uniq.txt.gz'
p_index = range(0,28000000,2800000)
main()
It seems to be caused by a queue issue: I can print within the function, but I cannot get the function's output back.
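One possible culprit (a guess, since no traceback from the worker is shown): objects put on a multiprocessing.Queue have to be picklable, and chunk_user_d is a defaultdict whose default factory is a lambda, which cannot be pickled. When that happens, the put() fails inside the child and the parent's get() never returns. A minimal sketch of a workaround is to convert to plain dicts before putting them on the queue:

# inside read_partition_zipfile, in place of the existing put():
plain_user_d = {user: dict(stamps) for user, stamps in chunk_user_d.items()}
plain_user_withoutstamp_d = dict(chunk_user_withoutstamp_d)
output_rdpartition.put((plain_user_d, plain_user_withoutstamp_d))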