Here is code I got from the web. When I execute it, it raises the following error; I am new to web scraping, so I am utterly confused by it.
Can anyone tell me where my code went wrong?
Thank you for your help!
from nytimesarticle import articleAPI

api = articleAPI('a0de895aa110431eb2344303c7105a9f')

articles = api.search( q = 'Obama',
    fq = {'headline':'Obama', 'source':['Reuters','AP', 'The New York Times']},
    begin_date = 20111231 )

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date,query):
    all_articles = []
    for i in range(0,100): #NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        articles = api.search(q = query,
            fq = {'source':['Reuters','AP', 'The New York Times']},
            begin_date = date + '0101',
            end_date = date + '1231',
            sort='oldest',
            page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

Amnesty_all = []
for i in range(1980,2014):
    print ('Processing' + str(i) + '...')
    Amnesty_year = get_articles(str(i),'Amnesty International')
    Amnesty_all = Amnesty_all + Amnesty_year

import csv
keys = Amnesty_all[0].keys()
with open('amnesty-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(Amnesty_all)
This is the output when I run it on Python 3.4:
OUTPUT:
Traceback (most recent call last):
  File "/Users/niharika/Documents/nyt.py", line 7, in <module>
    begin_date = 20111231 )
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 111, in search
    API_ROOT, response_format, self._options(**kwargs), key
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 84, in _options
    v = _format_fq(v)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/nytimesarticle.py", line 69, in _format_fq
    d[k] = '"' + v + '"'
TypeError: Can't convert 'bytes' object to str implicitly
source for code: http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
The error is telling you to convert v (the bytes object) to a string explicitly.
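In other words, on Python 3 you cannot concatenate bytes with str, so the value has to be decoded back to str before the quoting step. A minimal illustration of the failing operation and the explicit conversion (the names here mirror the traceback, not the library's actual internals):

v = 'Obama'.encode('utf8')              # the library's utf-8 encoding step turns the str into bytes
# '"' + v + '"'                         # TypeError: Can't convert 'bytes' object to str implicitly
quoted = '"' + v.decode('utf8') + '"'   # decoding explicitly makes the concatenation work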
Basically, I copied the code from NYTimesArticleAPI/NYTimesArticleAPI/search_api.py and replaced the contents of my installed nytimesarticle.py file with it.
That removed the
def _utf8_encode(self, d):
......
method, which was what stopped the nytimesarticle module from working with Python 3 by throwing TypeError: must be str, not bytes in the API's search function.
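If you would rather not edit the installed file by hand, a workaround that should amount to the same thing (a sketch based on the assumption that _utf8_encode(self, d) only byte-encodes the option values, which is unnecessary on Python 3) is to turn it into a no-op before searching:

from nytimesarticle import articleAPI

# Assumption: _utf8_encode just utf-8 encodes the options dict, so returning the
# dict unchanged keeps the values as str and avoids the bytes/str concatenation error.
articleAPI._utf8_encode = lambda self, d: d

api = articleAPI('a0de895aa110431eb2344303c7105a9f')
articles = api.search(q = 'Obama', begin_date = 20111231)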
Good morning,
My Twitter scraper, which has been working fine for months, now gets the error below. It also pulls only a limited number of tweets for the date range. Any help would be greatly appreciated.
This is the error I get when running on Spyder 5.0.5 with Python 3.8.
It was working fine up until September.
ERROR CODE
[SpyderKernelApp] ERROR | Exception in message handler:
Traceback (most recent call last):
  File "C:\Users\james.coldman\Anaconda3\lib\site-packages\spyder_kernels\comms\frontendcomm.py", line 164, in poll_one
    asyncio.run(handler(out_stream, ident, msg))
  File "C:\Users\james.coldman\AppData\Roaming\Python\Python38\site-packages\nest_asyncio.py", line 32, in run
    return loop.run_until_complete(future)
  File "C:\Users\james.coldman\AppData\Roaming\Python\Python38\site-packages\nest_asyncio.py", line 60, in run_until_complete
    f = asyncio.ensure_future(future, loop=self)
  File "C:\Users\james.coldman\Anaconda3\lib\asyncio\tasks.py", line 673, in ensure_future
    raise TypeError('An asyncio.Future, a coroutine or an awaitable is '
TypeError: An asyncio.Future, a coroutine or an awaitable is required
[SpyderKernelApp] ERROR | Exception in message handler:
Traceback (most recent call last):
  File "C:\Users\james.coldman\Anaconda3\lib\site-packages\spyder_kernels\comms\frontendcomm.py", line 164, in poll_one
    asyncio.run(handler(out_stream, ident, msg))
  File "C:\Users\james.coldman\AppData\Roaming\Python\Python38\site-packages\nest_asyncio.py", line 32, in run
    return loop.run_until_complete(future)
  File "C:\Users\james.coldman\AppData\Roaming\Python\Python38\site-packages\nest_asyncio.py", line 60, in run_until_complete
    f = asyncio.ensure_future(future, loop=self)
  File "C:\Users\james.coldman\Anaconda3\lib\asyncio\tasks.py", line 673, in ensure_future
    raise TypeError('An asyncio.Future, a coroutine or an awaitable is '
TypeError: An asyncio.Future, a coroutine or an awaitable is required
FULL SCRIPT IN USE
import nest_asyncio
import twint
import pandas as pd
import re
import os
import random
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS

nest_asyncio.apply()

RE_EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)

def strip_emoji(text):
    return RE_EMOJI.sub(r'', text)

def sentiment_score(tweet):
    score = analyser.polarity_scores(tweet)
    return score['neg'], score['neu'], score['pos'], score['compound']

def scrape(search_term, start_date, end_date):
    random_code = '%030x' % random.randrange(16**30)
    #configuration
    config = twint.Config()
    config.Search = search_term
    config.Lang = "en"
    config.Limit = 1000000
    config.Since = start_date
    config.Until = end_date
    config.Store_csv = True
    config.Output = "twintTerms"+search_term+str(random_code)+".csv"
    #running search
    twint.run.Search(config)
    #convert csv to excel file and append to df_list
    df = pd.read_csv("twintTerms"+search_term+str(random_code)+".csv")
    df['Search Term'] = search_term
    df['Start Date'] = start_date
    df['End Date'] = end_date
    en = df[df['language'] == "en"]
    #delete csv ready for next search term
    if os.path.exists("twintTerms"+search_term+str(random_code)+".csv"):
        os.remove("twintTerms"+search_term+str(random_code)+".csv")
    else:
        pass
    return en

def fetch_scrapes(search_dict):
    search_terms = search_dict['Search Term']
    start_dates = search_dict['Start Date']
    end_dates = search_dict['End Date']
    responses = []
    for key in search_terms.keys(): #build a list of futures for async to run
        results = scrape(str(search_terms[key]), str(start_dates[key]), str(end_dates[key]))
        responses.append(results)
    return responses

if __name__ == "__main__":
    print("Twitter Scrape Starting...")
    #delete any leftover files
    if os.path.exists("twitterResults.xlsx"):
        os.remove("twitterResults.xlsx")
    else:
        pass
    #create results directory
    if not os.path.exists('data/results'):
        os.makedirs('data/results')
    #pull in search terms
    search_df = pd.read_excel(r"data/TwitterTerms.xlsx")
    search_dict = search_df.to_dict()
    #run scrape
    dataframe_list = fetch_scrapes(search_dict)
    tweet_df = pd.concat(dataframe_list)
    tweet_df = tweet_df[['date','time','tweet','replies_count','retweets_count','likes_count','Search Term','Start Date','End Date']]
    tweet_df = tweet_df.drop_duplicates(['tweet','Search Term'])
    #remove emojis
    tweet_df['tweet'] = tweet_df['tweet'].apply(strip_emoji)
    #sentiment analysis
    analyser = SentimentIntensityAnalyzer()
    tweet_df[['Negative','Neutral','Positive','Compound']] = tweet_df['tweet'].apply(lambda x : pd.Series(sentiment_score(x)))
    tweet_df.to_excel(r"data/results/rawData.xlsx")
    grouped = tweet_df.groupby(['Search Term','Start Date','End Date']).agg(['mean','count'])
    grouped.to_csv(r'data/results/twitterResults.csv')
    #Most frequent words across topics
    extra_stopwords_list = [term.split(" ") for term in list(set(tweet_df['Search Term'].tolist()))]
    extra_stopwords_list = [i for j in extra_stopwords_list for i in j]
    words_list = []
    for team in list(set(tweet_df['Search Term'].tolist())):
        print(team)
        team_df = tweet_df[(tweet_df['Search Term'] == team)]
        raw_string = ' '.join(team_df['tweet'])
        no_links = re.sub(r'http\S+', '', raw_string)
        STOPWORDS = set(list(STOPWORDS) + extra_stopwords_list + ['amp'])
        wordcloud = WordCloud(stopwords = STOPWORDS).generate(no_links)
        most_popular = list(wordcloud.words_.keys())
        most_popular = [word for word in most_popular if len(word) > 2]
        perc_list = [[word,len(team_df[team_df['tweet'].str.contains(word)])] for word in most_popular]
        words_list.append(perc_list)
        df = pd.DataFrame()
        df['Word']= [i[0] for i in perc_list]
        df['Tweets Featured In'] = [i[1] for i in perc_list]
    test = []
    for lst in words_list:
        for lst2 in lst:
            word = lst2[0]
            counter = []
            for lst3 in words_list:
                for lst4 in lst3:
                    if word == lst4[0]:
                        counter.append(word)
            if len(counter) > 1:
                test.append(word)
    test = list(set(test))
    df_list = []
    for team in list(set(tweet_df['Search Term'].tolist())):
        team_df = tweet_df[tweet_df['Search Term'] == team]
        perc_list = [[word, (len(team_df[team_df['tweet'].str.contains(word)])/len(team_df))*100] for word in test]
        df = pd.DataFrame()
        df['Word'] = [i[0] for i in perc_list]
        df[team] = [i[1] for i in perc_list]
        df.set_index(team)
        df_list.append(df)
    words_df = pd.concat(df_list, axis = 1)
    words_df.to_excel(r"data/results/wordFrequencies.xlsx")
    print("Scrape Completed...")
We resolved the issue temporarily by downgrading Spyder. We updated our environment.yml and re-created the environment, but if you want to downgrade spyder inside an existing environment you should be able to run this command:
conda install spyder==4.2.5
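For reference, a pinned environment.yml entry might look something like the following (the environment name and the other packages here are illustrative, not an exact copy of what we used):

name: tweet-scraper          # illustrative name
channels:
  - defaults
dependencies:
  - python=3.8
  - spyder=4.2.5             # pinned below the release that introduced the conflict
  - pandas
  - pip
  - pip:
      - nest_asyncio
      - twint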
It looks like there was a change to the Spyder kernel around June that caused a conflict between Spyder and nest_asyncio. A PR with a potential fix was put up in early September but has not been merged yet; it is tagged for the 2.1.2 Spyder kernel release. You can follow the issue here.
import pandas as pd
import requests
import json
import datetime
import csv

def get_pushshift_data(after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
    print(url)
    r = requests.get(url).json()
    # data = json.loads(r.text, strict=False)
    return r['data']

def collect_subData(subm):
    subData = list() #list to store data points
    title = subm['title']
    url = subm['url']
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"
    try:
        # returns the body of the posts
        body = subm['selftext']
    except KeyError:
        body = ''
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
    numComms = subm['num_comments']
    permalink = subm['permalink']
    subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
    subStats[subId] = subData

def update_subFile():
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count+=1
        print(str(upload_count) + " submissions have been uploaded into a csv file")

# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())

data = get_pushshift_data(after, before, sub)
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount+=1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)

print(len(data))
update_subFile()
The first call to get_pushshift_data(after, before, sub), just above the while loop, scrapes the data with no error. But when I do the same thing again inside the while loop, with a different value for the after variable (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
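The Expecting value: line 1 column 1 (char 0) message means the response body was empty or not valid JSON (which Pushshift can return after a rate limit or a server hiccup), so one option is to check the response before decoding it and retry. A minimal defensive sketch, assuming the same endpoint and that a short pause-and-retry is acceptable (get_pushshift_data_safe is just a name for the sketch):

import time
import requests

def get_pushshift_data_safe(after, before, sub, retries=3, pause=5):
    # Same URL as get_pushshift_data above; retry a few times if the body is not valid JSON.
    url = ('https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) +
           '&before=' + str(before) + '&subreddit=' + str(sub) +
           '&sort=asc&sort_type=created_utc&size=400')
    for attempt in range(retries):
        r = requests.get(url)
        if r.status_code == 200:
            try:
                return r.json()['data']
            except ValueError:     # JSONDecodeError is a subclass of ValueError
                pass               # body was not JSON; fall through and retry
        time.sleep(pause)          # back off before the next attempt
    return []                      # give up; the caller's while loop then stops cleanly

Swapping this in for get_pushshift_data lets the while loop end naturally when nothing usable comes back instead of crashing mid-crawl.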
The following code applies one update to my project.
tagPath = ["Package_PLC/Tags/CCN_CNV01_MX001_A_FLT"]
alarmConfig = {"BLD_CHN01_VS001_A_FLT_C":[["enabled","Value","0"]]}
system.tag.editAlarmConfig(tagPath, alarmConfig)
I need to do this hundreds of times.
I am trying to build tagPath and alarmConfig dictionaries from a csv file.
Sample csv:
Equipment,Item Name,Alarm Tag,Alarm Name,Cluster Name,Category,Alarm Desc,Delay,Help,Comment,Variable Tag A,Variable Tag B,Custom 1,Custom 2,Custom 3,Custom 4,Custom 5,Custom 6,Custom 7,Custom 8,Paging,Paging Group,Area,Privilege,Historize,Project,SEQUENCE,TAGGENLINK,EDITCODE,LINKED
"","","BLD_CHN01_VS001_A_FLT_C","BLD_CHN01_VS001_A_FLT_C","","","Catch-up Conveyor / Chain Comms Fault","00:00:00","","BLD_CHN01_VS001_A_FLT_C","BLD_CHN01_VS001_A_FLT_C","KFS_ZNE02_WRM","STUN","","","","","","","","","","1","","","","","","",""
"","","BLD_CHN01_VS001_A_FLT_V","BLD_CHN01_VS001_A_FLT_V","","","Catch-up Conveyor / Chain VSD Fault","00:00:00","","BLD_CHN01_VS001_A_FLT_V","BLD_CHN01_VS001_A_FLT_V","","STUN","","","","","","","","","","1","","","","","","",""
"","","BLD_CHN01_VS001_S_HTY","BLD_CHN01_VS001_S_HTY","","","Catch-up Conveyor / Chain Cicuit Breaker","00:00:00","","BLD_CHN01_VS001_S_HTY","NOT BLD_CHN01_VS001_S_HTY","KFS_ZNE02_WRM","STUN","","","","","","","","","","1","","","","","","",""
This is what I have so far:
import system
import csv

path = system.file.openFile('csv')

if path != None:
    print "path found"
    f=open(path)
    reader = csv.DictReader(f)
    path1 = "Package_PLC/Tags/"
    tagpath = []
    alarmConfig = []
    state = 0
    comment = ""
    for i in reader:
        if row['Alarm Tag'] == 'ECN*' || 'FCN*' || 'PAC*':
            tagpath.append(path1 + int(row['Alarm Tag']))
            alarmname = row[Alarm Tag]
            if row[Variable Tag A] == "NOT*":
                state = 0
            else:
                state = 1
            comment = row[Alarm Desc]
            alarmConfig.append({alarmname: [["setpointA","Value",state],
                ["displayPath","Value","Packing"],
                ["notes","Value",comment]]
                })
    system.tag.editAlarmConfig(tagPaths, alarmConfig)
    f.close()
The following error gets thrown.
Traceback (most recent call last):
  File "<buffer>", line 28, in <module>
TypeError: list indices must be integers
This worked.
import string
import system
import csv

path = system.file.openFile('csv')

if path != None:
    print "path found"
    f=open(path)
    reader = csv.DictReader(f)
    path1 = "Package_PLC/Tags/"
    tagpath = []
    alarmConfig = {}
    state = 0
    readerlist = list(reader)
    for stuff in readerlist:
        if "PAC" in stuff['Alarm Tag'] or "ECN" in stuff['Alarm Tag'] or "CCN" in stuff['Alarm Tag'] or "FCN" in stuff['Alarm Tag'] :
            tagpath = []
            tagpath.append(str( path1 + stuff['Alarm Tag']))
            if "NOT" in stuff['Variable Tag A']:
                state = 0
            else :
                state = 1
            display = ['displayPath','Value','Packing']
            notes = ['notes','Value',str(stuff['Alarm Desc'])]
            setpointA =['setpointA','Value', str(state)]
            alarmConfig = {}
            alarmConfig[stuff['Alarm Tag']] = [display,notes,setpointA]
            system.tag.editAlarmConfig(tagpath, alarmConfig)
    f.close()
It's difficult to help you because:
The sample file doesn't trigger anything
You didn't provide the system module
But still here's my attempt:
import os.path
import csv

input_file_name = 'Sample.csv'

if os.path.exists(input_file_name):
    with open(input_file_name, newline='') as input_file:
        events = csv.DictReader(input_file)
        data_extracted = [
            (
                current_event['Alarm Tag'],
                0 if current_event['Variable Tag A'].startswith('NOT') else 1,
                current_event['Alarm Desc']
            )
            for current_event in events
            if current_event['Alarm Tag'][:3] in ('ECN', 'FCN', 'PAC')
        ]

        tag_paths = [f'Package_PLC/Tags/{x[0]}' for x in data_extracted]

        alarm_config = {
            alarm_name: [
                ['setpointA', 'Value', state],
                ['displayPath', 'Value', 'Packing'],
                ['notes', 'value', comment]
            ]
            for (alarm_name, state, comment) in data_extracted
        }

        system.tag.editAlarmConfig(tag_paths, alarm_config)
For some reason my code (below) raises a ValueError which I cannot understand. Please also point out anything else wrong with my code. You can find the project I am trying to do at
http://www.ocr.org.uk/Images/226767-unit-j276-03-programming-project-task-1-sample-non-exam-assessment.pdf
fileid = "details for nea.txt"

ID = []
surname = []
forename = []
dob = []
addr = []
addrT = []
addrTh = []
addrF = []
addrFi = []
homNum = []
gend = []
tutor = []
schoolEm = []

def Read():
    file = open(fileid, "r")
    Record = file.readline()
    for line in Record:
        line = line.strip()
        A,B,C,D,E,F,G,H,I,J,K,L,M = line.split(',')
        ID.append(A)
        surname.append(B)
        forename.append(C)
        dob.append(D)
        addr.append(E)
        addrT.append(F)
        addrTh.append(G)
        addrF.append(H)
        addrFi.append(I)
        homNum.append(J)
        gend.append(K)
        tutor.append(L)
        schoolEm.append(M)
    file.close()

def Save():
    Record = []
    file = open(fileid,"w")
    for i in range(len(ID)):
        Record.append(ID[i] +","+surname[i]+","+forename[i]+","+dob[i]+","+addr[i]+","+addrT[i]+","+addrTh[i]+","+addrF[i]+","+addrFi[i]+","+homNum[i]+","+gend[i]+","+tutor[i]+","+schoolEm[i]+"\n")
    file.writelines(Record)
    file.close()

Read()
print(ID)
print(surname)
The text file I used is as follows:
01,abe,fat,01/02/02,5,Stoney Lane,Stur,Dorset,DR101LM,0123,M,C,email#sc
The lists titled addr, addrT and so on represent the different lines of the address.
Put the last three lines inside main; the ValueError should go away.
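If it helps, one way to read that suggestion (assuming "main" means the usual if __name__ == "__main__": guard) is:

if __name__ == "__main__":
    Read()
    print(ID)
    print(surname)

That way the file is only read when the script is run directly, not when it is imported.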
I'm trying to pull all of the 2016 NY Times articles that have the word "economy" in them using the Times' API. I get the following error message at the end of my code:
ValueError: dict contains fields not in fieldnames: 'abstract'
And here is my code:
from nytimesarticle import articleAPI

api = articleAPI('0282db2f333f4f4095edd19f0660c978')

articles = api.search( q = 'economy',
    fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
    begin_date = 20151231)

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date,query):
    all_articles = []
    for i in range(0,100):
        articles = api.search(q = query,
            fq = {'source':['Reuters','AP', 'The New York Times']},
            begin_date = 20151231,
            end_date = 20160715,
            sort='oldest',
            page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

econ_all = []
for i in range(2015,2016):
    print 'Processing' + str(i) + '...'
    econ_year = get_articles(str(i),'economy')
    econ_all = econ_all + econ_year

import csv
keys = econ_all[0].keys()
with open('econ-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(econ_all)
It seems like my if statement should prevent the error. Also, if I use "writerow", as I've seen sometimes mentioned on here, I get the entire list of details without the csv being created. Any help would be appreciated!
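For what it's worth, csv.DictWriter raises exactly this ValueError when a row contains a key that is not in fieldnames; since keys is taken from econ_all[0], and 'abstract' and 'snippet' are only added when they are not None, the first article simply didn't have an 'abstract' while a later one did. A small sketch of one way around it (variable names follow the code above; the open mode is the Python 3 form, keep 'wb' if you are on Python 2):

import csv

# Build the fieldnames from the union of keys across all parsed articles, since
# 'abstract' and 'snippet' are only present on some of the dicts.
keys = sorted({k for article in econ_all for k in article})

with open('econ-mentions.csv', 'w', newline='') as output_file:
    # restval='' fills in the columns a given article is missing.
    dict_writer = csv.DictWriter(output_file, fieldnames=keys, restval='')
    dict_writer.writeheader()
    dict_writer.writerows(econ_all)

Another option is to keep keys = econ_all[0].keys() and pass extrasaction='ignore' to DictWriter, but that silently drops the 'abstract' column from rows that do have it, so taking the union is usually safer.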
I am not sure what your problem was, but this code creates a file econ-mentions.csv with content.
from nytimesarticle import articleAPI

def parse_articles(articles):
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10] # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        locations = []
        for x in range(0,len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        subjects = []
        for x in range(0,len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date,query):
    all_articles = []
    for i in range(0,100):
        articles = api.search(q = query,
            fq = {'source':['Reuters','AP', 'The New York Times']},
            begin_date = 20151231,
            end_date = 20160715,
            sort='oldest',
            page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

if __name__ == "__main__":
    api = articleAPI('0282db2f333f4f4095edd19f0660c978')
    articles = api.search( q = 'economy',
        fq = {'headline':'economy', 'source':['Reuters','AP', 'The New YorkTimes']},
        begin_date = 20151231)

    econ_all = []
    for i in range(2015,2016):
        print 'Processing' + str(i) + '...'
        econ_year = get_articles(str(i),'economy')
        econ_all = econ_all + econ_year

    import csv
    keys = econ_all[0].keys()
    with open('econ-mentions.csv', 'wb') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(econ_all)