I use the code below to export the number of searches for multiple US states, and I get the error "ResponseError: The request failed: Google returned a response with code 400."
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='en-US', tz=360)
colnames = ["keywords"]
df = pd.read_csv("keyword_list.csv", names=colnames)
df2 = df["keywords"].values.tolist()
df2.remove("Keywords")
dataset = []
for x in range(0, len(df2)):
    keywords = [df2[x]]
    pytrend.build_payload(
        kw_list=keywords,
        cat=0,
        timeframe='2020-01-01 2020-02-01',
        geo='US-MA,US-TX,US-NY,US-WA')
    data = pytrend.interest_over_time()
    if not data.empty:
        data = data.drop(labels=['isPartial'], axis='columns')
        dataset.append(data)
result = pd.concat(dataset, axis=1)
result.to_csv("US.csv")
The issue is with the geo parameter: you can only provide one region at a time, not a comma-separated list. You can try it like this:
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='en-US', tz=360)
colnames = ["keywords"]
df = pd.read_csv("keyword_list.csv", names=colnames)
df2 = df["keywords"].values.tolist()
df2.remove("Keywords")
dataset = []
for geo_code in ['US-MA', 'US-TX', 'US-NY', 'US-WA']:
    for x in range(0, len(df2)):
        keywords = [df2[x]]
        pytrend.build_payload(
            kw_list=keywords,
            cat=0,
            timeframe='2020-01-01 2020-02-01',
            geo=geo_code)
        data = pytrend.interest_over_time()
        if not data.empty:
            data = data.drop(labels=['isPartial'], axis='columns')
            data['geo'] = geo_code
            dataset.append(data)
result = pd.concat(dataset)
result.to_csv("US.csv")
I need the cells for d_from and d_to to advance with each pass of the loop, so that each iteration references the next cell down: d_from currently references cell B7, but the next iteration of the for loop should reference B8.
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='en-GB', tz=360)
wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1S4WbPwbVCHq5wWmycv6_CUoLe7_kpvEppJVgM9dQ2eU/edit#gid=0')  # gc: an authorized gspread client
sheet = wb.sheet1
df2 = sheet.col_values(5)
d_from = sheet.acell('B7').value
d_to = sheet.acell('C8').value
geo1 = sheet.acell('B10').value
dataset = []
for x in range(0, len(df2)):
    keywords = [df2[x]]
    pytrend.build_payload(
        kw_list=keywords,
        cat=0,
        timeframe=str(d_from + " " + d_to),
        geo=str(geo1))
    data = pytrend.interest_over_time()
    if not data.empty:
        data = data.drop(labels=['isPartial'], axis='columns')
        dataset.append(data)
result = pd.concat(dataset, axis=1)
result.to_csv('DE 2022 Search Trends.csv')
!cp 'DE 2022 Search Trends.csv' "/content/drive/MyDrive/Colab_Notebooks"  # copy the CSV just written
executionTime = (time.time() - startTime)
print('Execution time in sec.: ' + str(executionTime))
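A minimal sketch of one way to do this: derive the cell row from the loop counter instead of reading fixed cells, assuming the from/to dates sit in consecutive rows (B7/C8 for the first keyword, B8/C9 for the second, and so on; adjust the starting rows to match the sheet):

# Sketch: move the date lookups inside the loop and compute the row from x.
for x in range(len(df2)):
    d_from = sheet.acell(f'B{7 + x}').value
    d_to = sheet.acell(f'C{8 + x}').value
    keywords = [df2[x]]
    pytrend.build_payload(
        kw_list=keywords,
        cat=0,
        timeframe=f'{d_from} {d_to}',
        geo=str(geo1))
    data = pytrend.interest_over_time()
    if not data.empty:
        dataset.append(data.drop(labels=['isPartial'], axis='columns'))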
I used this script to scrape some data:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random
root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = [ '{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5,440,5) ]
comms = []
notes = []
#datestostay = []
dates = []
for url in urls:
    results = requests.get(url)
    #time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
    for container in commentary:
        comm = container.find('q', class_='IRsGHoPm').text.strip()
        comms.append(comm)
        comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
        rat = re.findall(r'\d+', str(comm1))
        rat1 = (str(rat))[2]  # e.g. str(['40']) -> "['40']", so index 2 is the first digit '4'
        notes.append(rat1)
        datereal = container.find("div", class_="_2fxQ4TOx").text
        date = datereal[-9:]  # keep the trailing month/year portion of the date string
        dates.append(date)

data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.strftime('%Y-%m')
data.to_csv('table4.csv', sep=';', index=False)
I load the data into my notebook: df4 = pd.read_csv('datatrip/table4.csv', sep = ';')
Here's what my database looks like right now:
[screenshot of the dataframe]
And I calculate some trigrams with these functions:
from sklearn.feature_extraction.text import CountVectorizer  # stop_words is defined elsewhere in the notebook

def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=False)
    return corpus
trigramlow = get_top_n_gram(df4['comms_clean'], (3,3), 50)
trigramlow = process(trigramlow)
And here's the result (I only give some rows, not the entire dataframe):
[screenshot of the trigram dataframe: Text, count]
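For reference, here is a tiny standalone example of what get_top_n_gram computes (toy corpus; stop_words omitted for simplicity):

from sklearn.feature_extraction.text import CountVectorizer

toy = ['worst hotel ever stayed', 'worst hotel ever seen']
vec = CountVectorizer(ngram_range=(3, 3)).fit(toy)
counts = vec.transform(toy).sum(axis=0)  # total count per trigram
freqs = sorted(((word, counts[0, idx]) for word, idx in vec.vocabulary_.items()),
               key=lambda t: t[1], reverse=True)
print(freqs)
# [('worst hotel ever', 2), ('hotel ever stayed', 1), ('hotel ever seen', 1)]  (ties in either order)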
And so here's my problem. When I use this function:
means = []
for i in range(0, 50):
    trigrambase = df4[df4['comms_clean'].str.contains(trigramlow.Text[i], regex=False, case=False, na=False)]
    mean = round(trigrambase['notes'].mean(), 2)
    means.append(mean)
trigramlow['means'] = means
it gives me this (again, only some rows, not the entire dataframe):
[screenshot of the means column]
I don't understand why, but some of the means are not calculated correctly.
Like this:
df20 = df4[df4['comms_clean'].str.contains('queensway bayswater tube',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 2.0
With the function, I obtained 1.0.
It seems that most of the means are calculated correctly, though:
df20 = df4[df4['comms_clean'].str.contains('worst hotel ever',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.0
df20 = df4[df4['comms_clean'].str.contains('hotel ever stayed',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.11
I cannot figure out where the problem is.
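One thing worth checking (a guess, not a confirmed diagnosis): trigramlow.Text[i] selects by index label, while trigramlow['means'] = means assigns by position. sort_values in process() is not a stable sort by default, so rows with tied counts can change position while keeping their original labels, and the two indexing schemes would then disagree for exactly those rows. A quick diagnostic that recomputes every mean positionally and prints the mismatches:

# Diagnostic sketch: recompute each trigram's mean with purely positional
# indexing and flag any row where it differs from the stored value.
for i in range(len(trigramlow)):
    phrase = trigramlow['Text'].iloc[i]
    sub = df4[df4['comms_clean'].str.contains(phrase, regex=False, case=False, na=False)]
    manual = round(sub['notes'].mean(), 2)
    if manual != trigramlow['means'].iloc[i]:
        print(phrase, trigramlow['means'].iloc[i], manual)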
I have no idea whether this can be done from Altair or pandas, but I'm looking for documentation on changing the date language of my chart.
Here's my code:
import pandas as pd
import altair as alt
from datetime import datetime, timedelta
url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_confirmed_mx.csv'
df = pd.read_csv(url, index_col=0)
#df = pd.read_csv(url)
df = df.loc['Colima','18-03-2020':'18-11-2020']
df = pd.DataFrame(df)
df.index = pd.to_datetime(df.index, format='%d-%m-%Y')
%run urban_theme.py
alt.Chart(df.reset_index()).mark_line().encode(
    alt.X('index:T', title=" "),
    alt.Y('Colima:Q', title=" "),
).properties(
    title="Casos acumulados",
)
Output: [line chart of cumulative cases for Colima, with the x-axis dates in the default English locale]
It's not well documented currently, but there is some relevant information in "How to set locale in Altair?".
You can set a Spanish time format locale for your chart like this:
import pandas as pd
import altair as alt
from datetime import datetime, timedelta
from urllib import request
import json
# fetch & enable a Spanish timeFormat locale.
with request.urlopen('https://raw.githubusercontent.com/d3/d3-time-format/master/locale/es-ES.json') as f:
    es_time_format = json.load(f)
alt.renderers.set_embed_options(timeFormatLocale=es_time_format)
url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_confirmed_mx.csv'
df = pd.read_csv(url, index_col=0)
#df = pd.read_csv(url)
df = df.loc['Colima','18-03-2020':'18-11-2020']
df = pd.DataFrame(df)
df.index = pd.to_datetime(df.index, format='%d-%m-%Y')
alt.Chart(df.reset_index()).mark_line().encode(
    alt.X('index:T', title=" "),
    alt.Y('Colima:Q', title=" "),
).properties(
    title="Casos acumulados",
    width=800
)
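If you export the chart instead of rendering it inline, the same locale can be attached to the saved HTML; as far as I know, Altair's save() forwards embed options like this:

# Sketch: pass the locale when saving to HTML (the filename is arbitrary).
chart = alt.Chart(df.reset_index()).mark_line().encode(
    alt.X('index:T', title=" "),
    alt.Y('Colima:Q', title=" "),
)
chart.save('casos_acumulados.html', embed_options={'timeFormatLocale': es_time_format})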
I'm trying to get some information from a webshop with Python.
I tried this one:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df) ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
There are 9 items in this webshop, but I only see 1 row in the CSV file.
I'm not entirely sure what you intend as the end result. Do you want to update an existing file, or gather the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement so each call to the function provides a new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
I am trying to calculate the duration of the drawdowns and the time to recovery for a stock series. I can calculate the drawdowns, but am struggling to get the duration and recovery time for each drawdown. So far I have this code:
import pandas as pd
import pickle
import xlrd
import numpy as np
np.random.seed(0)
df = pd.Series(np.random.randn(2500)*0.7+0.05, index=pd.date_range('1/1/2000', periods=2500, freq='D'))
df= 100*(1+df/100).cumprod()
df=pd.DataFrame(df)
df.columns = ['close']
df['ret'] = df.close / df.close.iloc[0]
df['modMax'] = df.ret.cummax()
df['modDD'] = 1-df.ret.div(df['modMax'])
groups = df.groupby(df['modMax'])
dd = groups[['modMax', 'modDD']].apply(lambda g: g[g['modDD'] == g['modDD'].max()])
top10dd = dd.sort_values('modDD', ascending=False).head(10)
top10dd
This gives the 10 highest drawdowns of the series but I also want the duration of the drawdown and time to recovery.
I solved the problem as follows:
def drawdown_group(df, index_list):
    group_max, dd_date = index_list
    ddGroup = df[df['modMax'] == group_max]
    group_length = len(ddGroup)
    group_dd = ddGroup['modDD'].max()  # the drawdown column is named modDD
    group_dd_length = len(ddGroup[ddGroup.index <= dd_date])
    group_start = ddGroup[0:1].index[0]
    group_end = ddGroup.tail(1).index[0]
    group_rec = group_length - group_dd_length
    #print(group_start, group_end, group_dd, dd_date, group_dd_length, group_rec, group_length)
    return group_start, group_end, group_max, group_dd, dd_date, group_dd_length, group_rec, group_length

dd_col = ('start', 'end', 'peak', 'dd', 'dd_date', 'dd_length', 'dd_rec', 'tot_length')
df_dd = pd.DataFrame(columns=dd_col)

for i in range(1, 11):  # 1..10, so all ten rows of top10dd are used
    index_list = top10dd[i-1:i].index.tolist()[0]
    #print(index_list)
    start, end, peak, dd, dd_date, dd_length, dd_rec, tot_length = drawdown_group(df, index_list)
    #print(start, end, dd, dd_date, dd_length, dd_rec, tot_length)
    df_dd.loc[i-1] = start, end, peak, dd, dd_date, dd_length, dd_rec, tot_length
Produces this table: [screenshot of df_dd with columns start, end, peak, dd, dd_date, dd_length, dd_rec, tot_length]
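As a quick sanity check on the assembled table: since group_rec = group_length - group_dd_length, each drawdown's peak-to-trough length plus its recovery time should equal its total length:

# dd_length (peak to trough) + dd_rec (trough to recovery) == tot_length
assert (df_dd['dd_length'] + df_dd['dd_rec'] == df_dd['tot_length']).all()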