I am trying to web-scrape information using Selenium. The code works for a single item, but when I pass a list I get the output below.
Actual output / expected output: (screenshots from the original post, not reproduced here)
term=["Atta","Sugar"]
def get_link(term,page):
for term in term:
pin(Pincode)
grocery="https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
term = term.replace(' ', '+')
stem = grocery.format(term)
url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
next=url_template+str(page)
#print(next)
return next
def PID():
for page in range(1,5):
path=get_link(term,page)
driver.get(path)
id=driver.find_elements_by_xpath('//div[#data-id]')
for i in id:
results=i.get_attribute('data-id')
#print(results)
PIDs.append(results)
Search_Term.append(term)
PID()
ID={'Query':Search_Term,'PID_s':PIDs}
Output=pd.DataFrame(ID)
print(Output)
Maybe it would be better to put the for loop over the terms inside the PID function. Try it like below:
terms = ["Atta", "Sugar"]
def get_link(term, page):
# Not sure what pin(Pincode) line is doing
grocery = "https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
term = term.replace(' ', '+')
#print(term)
stem = grocery.format(term)
url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
next = url_template + str(page)
# print(next)
return next
def PID():
for term in terms:
for page in range(1, 5):
path = get_link(term, page)
driver.get(path)
id = driver.find_elements_by_xpath('//div[#data-id]')
for i in id:
results = i.get_attribute('data-id')
print(f"{term}:{results}")
# PIDs.append(results)
# Search_Term.append(term)
PID()
Atta:FLRFDPRFNGYJ95KD
Atta:FLRETEFHENWKNJQE
...
Sugar:SUGG4SFGSP6TCQ48
Sugar:SUGEUD25B6YCCNGM
...
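If you still want the DataFrame that the original code builds, a minimal sketch (reusing the same driver, terms, get_link and pandas import, plus the PIDs and Search_Term lists from the question) would be to append inside the inner loop and build the frame at the end:

PIDs = []
Search_Term = []

def PID():
    for term in terms:
        for page in range(1, 5):
            driver.get(get_link(term, page))
            for i in driver.find_elements_by_xpath('//div[@data-id]'):
                PIDs.append(i.get_attribute('data-id'))
                Search_Term.append(term)  # keep the query aligned with each PID

PID()
Output = pd.DataFrame({'Query': Search_Term, 'PID_s': PIDs})
print(Output)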
I am using PyLucene 9.4.1 to index a document and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document but PyLucene is unable to find in the index.
This is my code to index the document:
(The document can be downloaded from here.)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'

def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1

indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help in understanding the problem is appreciated.
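One thing worth double-checking: the index is built with EnglishAnalyzer (which stems terms), while the query is parsed with StandardAnalyzer (which does not), so a word like 'baby' may be stored in a stemmed form that the unstemmed query never matches. A small sketch to compare what the two analyzers produce for the keyword, assuming the same PyLucene setup as above plus an import of CharTermAttribute:

from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

def analyze(analyzer, field, text):
    # Collect the terms the analyzer emits for the given text
    stream = analyzer.tokenStream(field, text)
    term_att = stream.addAttribute(CharTermAttribute.class_)
    stream.reset()
    terms = []
    while stream.incrementToken():
        terms.append(term_att.toString())
    stream.close()
    return terms

print(analyze(EnglishAnalyzer(), 'Title', 'baby'))    # stemmed form, e.g. ['babi']
print(analyze(StandardAnalyzer(), 'Title', 'baby'))   # unstemmed, ['baby']

If the two lists differ, parsing the query with the same analyzer that built the index (EnglishAnalyzer) should make the search find the documents.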
This is a simple piece of code to build a URL with search parameters. It actually works, but I think it needs to be optimized.
def target_url(search_term, include_term, intext_term, target_site_in, page):
    base_template_0 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}+site:{target_site_in}&hl=en&rlz='
    base_template_1 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}&hl=en&rlz='
    base_template_2 = f'https://www.google.com/search?q={search_term}+"{include_term}"&hl=en&rlz='
    base_template_3 = f'https://www.google.com/search?q={search_term}&hl=en&rlz='

    search_term = search_term.replace(' ', '+')

    base_url_0 = base_template_0.format(search_term)
    base_url_1 = base_template_1.format(search_term)
    base_url_2 = base_template_2.format(search_term)
    base_url_3 = base_template_3.format(search_term)

    url_template_0 = base_url_0 + '&start={}'
    url_template_1 = base_url_1 + '&start={}'
    url_template_2 = base_url_2 + '&start={}'
    url_template_3 = base_url_3 + '&start={}'

    if page == 0 and search_term and include_term and intext_term and target_site:
        return base_url_0
    if page == 0 and search_term and include_term and intext_term:
        return base_url_1
    if page == 0 and search_term and include_term:
        return base_url_2
    if page == 0 and search_term:
        return base_url_3
    else:
        if search_term and include_term and intext_term and target_site:
            return url_template_0.format(page)
        if search_term and include_term and intext_term:
            return url_template_1.format(page)
        if search_term and include_term:
            return url_template_2.format(page)
        if search_term:
            return url_template_3.format(page)
Four parameters drive the result: search_term, include_term, intext_term and target_site_in; depending on which of them are set, a different URL is built. Can anyone give me a better idea for optimizing this?
Instead of having multiple template strings and selecting between them, you can write a method that builds the final search query:
def get_search_query(search_term, include_term, intext_term, target_site_in):
    response = search_term.replace(' ', '+')
    if include_term:
        response = f"{response}+{include_term}"
    if intext_term:
        response = f"{response}+intext:{intext_term}"
    if target_site_in:
        response = f"{response}+site:{target_site_in}"
    return response
Now you can call it from your method:
def target_url(search_term, include_term, intext_term, target_site_in, page):
    query = get_search_query(search_term, include_term, intext_term, target_site_in)
    url = f'https://www.google.com/search?q={query}&hl=en&rlz='
    if page != 0:
        # keep the original &start= pagination parameter
        url = f"{url}&start={page}"
    return url
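A quick usage sketch (the argument values below are made up just to show the shape of the calls):

# page 0 returns the plain search URL; any other page appends the pagination parameter
print(target_url('python tutorial', 'beginner', 'loops', 'stackoverflow.com', 0))
print(target_url('python tutorial', 'beginner', '', '', 10))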
I'm just a few hours into learning Python, so please go easy on me! I want to scrape scores and scorers off a website. I've been able to do that, but I'm only getting one scorer (if there is one!); when there are multiple goal scorers I only get the first. The part where I'm trying to collect multiple scorers is under '# Home Scorers'.
My code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.skysports.com/football-results"

match_results = {}
match_details = {}
match_no = 0

response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')

matches = soup.find_all('div', {'class': 'fixres__item'})

for match in matches:
    try:
        match_url_get = match.find('a', {'class': 'matches__item matches__link'}).get('href')
        match_url = match_url_get if match_url_get else "unknown"
        event_id = match_url[-6:]

        match_response = requests.get(match_url)
        match_data = match_response.text
        match_soup = BeautifulSoup(match_data, 'html.parser')

        # Match Details
        match_date = match_soup.find('time', {'class': 'sdc-site-match-header__detail-time'}).text
        match_location = match_soup.find('span', {'class': 'sdc-site-match-header__detail-venue'}).text
        match_info = match_soup.find('p', {'class': 'sdc-site-match-header__detail-fixture'}).text

        # Home Scores & Team
        home_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--home'})
        for home_detail in home_details:
            home_team = home_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            home_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-home'})
            home_score = home_score_get.text if home_score_get else "none"

        # Home Scorers
        home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
        for home_scorer_detail in home_scorer_details:
            goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
            goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"

        # Away Scores & Team
        away_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--away'})
        for away_detail in away_details:
            away_team = away_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            away_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-away'})
            away_score = away_score_get.text if away_score_get else "none"

        # Away Scorers
        away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
        for away_scorer_detail in away_scorer_details:
            away_goal_scorer_get = away_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            away_goal_scorer = away_goal_scorer_get.text if away_goal_scorer_get else "none"
            away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"

        print("Match: ", event_id, "Match Date:", match_date, "Match Location:", match_location, "Match Info:", match_info, "\nResult: ", home_team, home_score, away_team, away_score)
        print("Home Scorer:", goal_scorer, "Minute:", goal_score_minute, "\nAway Scorer:", away_goal_scorer, "Minute:", away_goal_score_minute)
        print(match_date)
    except:
        pass

    match_no += 1
    match_results[match_no] = [event_id, home_team, home_score, away_team, away_score, match_url, match_date, match_location, match_info]
    match_details[match_no] = [event_id, goal_scorer, goal_score_minute, away_goal_scorer, away_goal_score_minute]

Period = "2021-22"
print("Total Matches: ", match_no)

match_results = pd.DataFrame.from_dict(match_results, orient='index', columns=['Event_ID:', 'Home Team:', 'Home Score:', 'Away Team:', 'Away Score:', 'Link:', 'Match Date:', 'Match Location:', 'Match Info:'])
match_results.to_csv("Python/FL/Premier League Results (SkySports.com) " + Period + ".csv")

match_details = pd.DataFrame.from_dict(match_details, orient='index', columns=['Event_ID:', 'Home Goal:', 'Home Goal Minute:', 'Away Goal:', 'Away Goal Minute:'])
match_details.to_csv("Python/FL/Premier League Details (SkySports.com) " + Period + ".csv")
So the bit that's not working correctly is:
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
for home_scorer_detail in home_scorer_details:
    goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
    goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
Any ideas how I can return multiple rows for that bit?!
Thanks in advance :)
home_scorer_details only has 1 item, the unordered list itself.
To get all the scorers you need to get the items in that list.
The following code, which is pretty rough, will create a list of dictionaries where each dictionary has the name of the scorer and the minute(s) they scored.
You could use similar code to get all the away scorers.
Like I said, this code is rough and needs refining, but it should give you a start.
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
home_scorers = []
for home_scorer_detail in home_scorer_details[0].find_all('li'):
    goal_scorer = home_scorer_detail.text
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
    home_scorers.append({'scorer': goal_scorer, 'minute': goal_score_minute})
print(home_scorers)
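For the away side the same pattern applies, using the synopsis-away list from your original code (a rough sketch mirroring the block above):

# Away Scorers
away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
away_scorers = []
for away_scorer_detail in away_scorer_details[0].find_all('li'):
    away_goal_scorer = away_scorer_detail.text
    away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
    away_scorers.append({'scorer': away_goal_scorer, 'minute': away_goal_score_minute})
print(away_scorers)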
When I run the code, it gives me \r\n with spaces. I have tried to remove \r\n from the result, but it didn't work. This is the code, please check it out.
def parse_subtitles(self, response):
    items = FetchingItem()

    Arabic_price = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
    Chinese_price = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()

    names_list = ['Arabic_price', 'Chinese_price']
    for names in names_list:
        result = [re.sub('\r\n\s+', ' ', text) for text in names]

    items['Arabic_price'] = Arabic_price
    items['Chinese_price'] = Chinese_price

    yield items
Not sure what you want exactly, but this code works:
def parse_subtitles(self, response):
    items = FetchingItem()

    results = {}
    results['Arabic_price'] = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
    results['Chinese_price'] = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()

    names_list = ['Arabic_price', 'Chinese_price']
    for name in names_list:
        # Collapse runs of \r, \n and whitespace into a single space
        results[name] = [re.sub(r'[\r\n\s]+', ' ', text) for text in results[name]]

    items['Arabic_price'] = results['Arabic_price']
    items['Chinese_price'] = results['Chinese_price']

    yield items
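As an aside, the likely reason the original loop appeared to do nothing is that "for names in names_list" iterates over the two field names as strings, so the list comprehension ran over the individual characters of 'Arabic_price' and 'Chinese_price', and its result was never assigned back to the item fields; the version above substitutes on the extracted text lists themselves.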
I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
import re
import urllib3
from bs4 import BeautifulSoup

list0_v2 = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
http = urllib3.PoolManager()
r = http.request('GET', url)
soup = BeautifulSoup(r.data, "html.parser")
# print(r.data)
loop = re.findall(r'<td>(.*?)</td>', str(r.data))
# print(str(loop))
newLoop = str(loop)
# print(newLoop)

for x in range(1229):
    if "\\n\\t\\t\\t\\t" in loop[x]:
        loop[x] = loop[x].replace("\\n\\t\\t\\t\\t", "")
        list0_v2.append(str(loop[x]))
        print(loop[x])

print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request

list0_v2 = []
final_list = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()

loop = re.findall(r'<td.*?>(.*?)</td>', str(data))

for item in loop:
    if "\\n\\t\\t\\t\\t" in item or "em>" in item:
        item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
            .replace("</em>", "")
    if " " == item:
        continue
    list0_v2.append(item)

n = 1
while len(list0_v2) != 0:
    form = {"n": 0, "name": "", "id": "", "gender": "", "birthdate": "", "notes": ""}
    try:
        if list0_v2[5][-1] == '.':
            numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
            form["notes"] = notes
            del(list0_v2[0:6])
        else:
            raise Exception('foo')
    except:
        numb, name, ids, gender, birthdate = list0_v2[0:5]
        del(list0_v2[0:5])
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)
    n += 1

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
        li["gender"], li["birthdate"], li["notes"]))