When I run the code, it gives me \r\n with spaces. I have tried to remove \r\n from the result, but it didn't work. This is the code. Please check it out.
def parse_subtitles(self, response):
    items = FetchingItem()

    Arabic_price = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
    Chinese_price = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()

    names_list = ['Arabic_price', 'Chinese_price']
    for names in names_list:
        result = [re.sub('\r\n\s+', ' ', text) for text in names]

    items['Arabic_price'] = Arabic_price
    items['Chinese_price'] = Chinese_price

    yield items
Not sure what you want exactly, but this code works:
def parse_subtitles(self, response):
    items = FetchingItem()

    results = {}
    results['Arabic_price'] = response.css('.row:nth-child(1) .item-container:nth-child(1) .rate::text').extract()
    results['Chinese_price'] = response.css('.row:nth-child(1) .item-container:nth-child(2) .rate::text').extract()

    names_list = ['Arabic_price', 'Chinese_price']
    for name in names_list:
        # collapse every run of whitespace (including \r\n) into a single space
        results[name] = [re.sub(r'[\r\n\s]+', ' ', text) for text in results[name]]

    items['Arabic_price'] = results['Arabic_price']
    items['Chinese_price'] = results['Chinese_price']

    yield items
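For reference, here is a minimal standalone sketch (with a made-up input string) of what that substitution does:

import re

raw = '\r\n        $12.99\r\n    '
cleaned = re.sub(r'[\r\n\s]+', ' ', raw)  # every whitespace run becomes one space
print(cleaned.strip())  # -> $12.99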
I am trying to web-scrape information using Selenium. The code works for a single item, but when I pass the list I get the output below.
Actual Output
Expected output
term=["Atta","Sugar"]

def get_link(term,page):
    for term in term:
        pin(Pincode)
        grocery="https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
        term = term.replace(' ', '+')
        stem = grocery.format(term)
        url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
        next=url_template+str(page)
        #print(next)
    return next

def PID():
    for page in range(1,5):
        path=get_link(term,page)
        driver.get(path)
        id=driver.find_elements_by_xpath('//div[@data-id]')
        for i in id:
            results=i.get_attribute('data-id')
            #print(results)
            PIDs.append(results)
            Search_Term.append(term)

PID()

ID={'Query':Search_Term,'PID_s':PIDs}
Output=pd.DataFrame(ID)
print(Output)
Maybe it would be better to put the for loop over term inside the PID function. Try it like below:
terms = ["Atta", "Sugar"]

def get_link(term, page):
    # Not sure what the pin(Pincode) line is doing
    grocery = "https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
    term = term.replace(' ', '+')
    #print(term)
    stem = grocery.format(term)
    url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
    next = url_template + str(page)
    # print(next)
    return next

def PID():
    for term in terms:
        for page in range(1, 5):
            path = get_link(term, page)
            driver.get(path)
            id = driver.find_elements_by_xpath('//div[@data-id]')
            for i in id:
                results = i.get_attribute('data-id')
                print(f"{term}:{results}")
                # PIDs.append(results)
                # Search_Term.append(term)

PID()
Atta:FLRFDPRFNGYJ95KD
Atta:FLRETEFHENWKNJQE
...
Sugar:SUGG4SFGSP6TCQ48
Sugar:SUGEUD25B6YCCNGM
...
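If the goal is still the DataFrame from the original code rather than printed lines, a minimal sketch (assuming driver is set up and pandas is imported as pd exactly as in the question; the rows list is new) would be to collect one record per match and build the frame at the end:

rows = []

def PID():
    for term in terms:
        for page in range(1, 5):
            driver.get(get_link(term, page))
            for el in driver.find_elements_by_xpath('//div[@data-id]'):
                # one row per product id found on the page
                rows.append({'Query': term, 'PID_s': el.get_attribute('data-id')})

PID()
Output = pd.DataFrame(rows)
print(Output)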
From the multi-line string below, I am attempting to extract the entire string to the right of the = sign after a match. However, only a portion of the string is extracted.
How can I rectify this problem? I am open to other search/extraction implementations as well.
import re
s = '''jaguar.vintage.aircards = 2
jaguar.vintage.hw.sdb.size = 512.1 GB
jaguar.vintage.hw.tm.firmware = SWI9X15C_05.05.16.02 r21040 carmd-fwbuild1 2014/03/17 23:49:48
jaguar.vintage.hw.tm.hardware = 1.0
jaguar.vintage.hw.tm.iccid = 8901260591783960689
jaguar.vintage.hw.tm.imei = 359225051166726
jaguar.vintage.hw.tm.imsi = 310260598396068
jaguar.vintage.hw.tm.model = MC7354
jaguar.vintage.hw.wifi1.mac = 00:30:1a:4e:06:7a
jaguar.vintage.hw.wifi2.mac = 00:30:1a:4e:06:79
jaguar.vintage.part = P34110-002
jaguar.vintage.product = P34101
jaguar.vintage.psoc = 0.1.16
jaguar.vintage.serial = 34110002T0021
jaguar.vintage.slavepsoc1 = 0.1.5
jaguar.vintage.sw.app.release = 4.0.0.41387-201902131138git367fbda8e
'''
# print(s)
# release = (s.split('jaguar.vintage.sw.app.release =')[1]).strip()
# print(release)
#part_number = jaguar.vintage.part = P34110-002
pnumsrch = r"jaguar.vintage.part =.*?(?=\w)(\w+)"
part_number = re.findall(pnumsrch, s)
print(part_number[0])
# release_number = jaguar.vintage.sw.app.release = 4.0.0.41387-201902131138git367fbda8e
relnumsrch = r"jaguar.vintage.sw.app.release =.*?(?=\w)(\w+)"
rel_number = re.findall(relnumsrch, s)
print(rel_number[0])
Actual:
P34110
4
Expected:
P34110-002
4.0.0.41387-201902131138git367fbda8e
Since . does not match a newline character by default, you can simply use .* to match the rest of the line:
pnumsrch = r"jaguar.vintage.part = (.*)"
and:
relnumsrch = r"jaguar.vintage.sw.app.release = (.*)"
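With the sample string from the question, those patterns return the full values (assuming s is defined as above):

part_number = re.findall(r"jaguar.vintage.part = (.*)", s)
print(part_number[0])  # P34110-002

rel_number = re.findall(r"jaguar.vintage.sw.app.release = (.*)", s)
print(rel_number[0])  # 4.0.0.41387-201902131138git367fbda8e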
Just catch everything that's not a newline. Demo:
pat = re.compile(r'jaguar\.vintage\.part = ([^\n]+)')
pat2 = re.compile(r'jaguar\.vintage\.sw\.app\.release = ([^\n]+)')
>>> pat.findall(s)
['P34110-002']
>>> pat2.findall(s)
['4.0.0.41387-201902131138git367fbda8e']
You should also escape the periods in your pattern.
As mentioned by @WiktorStribiżew, just . is good enough in place of the [^\n] portion:
pat = re.compile(r'jaguar\.vintage\.part = (.+)')
pat2 = re.compile(r'jaguar\.vintage\.sw\.app\.release = (.+)')
Is it possible to use a for loop to search through the text of tags that correspond to a certain phrase? I've been trying to create this loop but it hasn't been working. Any help is appreciated, thanks! Here is my code:
def parse_page(self, response):
    titles2 = response.xpath('//div[@id = "mainColumn"]/h1/text()').extract_first()
    year = response.xpath('//div[@id = "mainColumn"]/h1/span/text()').extract()[0].strip()
    aud = response.xpath('//div[@id="scorePanel"]/div[2]')
    a_score = aud.xpath('./div[1]/a/div/div[2]/div[1]/span/text()').extract()
    a_count = aud.xpath('./div[2]/div[2]/text()').extract()
    c_score = response.xpath('//a[@id = "tomato_meter_link"]/span/span[1]/text()').extract()[0].strip()
    c_count = response.xpath('//div[@id = "scoreStats"]/div[3]/span[2]/text()').extract()[0].strip()
    info = response.xpath('//div[@class="panel-body content_body"]/ul')
    mp_rating = info.xpath('./li[1]/div[2]/text()').extract()[0].strip()
    genre = info.xpath('./li[2]/div[2]/a/text()').extract_first()
    date = info.xpath('./li[5]/div[2]/time/text()').extract_first()
    box = response.xpath('//section[@class = "panel panel-rt panel-box "]/div')
    actor1 = box.xpath('./div/div[1]/div/a/span/text()').extract()
    actor2 = box.xpath('./div/div[2]/div/a/span/text()').extract()
    actor3 = box.xpath('./div/div[3]/div/a/span/text()').extract_first()

    for x in info.xpath('//li'):
        if info.xpath("./li[x]/div[1][contains(text(), 'Box Office: ')/text()]]
            box_office = info.xpath('./li[x]/div[2]/text()')
        else if info.xpath('./li[x]/div[1]/text()').extract[0] == "Runtime: "):
            runtime = info.xpath('./li[x]/div[2]/time/text()')
Your for loop is completely wrong:
1. You're using info. but searching from the root
for x in info.xpath('.//li'):
2. x is an HTML node element and you can use it this way:
if x.xpath("./div[1][contains(., 'Box Office: ')]"):
    box_office = x.xpath('./div[2]/text()').extract_first()
I think you might need re() or re_first() to match a certain phrase.
For example:
elif info.xpath('./li[x]/div[1]/text()').re_first('Runtime:') == "Runtime: ":
    runtime = info.xpath('./li[x]/div[2]/time/text()')
And you need to modify your for loop, because the variable x in it is actually a Selector, not a number, so it's not right to use it like this: li[x].
gangabass in the last answer made a good point on this.
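Putting both answers together, a corrected loop might look roughly like this (an untested sketch; the label strings and XPaths are taken from the question):

for li in info.xpath('.//li'):
    # text of the first <div>, e.g. "Box Office: " or "Runtime: "
    label = li.xpath('./div[1]/text()').extract_first(default='')
    if 'Box Office:' in label:
        box_office = li.xpath('./div[2]/text()').extract_first()
    elif 'Runtime:' in label:
        runtime = li.xpath('./div[2]/time/text()').extract_first()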
I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
import re
import urllib3
from bs4 import BeautifulSoup

list0_v2 = []

url = ("http://www.releasechimps.org/resources/publication/whos-there-md-anderson")
http = urllib3.PoolManager()
r = http.request('Get', url)
soup = BeautifulSoup(r.data, "html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>', str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)

for x in range(1229):
    if "\\n\\t\\t\\t\\t" in loop[x]:
        loop[x] = loop[x].replace("\\n\\t\\t\\t\\t", "")
    list0_v2.append(str(loop[x]))
    print(loop[x])

print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request

list0_v2 = []
final_list = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()

loop = re.findall(r'<td.*?>(.*?)</td>', str(data))

for item in loop:
    if "\\n\\t\\t\\t\\t" in item or "em>" in item:
        item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
               .replace("</em>", "")
    if " " == item:
        continue
    list0_v2.append(item)

n = 1
while len(list0_v2) != 0:
    form = {"n":0, "name":"", "id":"", "gender":"", "birthdate":"", "notes":""}
    try:
        if list0_v2[5][-1] == '.':
            numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
            form["notes"] = notes
            del(list0_v2[0:6])
        else:
            raise Exception('foo')
    except:
        numb, name, ids, gender, birthdate = list0_v2[0:5]
        del(list0_v2[0:5])
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)
    n += 1

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
          li["gender"], li["birthdate"], li["notes"]))
I created the following dictionary:
code_dictionary = {u'News; comment; negative': u'contradictory about news', u'News; comment': u'something about news'}
I now want to write some Python code that goes through the dictionary's keys and separates out the codes and their corresponding values. So for the first element in the dictionary, I want to end up with:
News: 'contradictory about news', 'something about news'
comment: 'contradictory about news', 'something about news'
negative: 'contradictory about news'
The end result can be a dictionary, list, or tab or comma-separated text.
You can see my attempt to do this here:
from bs4 import BeautifulSoup as Soup

f = open('transcript.xml','r')
soup = Soup(f)
#print soup.prettify()

#searches text for all w:commentrangestart tags and makes a dictionary that matches ids with text
textdict = {}
for i in soup.find_all('w:commentrangestart'):
    # variable 'key' is assigned to the tag id
    key = i.parent.contents[1].attrs['w:id']
    key = str(key)
    #variable 'value' is assigned to the tag's text
    value = ''.join(i.nextSibling.findAll(text=True))
    # key / value pairs are added to the dictionary 'textdict'
    textdict[key] = value

print "Transcript Text = ", textdict

# makes a dictionary that matches ids with codes
codedict = {}
for i in soup.find_all('w:comment'):
    key = i.attrs['w:id']
    key = str(key)
    value = ''.join(i.findAll(text=True))
    codedict[key] = value

print "Codes = ", codedict

# makes a dictionary that matches all codes with text
output = {}
for key in set(textdict.keys()).union(codedict.keys()):
    print "key= ", key
    txt = textdict[key]
    print "txt = ", txt
    ct = codedict[key]
    print "ct= ", ct
    output[ct] = txt
    #print "output = ", output

print "All code dictionary = ", output

#codelist = {}
#for key in output:
#    codelist = key.split(";")
#print "codelist= ", codelist

code_negative = {}
code_news = {}
print output.keys()
for i in output:
    if 'negative' in output.keys():
        print 'yay'
        code_negative[i] = textdict[i]
        print 'text coded negative: ', code_negative
    if 'News' in i:
        code_news[i] = textdict[i]
        print 'text coded News: ', code_news
For some reason, though, I keep getting a KeyError when I run the last block:
code_negative = {}
code_news = {}
for i in output:
    if 'negative' in output.keys():
        code_negative[i] = textdict[i]
        print 'text coded negative: ', code_negative
    if 'News' in i:
        code_news[i] = textdict[i]
        print 'text coded News: ', code_news
Any ideas? Thanks!
The following code should work, if I understood the problem correctly:
from collections import defaultdict

out = defaultdict(list)
for k, v in code_dictionary.viewitems():
    for item in k.split('; '):
        out[item].append(v)
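With the dictionary from the question (assuming it is bound to code_dictionary as above), out ends up matching the expected output:

print dict(out)
# {u'News': [u'contradictory about news', u'something about news'],
#  u'comment': [u'contradictory about news', u'something about news'],
#  u'negative': [u'contradictory about news']}
# (the order of the two values in each list depends on dict iteration order)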
output = {u'News; comment; negative': u'contradictory about news', u'News; comment': u'something about news'}

negatives = []
comments = []
news = []

for k, v in output.items():
    key_parts = k.split('; ')
    key_parts = [part.lower() for part in key_parts]
    if 'negative' in key_parts:
        negatives.append(v)
    if 'news' in key_parts:
        news.append(v)
    if 'comment' in key_parts:
        comments.append(v)
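Printing the three lists then gives (the order within news and comments depends on dict iteration order):

print negatives  # [u'contradictory about news']
print news       # [u'contradictory about news', u'something about news']
print comments   # [u'contradictory about news', u'something about news']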