您好,我有一个如下所示的 DataFrame。我需要匹配后面紧跟单位的数字,并且只返回其中的数字部分。
单位包括毫升(ml)、加仑、升(l)等。
输入:
text
1234567-CAR WA GK5 9x78x90 12L
3456789 TOP-L BD3 195x169x62 TopL
预期输出:
text extract Return
1234567-CAR WA GK5 9x78x90 12L 12L 12
3456789 TOP-L BD3 195x169x62 TopL - -
代码:
def names(header):
    """Return the first "<number><unit>" volume token found in ``header``.

    Millilitre quantities are looked for first; only when none is present is
    a litre quantity searched for. Matching ignores case, allows optional
    whitespace between the number and the unit, and requires at least one
    digit before the unit, so a stray "l" inside a word is never returned.

    Args:
        header: Free-text description to scan.

    Returns:
        The matched substring (for example ``"12L"`` or ``"500 ml"``), or
        ``None`` when ``header`` is not a string or contains no number
        followed by a recognised volume unit.
    """
    if not isinstance(header, str):
        # Non-text input (e.g. NaN/None) cannot contain a quantity.
        return None

    number = r'\d+(?:\.\d+)?'
    # Order matters: millilitres take precedence over litres.
    unit_patterns = (
        number + r'\s*ml\b',
        number + r'\s*l(?:t|it(?:er|re)s?)?\b',
    )
    for pattern in unit_patterns:
        found = re.search(pattern, header, re.IGNORECASE)
        if found:
            return found.group(0)
    return None
def measure(val):
    """Convert a quantity string such as ``"12L"`` or ``"500ml"`` to millilitres.

    The first number in ``val`` is used. The unit is detected
    case-insensitively: a value containing "ml" is already in millilitres,
    a value containing "l" is in litres and is multiplied by 1000.

    Args:
        val: Text holding a number and a volume unit.

    Returns:
        The quantity in millilitres as a ``float``, or ``None`` when ``val``
        is not a string, contains no number, or contains no known unit.
    """
    if not isinstance(val, str):
        return None

    number = re.search(r'\d+\.\d+|\d+', val)
    if number is None:
        # A unit letter without digits used to raise IndexError here.
        return None

    amount = float(number.group(0))
    lowered = val.lower()
    # Check "ml" before "l": every millilitre string also contains an "l".
    if 'ml' in lowered:
        return amount
    if 'l' in lowered:
        return amount * 1000
    return None
# Run the extraction once and reuse it rather than applying `names` twice.
extracted = df['text'].apply(names)
# Rows where nothing was extracted are dropped before conversion; concat
# realigns the shorter converted column on the index.
df_result = pd.concat(
    [df['A'], df['text'], df['B'], extracted, extracted.dropna().apply(measure)],
    axis=1,
)
# Error reported for the statement above:
---> 22 return float(re.findall('(\d+\.\d+|\d+)', val)[0])*1000
IndexError: list index out of range
发布于 2019-09-19 17:42:10
def measure(val):
    """Convert a quantity string that starts with a number to millilitres.

    ``val`` must begin (after optional whitespace) with a number followed by
    a millilitre or litre unit, e.g. ``"500ml"``, ``"12L"``, ``"2.5 liters"``.
    Matching ignores case.

    Args:
        val: Text holding the number and its unit.

    Returns:
        The quantity in millilitres as a ``float``, or ``None`` when ``val``
        is not a string or does not start with a number and a volume unit.
    """
    if not isinstance(val, str):
        return None

    # Group 1 is the number; group 2 is "m" for millilitre units, else empty.
    m = re.match(r'\s*(\d+(?:\.\d+)?)\s*(m?)[il]*l', val, re.IGNORECASE)
    if m:
        as_ml = 1 if m.group(2) else 1000
        return float(m.group(1)) * as_ml
    return None
# Answer posted on 2019-09-19 21:19:26
看看这对你是否有效
# Take the last whitespace-separated token of each row. The vectorised
# ``.str[-1]`` yields NaN for empty or missing text instead of raising.
df['extract'] = df['text'].str.split().str[-1]
# Keep the first run of digits from that token; rows without digits get '-'.
df['Return'] = df['extract'].str.extract(r'(\d+)', expand=False).fillna('-')
print(df)
text extract Return
0 1234567-CAR WA GK5 9x78x90 12L 12L 12
1 3456789 TOP-L BD3 195x169x62 TopL TopL -
https://stackoverflow.com/questions/58007676
复制相似问题