# 中文分词
def fen_ci(string, cut_all=False, append_tag=False, filter_set=None):
    """Segment ``string`` with jieba and return the resulting token list.

    Each token yielded by ``jieba.cut`` is handled as follows: tokens found
    in ``filter_set`` are dropped; tokens for which ``Chinese`` returns a
    true value are split into their individual characters; every other
    token is kept whole.

    Args:
        string: Text to segment.
        cut_all: Forwarded to ``jieba.cut`` as its ``cut_all`` argument.
        append_tag: When true, the result is wrapped in the literal marker
            tokens ``'start'`` and ``'end'``.
        filter_set: Collection of tokens to discard. ``None`` (the default)
            discards nothing.

    Returns:
        list: The kept tokens/characters in order of appearance.
    """
    # None sentinel instead of a shared mutable ``[]`` default.
    excluded = () if filter_set is None else filter_set

    seq = ['start'] if append_tag else []
    for ws in jieba.cut(string, cut_all=cut_all):
        if ws in excluded:
            continue
        if Chinese(ws):
            # Chinese tokens are emitted one character at a time.
            seq.extend(ws)
        else:
            seq.append(ws)
    if append_tag:
        seq.append('end')
    return seq
def Chinese(str):
    """Return True if ``str`` is non-empty and every character is Chinese.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FA5 (inclusive).

    Args:
        str: Text to test. The name shadows the builtin; it is kept so that
            existing keyword callers continue to work.

    Returns:
        bool: True only for a non-empty, all-Chinese string.
    """
    # Check each character separately. Comparing the whole string against
    # the range bounds is lexicographic, so the first character alone would
    # decide the result (e.g. '\u9fa5\u9fa5' would be rejected and a
    # Chinese character followed by Latin letters would be accepted).
    return bool(str) and all('\u4e00' <= ch <= '\u9fa5' for ch in str)
# 直接使用正则 (alternative: use a regular expression directly):
# Demo: extract every Chinese character (U+4E00..U+9FA5) from a mixed string.
ls = 'drug:磺胺123甲噁唑amount'
p = re.compile(r'[\u4e00-\u9fa5]')
print(p.findall(ls))





