import sys
import os
import json
import re
import numpy as np
def PraseRawdata(author = None,constrain = None,src='./chinese-poetry/json/simplified', category="poet.tang"):
def sentenceParse(para):
    """Strip annotations and stray characters from one paragraph of text.

    The following are removed, in this order:

    1. parenthesised spans, ``(...)`` or fullwidth ``（...）``;
    2. brace-delimited spans, ``{...}``;
    3. book-title spans, ``《...》``;
    4. literal square brackets (their contents are kept);
    5. ASCII digits and ``-``.

    Finally each doubled full stop ``。。`` is replaced by a single ``。``.

    Each span pattern is greedy: on a single line, everything from the first
    opening delimiter to the last closing delimiter is removed.

    Args:
        para: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # The parentheses must be matched literally. An unescaped ``(.*)`` is a
    # capture group around ``.*`` and would delete the entire line.
    # NOTE(review): fullwidth parentheses are accepted as well on the
    # assumption that the corpus uses them -- confirm against the data.
    res = re.sub(r'[(（].*[)）]', '', para)
    res = re.sub(r'\{.*\}', '', res)
    res = re.sub(r'《.*》', '', res)
    res = re.sub(r'[\[\]]', '', res)
    # Drop digits and dashes in one pass instead of rebuilding a set and
    # concatenating strings for every character.
    res = ''.join(ch for ch in res if ch not in '0123456789-')
    return re.sub(r'。。', '。', res)
def haddlejson(file):
rst =[]
data = json.loads(open(file).read())
for poetry in data:
pdata =""
if(author is not None and poetry.get("author")!= author):
return None
p = poetry.get("paragraphs")
flag = False
for s in p:
sp = re.split(u"[,!。]", s)
for tr in sp:
if constrain is not None and len(tr) != constra
一种经典的自然语言处理数据预处理方式
最新推荐文章于 2024-11-01 09:48:03 发布
该博客介绍了如何处理中文诗歌数据，包括解析 JSON 文件、进行文本清理、预处理序列，以及使用 Keras 的 `pad_sequences` 进行序列填充，以确保所有序列长度一致。此外，还涉及字典映射以及将数据保存为二进制文件。


1414

被折叠的 条评论
为什么被折叠?



