torchtext Usage Example


Constructing the Dataset

import os
import random

root = 'dataset/Long-document-dataset/csv/'
# Map each class directory name to an integer label.
cls_map = {n: i for i, n in enumerate(os.listdir(root))}
print(cls_map)

corpus = []
for P in os.listdir(root):
    p = root + P + '/' + P + '/'
    print(p)
    for fp in os.listdir(p):
        with open(p + fp) as f:
            text = ' '.join(f.readlines())
        corpus.append([text, [cls_map[P]]])
random.shuffle(corpus)
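
Each corpus entry pairs a raw document string with a one-element label list, which is the shape the Dataset class below expects. A quick sanity check (assuming the dataset directory exists as above):

print(len(corpus))
text, label = corpus[0]
print(label, text[:80])  # class label list and a preview of the document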
Constructing the Field, Dataset, and Iterator

import numpy as np
import torch
from torchtext import data
from torchtext.vocab import GloVe, Vectors
from torchtext.data import Example, BucketIterator, Iterator

tokenize = lambda x: x.split()
MAX_LEN = 100
# Lowercase during preprocessing; pad/truncate every example to fix_length tokens.
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=100)
LABEL = data.Field(sequential=False, use_vocab=False)
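
For intuition, Field.preprocess in the legacy torchtext API applies the tokenizer and lowercasing; padding to fix_length and numericalization happen later, when batches are built:

print(TEXT.preprocess('The Quick Brown Fox'))  # ['the', 'quick', 'brown', 'fox']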

class MyDataset(data.Dataset):
    def __init__(self, csv_data, text_field, label_field, test=False, aug=False, **kwargs):

        # csv_data = pd.read_csv(csv_path)  # actually a list: [[text, label], ...]
        fields = [("id", None), ("text", text_field), ("label", label_field)]

        examples = []
        for text in csv_data:
            # examples.append(data.Example.fromlist([None, text[0][:MAX_LEN], text[1] if not test else None], fields))
            # Keep only the first 1000 characters of each document.
            examples.append(data.Example.fromlist([None, text[0][:1000], text[1]], fields))

        # The above is preprocessing; calling the parent constructor via super()
        # produces a standard Dataset.
        # super(MyDataset, self).__init__(examples, fields, **kwargs)
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        # Randomly permute the token order of the sequence.
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)

    def dropout(self, text, p=0.5):
        # Randomly blank out a fraction p of the tokens.
        text = text.strip().split()
        len_ = len(text)
        indexs = np.random.choice(len_, int(len_ * p))
        for i in indexs:
            text[i] = ''
        return ' '.join(text)
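
Note that shuffle and dropout are defined but never called in __init__ (the aug flag is accepted and ignored), so they have no effect as written. To see what they would do, a quick standalone check might look like:

ds = MyDataset(corpus[:10], text_field=TEXT, label_field=LABEL)
print(ds.shuffle('the quick brown fox jumps'))         # tokens in random order
print(ds.dropout('the quick brown fox jumps', p=0.4))  # about 40% of tokens blanked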

def data_iter(TEXT, LABEL):

    train = MyDataset(corpus[:20000], text_field=TEXT, label_field=LABEL, test=False, aug=1)
    test = MyDataset(corpus[20000:], text_field=TEXT, label_field=LABEL, test=True, aug=1)  # label_field=None
    # Pass in the dataset used to build the vocabulary.
    # TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=200)
    vectors = Vectors(name='/home1/lihaoyuan/data/NLP/glove/glove.6B.100d.txt', cache='.vector_cache')
    TEXT.build_vocab(train, vectors=vectors)
    # TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))  # TEXT.build_vocab(train, vectors="glove.6B.200d")
    weight_matrix = TEXT.vocab.vectors
    print(weight_matrix.shape)  # (vocab_size, 100)

    # Build an iterator for the training set only:
    # train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)

    # Or build iterators for the training and validation sets together:
    # train_iter, val_iter = data.BucketIterator.splits(
    #     (train, valid),
    #     batch_sizes=(8, 8),
    #     # When using a GPU, replace -1 with the GPU's device index.
    #     device=-1,
    #     # Key used to sort examples (for length bucketing).
    #     sort_key=lambda x: len(x.text),
    #     sort_within_batch=False,
    #     repeat=False
    # )
    train_iter = Iterator(train, batch_size=64, device=torch.device('cuda:0'), sort=False, sort_within_batch=False, repeat=False)
    test_iter = Iterator(test, batch_size=64, device=torch.device('cuda:0'), sort=False, sort_within_batch=False, repeat=False)
    return train_iter, test_iter, weight_matrix

train_iter, test_iter, weight_matrix = data_iter(TEXT, LABEL)
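
Each batch yielded by the Iterator exposes the declared fields as attributes. Since the Field keeps its default batch_first=False, batch.text comes out as (fix_length, batch_size); batch.label should be (batch_size, 1) here because each label was stored as a one-element list, which is why the training loop below calls .view(-1). A quick inspection (assumes a CUDA device, matching the iterators above):

batch = next(iter(train_iter))
print(batch.text.shape)   # expected: torch.Size([100, 64])
print(batch.label.shape)  # expected: torch.Size([64, 1])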
Classification Model

import torch.nn as nn

class RNN(nn.Module):

    def __init__(self):
        super(RNN, self).__init__()
        # With batch_first=False (the Field default), input is (seq_len=100, batch),
        # so the embedding output has shape (100, b, 100).
        self.word_embeddings = nn.Embedding(len(TEXT.vocab), 100)
        self.word_embeddings.weight.data.copy_(weight_matrix)
        # Bidirectional LSTM: output shape is (100, b, 2*128) = (100, b, 256).
        self.lstm = nn.LSTM(input_size=100, hidden_size=128, bidirectional=True, num_layers=1)
        self.decoder = nn.Linear(256, 8)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out = self.lstm(embeds)[0]
        # final = lstm_out[-1]  # take the last time step
        final = lstm_out.mean(0)  # average over time steps
        y = self.decoder(final)
        return y
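
A shape check with a dummy batch confirms the (seq_len, batch) convention end to end; this is a minimal sketch that assumes data_iter has already run, so TEXT.vocab and weight_matrix exist:

model = RNN().cuda()
dummy = torch.randint(0, len(TEXT.vocab), (100, 64)).cuda()  # (seq_len, batch)
print(model(dummy).shape)  # expected: torch.Size([64, 8]), one logit per class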

import torch.optim as optim
import torch.nn.functional as F

acc_ = []
def main():
    model = RNN().cuda()
    model.train()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
    loss_function = F.cross_entropy

    for epoch in range(10):
        for i, batch in enumerate(train_iter):
            optimizer.zero_grad()
            predicted = model(batch.text)

            loss = loss_function(predicted, batch.label.view(-1))
            loss.backward()
            optimizer.step()

            acc = (predicted.argmax(1) == batch.label.view(-1)).sum().float() / batch.label.size(0)
            acc_.append(acc.item())

        # Note: acc_ accumulates across epochs, so this prints a running mean
        # over all batches seen so far, not a per-epoch accuracy.
        print('epoch:%d loss:%.3f acc:%.3f' % (epoch + 1, loss.item(), np.mean(acc_)))

    model.eval()
    acc_test = []
    with torch.no_grad():
        for i, batch in enumerate(test_iter):
            predicted = model(batch.text).argmax(1)
            acc = (predicted == batch.label.view(-1)).sum().float() / batch.label.size(0)
            acc_test.append(acc.item())
    print(np.mean(acc_test))

if __name__ == '__main__':
    main()
References:

Official documentation: https://torchtext.readthedocs.io/en/latest/vocab.html?highlight=vocab

https://blog.csdn.net/nlpuser/article/details/88067167

https://www.cnblogs.com/linzhenyu/p/13277552.html