import numpy as np
import pandas as pd
from collections import Counter
import os
import requests

# Shift a label down by one. The original note says this maps the labels
# into [0, 4] (which implies raw labels 1..5 -- TODO confirm).
def get_label(label):
    """Return ``label`` decreased by one."""
    return label - 1
# Re-base the target column with get_label, then keep only the first row for
# every distinct text and renumber the index from zero.
data["target"] = data["target"].apply(get_label)

data = data.drop_duplicates(subset='text', keep='first').reset_index(drop=True)

import re
def clear_character(sentence):
    """Clean ``sentence`` with three regex substitutions, then drop whitespace.

    The passes run in this order: ``pattern1``, the character-class filter
    compiled below, then ``pattern3``. Afterwards every whitespace character
    is removed.

    NOTE(review): ``pattern1`` and ``pattern3`` are not defined inside this
    function; they must exist at module level before it is called. Confirm
    where they are defined.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text, containing no whitespace.
    """
    # Removes every run of characters that are NOT whitespace, an ASCII
    # digit, a colon, or in the CJK range U+4E00..U+9FA5. The regex part is a
    # raw string so that "\s" is not an invalid str escape sequence.
    pattern2 = re.compile(r'[^\s1234567890::' + '\u4e00-\u9fa5]+')
    # Per the original note: strips English letters and digits.
    line1 = re.sub(pattern1, '', sentence)
    # Per the original note: strips emoji and other characters.
    line2 = re.sub(pattern2, '', line1)
    # Per the original note: strips leftover colons and other symbols.
    line3 = re.sub(pattern3, '', line2)
    # str.split() with no argument splits on any whitespace, so joining the
    # pieces removes all of it.
    new_sentence = ''.join(line3.split())
    return new_sentence


# 导入中文分词包jieba, 并用jieba对原始文本做分词
import jieba
from tqdm import tqdm
def comment_cut(content):
    """Segment one comment into a list of tokens with jieba.

    Leading and trailing whitespace is stripped before segmentation.
    """
    stripped = content.strip()
    return list(jieba.cut(stripped))
# Register the pandas ``progress_apply`` methods with tqdm; they do not exist
# on a Series until this has been called.
# NOTE(review): if tqdm.pandas() was already called earlier with custom
# options, this call replaces that registration with the defaults.
tqdm.pandas()
# Tokenize every row of the text column while showing a progress bar.
data['text'] = data['text'].progress_apply(comment_cut)
# Inspect the new data format here if needed.

# Load the stop-word list (a downloaded list saved with a .json extension).
# The file is read as plain text and split on newlines, giving one list
# entry per line.
with open("D:\\shujuji\\1\\stopwords.json", mode="r", encoding='utf-8') as f:
    stopWords = f.read().split("\n")
# Remove stop words
def rm_stop_word(wordList):
    """Return the tokens of ``wordList`` that are not stop words.

    Args:
        wordList: Iterable of tokens.

    Returns:
        A new list with every token found in the module-level ``stopWords``
        removed; the order of the remaining tokens is unchanged.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a scan over the whole stop-word list.
    stop_set = set(stopWords)
    return [word for word in wordList if word not in stop_set]
data['text'] = data['text'].progress_apply(rm_stop_word)
# Inspect the new data format here if needed.

# Find the low-frequency words: count every token over all rows, then keep
# the ones seen fewer than ``min_threshold`` times (the original note says
# the intended threshold is 10).
from collections import Counter
# Flatten all per-row token lists into one list. The inner loop that did this
# had no body in the original, which made the script unparsable.
list_set = [word for words in data['text'] for word in words]
words_count = Counter(list_set)
# NOTE(review): ``min_threshold`` is not defined in the visible code; it must
# be assigned before this line runs -- confirm.
my_dict = {k: v for k, v in words_count.items() if v < min_threshold}
# Tokens to drop, with their counts.
filteredA = Counter(my_dict)
# Remove low-frequency words
def rm_low_frequence_word(wordList):
    """Join the tokens of ``wordList`` that pass the low-frequency filter.

    A token is kept when it is not a key of the module-level ``filteredA``
    and is not a tab character. Every kept token is followed by one space,
    so a non-empty result ends with a trailing space.
    """
    kept = (
        word for word in wordList
        if word not in filteredA and word != '\t'
    )
    return ''.join(token + " " for token in kept)


# Replace each row's token list with its filtered, space-separated string.
data['text'] = data['text'].progress_apply(rm_low_frequence_word)

import collections
import os
import random
import  time
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
# Prefer GPU index 6 (the original hard-coded choice). Fall back to the first
# GPU when fewer than seven are visible, and to the CPU when CUDA is
# unavailable, so the script does not fail on smaller machines.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 6:
    device = torch.device("cuda:6")
else:
    device = torch.device("cuda:0")

# One token list per row: split the space-joined text back into words.
word_list = [str(text).split() for text in data["text"]]

from gensim.models.word2vec import Word2Vec
import time
# Train word2vec on the tokenized corpus and report the wall-clock time.
# NOTE(review): the ``size`` and ``iter`` keywords presumably target
# gensim < 4.0 -- confirm the installed version.
start = time.time()
model_w2v = Word2Vec(
    sentences=word_list,
    size=256,
    window=3,
    min_count=1,
    iter=5,
)
end = time.time()
print('花费时间:', end - start)

from sklearn.model_selection import train_test_split
# Two-stage split with a fixed seed: 20% of all rows become the validation
# set, then 20% of the remaining rows become the test set.
Temp_trin, valid_data = train_test_split(data, random_state=42, test_size=0.2)
train_data, test_data = train_test_split(Temp_trin, random_state=42, test_size=0.2)

import torch
import torchtext 
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset
torch.backends.cudnn.deterministic = True


def tokenize(x):
    """Split already-segmented text on whitespace."""
    return x.split()


# TEXT holds token sequences; LABEL holds ready-made integer class ids and
# therefore uses no vocabulary.
TEXT = Field(sequential=True, tokenize=tokenize)
LABEL = Field(sequential=False, dtype=torch.long, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]

class DataFrameDataset(data.Dataset):
    """torchtext dataset built from the rows of a pandas DataFrame.

    Each row contributes one example made from its ``text`` column and,
    unless the dataset is a test set, its ``target`` column.
    """

    def __init__(self, df, fields, is_test=False, **kwargs):
        """Create one example per row of ``df``.

        Args:
            df: Frame with a ``text`` column and, unless ``is_test`` is
                true, a ``target`` column.
            fields: List of ``(name, field)`` pairs matched positionally
                against ``[text, label]``.
            is_test: When true the label is ``None`` instead of being read
                from ``target``.
            **kwargs: Forwarded to the parent constructor.
        """
        examples = [
            data.Example.fromlist(
                [row.text, None if is_test else row.target], fields
            )
            for _, row in df.iterrows()
        ]
        super().__init__(examples, fields, **kwargs)

    # Takes only the example, so it must be a static method; otherwise an
    # instance would be passed as ``ex`` when called through a dataset.
    @staticmethod
    def sort_key(ex):
        """Return the length of the example's text, used for ordering."""
        return len(ex.text)

    # Called on the class itself (``DataFrameDataset.splits(fields, ...)``),
    # so it must be a class method for ``cls`` to bind correctly.
    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build datasets for the given frames.

        Args:
            fields: Field list passed to every dataset.
            train_df: Training frame, or ``None`` to skip it.
            val_df: Validation frame, or ``None`` to skip it.
            test_df: Test frame, or ``None`` to skip it. The test dataset is
                built with ``is_test=True``, so its labels are ``None``.
            **kwargs: Forwarded to each dataset constructor.

        Returns:
            A tuple of the datasets that were built, in train, validation,
            test order; frames given as ``None`` are omitted.
        """
        train_data, val_data, test_data = (None, None, None)

        # Each frame is copied before the dataset is built from it.
        if train_df is not None:
            train_data = cls(train_df.copy(), fields, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), fields, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), fields, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
# Wrap the three DataFrame splits as torchtext datasets. Despite the ``_df``
# suffix, the results are DataFrameDataset objects.
train_df, val_df, test_df = DataFrameDataset.splits(
    fields, train_df=train_data, val_df=valid_data, test_df=test_data
)

# Build the vocabulary from the training split. ``TEXT.vocab`` does not exist
# until ``build_vocab`` has run, and the original never called it.
# NOTE(review): any options the original call used (e.g. max_size, min_freq)
# are unknown -- confirm.
TEXT.build_vocab(train_df)
print(TEXT.vocab.itos[:10])  # peek at the first entries of the TEXT vocabulary

import numpy as np
# Map every word2vec token to its vector, then fill one 256-wide row per
# vocabulary index. Tokens unknown to word2vec keep an all-zero row.
embedding_dic = dict(zip(model_w2v.wv.index2word, model_w2v.wv.syn0))
embedding_matrix = np.zeros((len(TEXT.vocab), 256))
for w, i in TEXT.vocab.stoi.items():
    embedding_vec = embedding_dic.get(w)
    if embedding_vec is None:
        continue
    embedding_matrix[i] = embedding_vec

from torchtext.legacy.data import Iterator, BucketIterator
train_batch_size = 64
val_batch_size = 64
test_batch_size = 64
# Build the training and validation iterators together.
# NOTE(review): both calls below were left unclosed in the original; the
# ``device`` argument and every argument of the test iterator are a minimal
# completion -- confirm against the intended settings.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_df, val_df),
    batch_sizes=(train_batch_size, val_batch_size),
    sort_key=lambda x: len(x.text),
    device=device,  # batches must live on the same device as the model
)
# Build the test iterator: evaluation mode, original example order.
test_iterator = Iterator(
    test_df,
    batch_size=test_batch_size,
    device=device,
    train=False,
    sort=False,
)

import torch.nn as nn
import torch.nn.functional as F
class LSTMmodel(nn.Module):
    """Bidirectional two-layer LSTM classifier with dot-product attention.

    NOTE(review): the original ``__init__`` did not call
    ``super().__init__()`` and created none of the layers that ``forward``
    uses. The embedding, LSTM and linear layers below are reconstructed from
    the shapes described in the original comments (4 = 2 layers x 2
    directions, ``hidden_dim * 2`` features) -- confirm against the intended
    architecture. Attribute names ``embedding``/``lstm`` are chosen here.
    """

    def __init__(self, embedding_size, hidden_size, output_size, vocab_size=None):
        """Create the layers.

        Args:
            embedding_size: Width of each token embedding.
            hidden_size: Hidden width of each LSTM direction.
            output_size: Number of output classes.
            vocab_size: Number of embedding rows. Defaults to the size of
                the module-level ``TEXT`` vocabulary.
        """
        # Must run before any sub-module is assigned to ``self``.
        super().__init__()
        if vocab_size is None:
            vocab_size = len(TEXT.vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            embedding_size, hidden_size, num_layers=2, bidirectional=True
        )
        # Both directions are concatenated, hence ``hidden_size * 2``.
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(0.5)

    def attention_net(self, x, query, mask=None):
        """Scaled dot-product attention of ``query`` over ``x``.

        Args:
            x: Tensor of shape [batch, seq_len, features].
            query: Tensor of shape [batch, seq_len, features].
            mask: Accepted but currently unused.

        Returns:
            ``(context, alpha_n)``: the context of shape [batch, features]
            and the attention weights of shape [batch, seq_len, seq_len].
        """
        d_k = query.size(-1)  # feature width of the query
        # Scores [batch, seq_len, seq_len], scaled by sqrt(d_k).
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)
        # Normalize the scores over the last dimension.
        alpha_n = F.softmax(scores, dim=-1)
        # Weight x by the attention, then sum over the sequence dimension:
        # [batch, seq_len, features] -> [batch, features].
        context = torch.matmul(alpha_n, x).sum(1)
        return context, alpha_n

    def forward(self, text):
        """Return class scores of shape [batch, output_size].

        Args:
            text: Token-index tensor of shape [seq_len, batch] (the LSTM is
                not batch-first, and the output is permuted below).
        """
        embedded = self.embedding(text)  # [seq_len, batch, embedding_size]
        output, _ = self.lstm(embedded)  # [seq_len, batch, hidden_size * 2]
        output = output.permute(1, 0, 2)  # [batch, seq_len, hidden_size * 2]
        query = self.dropout(output)
        # Attention pools the sequence into one vector per example.
        attn_output, _ = self.attention_net(output, query)
        return self.fc(attn_output)

def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: Module called as ``model(batch.text)``.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        optimizer: Optimizer updating the parameters of ``model``.
        criterion: Loss called as ``criterion(predictions, batch.label)``.
            Assumed to return the batch mean, as the default
            ``nn.CrossEntropyLoss`` does.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in the epoch.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    total_loss = 0.0
    correct = 0
    total_len = 0
    model.train()  # training mode
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients left from the previous step
        predictions = model(batch.text)
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()
        batch_len = len(batch.label)
        # The loss is a per-batch mean; weight it by the batch size so that
        # dividing by the sample count below yields a true per-sample mean.
        total_loss += loss.item() * batch_len
        correct += (predictions.argmax(dim=1) == batch.label).sum().item()
        total_len += batch_len
    return total_loss / total_len, correct / total_len

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without updating it.

    Args:
        model: Module called as ``model(batch.text)``.
        iterator: Iterable of batches exposing ``text`` and ``label``.
        criterion: Loss called as ``criterion(predictions, batch.label)``.
            Assumed to return the batch mean, as the default
            ``nn.CrossEntropyLoss`` does.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    total_loss = 0.0
    correct = 0
    total_len = 0
    # Switch to evaluation mode so layers such as dropout are disabled; the
    # original never did this, so validation ran with dropout active.
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)
            loss = criterion(predictions, batch.label)
            batch_len = len(batch.label)
            # Weight the per-batch mean by the batch size so the division
            # below yields a true per-sample mean.
            total_loss += loss.item() * batch_len
            correct += (predictions.argmax(dim=1) == batch.label).sum().item()
            total_len += batch_len
    model.train()  # switch back to training mode
    return total_loss / total_len, correct / total_len


# Build the classifier, then move it onto the selected device.
model = LSTMmodel(
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    output_size=OUTPUT_SIZE,
)
model = model.to(device)

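# torch.from_numpy: converts a NumPy ndarray into a tensor.
# NOTE(review): no conversion code follows this comment; the step that loads
# embedding_matrix into the model appears to be missing.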

def count_parameters(model):
    """Return the total element count of the trainable parameters of ``model``.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
print(f'The model has {count_parameters(model):,} trainable parameters')

import torch.optim as optim
# Adam with its default settings as the optimizer.
optimizer = optim.Adam(model.parameters())
# Cross-entropy loss, moved onto the selected device.
criterion = nn.CrossEntropyLoss().to(device)
# Move the model onto the selected device as well.
model = model.to(device)

import time 
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Both parts are truncated to integers; returns ``(minutes, seconds)``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

import math
# Train for N_EPOCHS epochs, saving the weights whenever the validation loss
# reaches a new minimum.
best_valid_loss = float('inf')  # any real loss is lower than this
for epoch in tqdm(range(N_EPOCHS), desc='Processing'):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:  # keep the best model seen so far
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Best-Checkpoint.pt')
    # tqdm.write prints the same text as print() but without breaking the
    # progress bar that wraps this loop.
    tqdm.write(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    tqdm.write(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    tqdm.write(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
