Runtime environment: PyCharm 2022.1
The main steps of the modeling process:
data preparation, data preprocessing, feature engineering, model building and evaluation, and model optimization
Data preparation:
The dataset provided by the instructor.


Data preprocessing
1. Split the dataset
The instructor provided only a single dataset, so we need to split it into a training set and a validation set ourselves.
import numpy as np
import torch
from torch.utils.data import random_split

# Arguments: dataset, validation ratio, random seed
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    # Split into a training set and a validation set with a fixed random seed
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    # Return both subsets as numpy arrays
    return np.array(train_set), np.array(valid_set)
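As a rough usage sketch (not shown in the original post): the file names covid.train.csv / covid.test.csv and the pandas loading step are assumptions about how the instructor's dataset might be read, and the ratio and seed simply mirror the config values shown later.

import pandas as pd

# Hypothetical file names for the instructor's dataset
train_data = pd.read_csv('./covid.train.csv').values
test_data = pd.read_csv('./covid.test.csv').values

# 20% of the rows become the validation set; a fixed seed keeps the split reproducible
train_data, valid_data = train_valid_split(train_data, valid_ratio=0.2, seed=5201314)
print(f'train size: {train_data.shape}, valid size: {valid_data.shape}, test size: {test_data.shape}')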
2. Extract the targets and features

def select_feat(train_data, valid_data, test_data, select_all=True):
    '''Selects useful features to perform regression'''
    # The last column of the training/validation data is the target
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    # The remaining columns are the features; the test set has no target, so all of its columns are features
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data
    if select_all:
        # Use every feature column
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # Otherwise use only the first five columns (indices 0-4)
        feat_idx = [0, 1, 2, 3, 4]  # TODO: Select suitable feature columns.
    # Return the feature matrices of the training/validation/test sets and the targets of the training/validation sets
    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid
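Continuing the same sketch (still using the assumed variable names from above), the split arrays are then passed through select_feat:

# Use every feature column; set select_all=False to try the hand-picked indices instead
x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, select_all=True)
print(f'number of features: {x_train.shape[1]}')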
3. Remove outliers

The features and targets are wrapped into a PyTorch Dataset so they can be fed to a DataLoader:

from torch.utils.data import Dataset

class COVID19Dataset(Dataset):
    '''
    x: Features.
    y: Targets; if None, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)
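A minimal sketch of wrapping these arrays into COVID19Dataset objects and DataLoaders; the shuffle and pin_memory choices are assumptions, and the batch size of 64 just matches the config shown later:

from torch.utils.data import DataLoader

train_dataset = COVID19Dataset(x_train, y_train)
valid_dataset = COVID19Dataset(x_valid, y_valid)
test_dataset = COVID19Dataset(x_test)  # no targets, so __getitem__ returns features only

# Shuffle the training batches every epoch; keep the test order fixed
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, pin_memory=True)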
4. Feature selection

Not every feature is useful for predicting the target, so to cut down the amount of computation we keep the 32 features that correlate most strongly with the target.

from sklearn.feature_selection import SelectKBest, f_regression

# Keep the 32 best features
def selected_best(x, y=None):
    if y is None:
        return x
    # Score each feature by the F-statistic between it and the target (regression setting)
    select = SelectKBest(score_func=f_regression, k=32)
    z = select.fit_transform(x, y)
    # Get the names of the selected features
    features = select.get_feature_names_out()
    print('Selected feature names:')
    # print(select.get_feature_names_out())
    print(z.shape)
    return z, features
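The post does not show how the same 32 columns are applied to the validation and test sets. One possible way (an assumption, not the author's code) is to recover the column indices from the returned names; when SelectKBest is fit on a plain numpy array, scikit-learn generates names of the form 'x12':

# Fit the selector on the training data only, then reuse the same columns elsewhere
x_train_sel, feat_names = selected_best(x_train, y_train)
feat_idx = [int(name[1:]) for name in feat_names]  # 'x12' -> 12
x_valid_sel = x_valid[:, feat_idx]
x_test_sel = x_test[:, feat_idx]

Fitting the selector only on the training data avoids leaking information from the validation set into the feature choice.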
Model building and evaluation

1. A five-layer neural network

import torch.nn as nn

class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions.
        # nn.Sequential chains the layers into a single module
        self.layers = nn.Sequential(
            # Five linear layers with four ReLU activations in between
            # (a five-layer perceptron)
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1)
        )

    # forward() runs the forward pass on the given tensor
    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1)  # (B, 1) -> (B): drop the size-1 dimension
        return x
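A quick sanity check of the model's input and output shapes (the feature count of 88 is only an illustrative number, not taken from the post):

model = My_Model(input_dim=88)
dummy_batch = torch.randn(16, 88)   # a fake batch of 16 samples
print(model(dummy_batch).shape)     # torch.Size([16]): one scalar prediction per sample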
2. Training parameters

# Use the GPU if one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Training hyper-parameters
config = {
    'seed': 5201314,       # Your seed number, you can pick your lucky number. :)
    'select_all': True,    # Whether to use all features.
    'valid_ratio': 0.2,    # validation_size = train_size * valid_ratio
    'n_epochs': 3000,      # Number of epochs.
    'batch_size': 64,
    'learning_rate': 1e-5,
    'early_stop': 400,     # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt'  # Your model will be saved here.
}
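The post never shows where config['seed'] is consumed. A common companion helper (an assumption here, not the author's code) fixes every source of randomness so the split and training are reproducible:

# Hypothetical helper: fix all random seeds for reproducibility
def same_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

same_seed(config['seed'])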
3. Training function

import math
import os
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

# trainer arguments: training loader, validation loader, model, config, device
def trainer(train_loader, valid_loader, model, config, device):
    # Define the loss function
    criterion = nn.MSELoss(reduction='mean')  # Define your loss function, do not modify this.
    # Define your optimization algorithm.
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by yourself).
    # Optimizer: SGD with momentum to speed up convergence; it takes the model parameters and the
    # learning rate (weight decay could be passed here for L2 regularization)
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)
    # Instantiate a SummaryWriter for logging
    writer = SummaryWriter()  # Writer of tensorboard.
    # Create the models directory if the project does not have one yet
    if not os.path.isdir('./models'):
        os.mkdir('./models')  # Create directory of saving models.
    # Initialize the training state
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    # Start training
    for epoch in range(n_epochs):
        model.train()  # Set your model to train mode.
        # Collect the training losses of this epoch
        loss_record = []
        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for x, y in train_pbar:
            optimizer.zero_grad()  # Set gradient to zero.
            # Move the batch to the GPU or CPU
            x, y = x.to(device), y.to(device)  # Move your data to device.
            # Forward pass
            pred = model(x)
            # Loss between predictions and targets
            loss = criterion(pred, y)
            loss.backward()  # Compute gradient (backpropagation).
            optimizer.step()  # Update parameters.
            step += 1
            # Record this batch's loss
            loss_record.append(loss.detach().item())
            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})
        # Average training loss of this epoch
        mean_train_loss = sum(loss_record) / len(loss_record)
        # Log it to the Loss/train chart
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval()  # Set your model to evaluation mode.
        # Collect the validation losses
        loss_record = []
        # Validation loop: same flow as training, but without parameter updates
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            # torch.no_grad() disables gradient tracking, saving memory and time during validation
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)
            loss_record.append(loss.item())
        mean_valid_loss = sum(loss_record) / len(loss_record)
        print(f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
        # If the mean validation loss improves, update the best loss, save the best model, and reset
        # the early-stop counter; otherwise increase the counter
        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])  # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1
        # If the validation loss has not improved for 'early_stop' (400) consecutive epochs, stop training
        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return
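The call site of trainer is not shown in the post; a minimal sketch, assuming the model and loaders built earlier:

# Build the model on the chosen device and run the training loop
model = My_Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)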
Testing

The testing process (predict and save_pred are defined right below):

import csv

# Rebuild the model, load the best checkpoint, predict on the test set, and write the results out
model = My_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device)
save_pred(preds, 'pred.csv')

# Prediction function: test loader, model, device
def predict(test_loader, model, device):
    # model = My_Model(input_dim=117).to(device)
    model.eval()  # Set your model to evaluation mode.
    preds = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            x = x.to(device)
            pred = model(x)
            preds.append(pred.detach().cpu())
    preds = torch.cat(preds, dim=0).numpy()
    return preds

# Write the predictions to a CSV file
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

Test results


