用 PyTorch 实现 Titanic 数据预测

用最近流行的 PyTorch 把 Titanic 生存预测实现一遍。

import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Columns the preprocessing step keeps from the raw table; the label column
# ('Survived') is deliberately last.
OLD_INDEX = [
    'Pclass', 'Sex', 'Age', 'UknAge', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Survived',
]

# Column layout after one-hot encoding: three numeric columns, then one
# indicator column per listed category value, then the label column last.
NEW_INDEX = (
    ['Age', 'UknAge', 'Fare']
    + ['Pclass_%d' % k for k in (0, 1, 2)]
    + ['Sex_%d' % k for k in (0, 1)]
    + ['SibSp_%d' % k for k in (0, 1, 2, 3, 4, 5, 8)]
    + ['Parch_%d' % k for k in (0, 1, 2, 3, 4, 5, 6, 9)]
    + ['Embarked_%d' % k for k in (0, 1, 2)]
    + ['Survived']
)

# Integer codes substituted for the raw string categories.
MAP_Sex = dict(male=0, female=1)
MAP_Embarked = dict(C=0, Q=1, S=2)

# Two-class one-hot table.
ONE_HOT = [[1, 0], [0, 1]]

# Number of feature columns; equals len(NEW_INDEX) - 1 (label excluded).
FEATURES = 26

# Prefix prepended to every data file name ("" means current directory).
PATH = ""


# Data preprocessing
def preprocess(data, flag):
    """Clean, one-hot encode and scale a raw passenger table into tensors.

    Steps: keep the ``OLD_INDEX`` columns, fill missing values, one-hot
    encode the categorical columns, reorder to ``NEW_INDEX`` and divide each
    column by its own maximum.

    NOTE: the per-column maxima are taken from the frame passed in, so two
    different frames (e.g. train and test files) are scaled independently.

    Args:
        data: Raw table (typically the result of ``pd.read_csv``). Columns
            named in ``OLD_INDEX`` that are absent are created empty.
        flag: ``"train"`` to get features and labels, ``"test"`` to get
            features only.

    Returns:
        For ``"train"``: ``(x, y)`` float32 tensors with
        ``len(NEW_INDEX) - 1`` feature columns and one label column.
        For ``"test"``: only the feature tensor ``x``.

    Raises:
        ValueError: If ``flag`` is neither ``"train"`` nor ``"test"``.
    """
    if flag not in ("train", "test"):
        raise ValueError(
            "flag must be 'train' or 'test', got {!r}".format(flag))

    # --- Data cleaning ---------------------------------------------------
    data = pd.DataFrame(data, columns=OLD_INDEX)
    # Remember which ages were missing before they are overwritten with 0,
    # so the model still sees "age unknown" through the UknAge indicator.
    age_missing = data['Age'].isnull()
    data = data.fillna({
        'UknAge': 0,
        'Survived': 0,
        'Age': 0,
        'Fare': 14.4,
        'Embarked': 'C',
    })
    data.loc[age_missing, 'UknAge'] = 1

    # --- One-hot encoding ------------------------------------------------
    data['Pclass'] -= 1  # shift so the dummy columns are named Pclass_0..2
    data['Sex'] = data['Sex'].map(MAP_Sex)
    data['Embarked'] = data['Embarked'].map(MAP_Embarked)
    # Ask for a float dtype explicitly: pandas >= 2.0 emits bool indicator
    # columns by default, and dividing a bool column by its (bool) maximum
    # raises NotImplementedError in pandas.
    data = pd.get_dummies(
        data,
        columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'],
        dtype=np.float32,
    )
    # Fix the column order; categories absent from this frame become
    # all-zero columns, categories not listed in NEW_INDEX are dropped.
    data = pd.DataFrame(data, columns=NEW_INDEX)
    data = data.fillna(0)

    # --- Normalization ---------------------------------------------------
    # Divide every column by its maximum; columns whose maximum is not
    # positive are left untouched (divisor 1).
    maxima = data.max()
    data = data / maxima.where(maxima > 0, 1.0)

    # --- To tensors ------------------------------------------------------
    values = np.array(data, dtype=np.float32)
    x_data = torch.from_numpy(values[:, :-1])
    if flag == "train":
        # Keep the label as a column vector of shape (n, 1).
        y_data = torch.from_numpy(values[:, [-1]])
        return x_data, y_data
    return x_data


# Prepare the data
class TitanicDataSets(Dataset):
    """In-memory split of the labelled ``train.csv`` file.

    The file is read and preprocessed once, then cut at row ``split``:
    rows before it form the ``"train"`` split and the remaining rows form
    the ``"test"`` (hold-out) split.

    Args:
        filepath: Prefix prepended to ``"train.csv"`` (plain string
            concatenation, so include a trailing separator if needed).
        flag: ``"train"`` or ``"test"``, selecting which split to expose.
        split: Row index where the hold-out split starts. Defaults to 800,
            the boundary that was previously hard-coded.

    Raises:
        ValueError: If ``flag`` is neither ``"train"`` nor ``"test"``.
    """

    def __init__(self, filepath, flag, split=800):
        # Fail early: an unknown flag used to leave the instance without
        # x_data / y_data / len and only failed later on first use.
        if flag not in ("train", "test"):
            raise ValueError(
                "flag must be 'train' or 'test', got {!r}".format(flag))

        # Both splits come from the labelled file, so it is always
        # preprocessed in "train" mode to obtain labels.
        features, labels = preprocess(
            pd.read_csv(filepath + "train.csv"), flag="train")

        rows = slice(None, split) if flag == "train" else slice(split, None)
        self.x_data = features[rows]
        self.y_data = labels[rows]
        self.len = self.x_data.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in this split."""
        return self.len


# Training split: served in shuffled mini-batches of 32, loaded in-process.
train_dataset = TitanicDataSets(PATH, "train")
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)

# Hold-out split: same batch size, original row order preserved.
test_dataset = TitanicDataSets(PATH, "test")
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
)


# Build the model
class TitanicModel(torch.nn.Module):
    """Small fully connected classifier producing two class logits.

    Layer widths are ``in_features -> 13 -> 6 -> 4 -> 2`` with ReLU after
    every layer except the last.

    Args:
        in_features: Width of each input row. Defaults to 26, the value
            that was previously hard-coded.
    """

    def __init__(self, in_features=26):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, 13)
        self.linear2 = torch.nn.Linear(13, 6)
        self.linear3 = torch.nn.Linear(6, 4)
        self.linear4 = torch.nn.Linear(4, 2)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 2)`` logits."""
        for hidden in (self.linear1, self.linear2, self.linear3):
            x = F.relu(hidden(x))
        # No activation on the output layer: raw logits are returned.
        return self.linear4(x)

# Single network instance used by the rest of the script.
titanic_model = TitanicModel()


# Loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    titanic_model.parameters(),
    lr=0.01,
    momentum=0.56,
)


# Training loop
def train(epochs):
    """Run ``epochs`` full passes of mini-batch training.

    Operates on the module-level ``train_loader``, ``titanic_model``,
    ``criterion`` and ``optimizer``; the model is updated in place.

    Args:
        epochs: Number of passes over ``train_loader``.
    """
    for _ in range(epochs):
        for inputs, labels in train_loader:
            # Forward pass
            y_predict = titanic_model(inputs)
            # The loss is given integer class indices of shape (batch,);
            # take the first (only) label column as a tensor slice rather
            # than rebuilding it element by element through Python lists.
            target = labels[:, 0].long()
            loss = criterion(y_predict, target)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            # Parameter update
            optimizer.step()


def test():
    """Measure classification accuracy on ``test_loader``.

    Uses the module-level ``titanic_model`` with gradients disabled, prints
    the accuracy and returns it.

    Returns:
        Accuracy as a percentage in ``[0, 100]``; ``0.0`` when the loader
        yields no samples.
    """
    total = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # First (only) label column as integer class indices, so the
            # comparison below is integer-to-integer.
            target = labels[:, 0].long()
            outputs = titanic_model(inputs)
            # Predicted class = index of the largest logit in each row.
            _, predicted = torch.max(outputs, dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    # Guard against an empty loader instead of raising ZeroDivisionError.
    acc = 100 * correct / total if total else 0.0
    print("acc:{}%".format(acc))
    return acc

# Alternate training and evaluation: 20 rounds of 10 epochs each, recording
# the hold-out accuracy after every round.
ACC = []
EPOCH = []
for i in range(20):
    train(epochs=10)
    acc = test()
    EPOCH.append(i)
    ACC.append(acc)

# Plot accuracy per round.
plt.plot(EPOCH, ACC, ls="-.", lw=2, c="c", label="plot figure")
plt.xlabel('num of train')
# The plotted values are the accuracies returned by test(), not losses,
# so label the axis accordingly.
plt.ylabel('accuracy (%)')
plt.grid()  # grid lines
plt.show()


# Write the submission file
test_frame = pd.read_csv(PATH + "test.csv")
out_data = preprocess(test_frame, flag="test")

with torch.no_grad():
    outputs = titanic_model(out_data)
    # Predicted class = index of the largest logit in each row.
    _, predicted = torch.max(outputs, dim=1)

# Take the ids from the file when it provides them, so the id column always
# matches the number and order of predictions; otherwise fall back to the
# previously hard-coded numbering that starts at 892.
if 'PassengerId' in test_frame.columns:
    passenger_ids = test_frame['PassengerId'].tolist()
else:
    passenger_ids = list(range(892, 892 + len(predicted)))

submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predicted.tolist(),
})
submission.to_csv(PATH + "submission.csv", index=False)

推荐阅读更多精彩内容