import pandas as pd
import numpy as np
# Titanic passenger survival data
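'''
'PassengerId'  passenger id
'Survived'     label value (survival indicator)
'Pclass'       ticket / cabin class
'Name'         passenger name
'Sex'          sex
'Age'          age
'SibSp'        number of siblings / spouses aboard
'Parch'        number of parents / children aboard
'Ticket'       ticket number
'Fare'         ticket fare
'Cabin'        cabin number
'Embarked'     port of embarkation
'''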
# Optional display tweaks, left disabled by the original author:
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
pd.set_option("display.width", 1000)

# Load the training data and take a first look at it.
titanic_survival = pd.read_csv("titanic_train.csv")
print(titanic_survival.columns)
print(titanic_survival.head())

# pandas represents a missing value as NaN ("not a number").  isnull() maps a
# Series to a boolean Series that is True wherever the value is missing.

# Count the passengers whose age is missing.
age = titanic_survival["Age"]
print(age)
age_is_null = age.isnull()  # True: missing, False: present
print(age_is_null)
age_null_true = age[age_is_null]
print(len(age_null_true))  # number of missing ages (original author's note: 177)

# Naive mean: the built-in sum() propagates NaN (original author's note: nan).
mean_age = sum(age) / len(age)
print(mean_age)

# Mean over the rows that actually have an age.
good_ages = age[~age_is_null]
print(sum(good_ages) / len(good_ages))  # original author's note: 29.6991176471

# Series.mean() skips missing values on its own.
correct_mean_age = age.mean()
print(correct_mean_age)  # original author's note: 29.69911764705882
# Average fare for each passenger class, computed with an explicit loop.
passenger_classes = [1, 2, 3]
fares_by_class = dict()
for this_class in passenger_classes:
    # Keep only the rows whose Pclass equals the class being processed.
    pclass_rows = titanic_survival.loc[titanic_survival["Pclass"] == this_class]
    fare_for_class = pclass_rows["Fare"].mean()
    fares_by_class.update({this_class: fare_for_class})
print(fares_by_class)
# pivot_table: grouped summary statistics.
#   index   - column whose values define the groups
#   values  - column(s) the aggregation is applied to
#   aggfunc - aggregation to run.  It is passed by name ("mean", "sum")
#             because pandas >= 2.1 emits a FutureWarning when handed the
#             NumPy callables np.mean / np.sum.
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(passenger_Fare)
# Same table again: the mean is pivot_table's default aggregation.
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare")
print(passenger_Fare)
# Group by Embarked and sum the Fare and Survived columns.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
print(titanic_survival)
# The calls below print cleaned copies; their results are not assigned back
# to titanic_survival.
print(titanic_survival.fillna(value=0))  # every missing value replaced by 0
print(titanic_survival.dropna(axis="columns"))  # drop columns containing a missing value
print(titanic_survival.dropna(axis="index"))  # drop rows containing a missing value
# Drop only the rows where "Age" or "Sex" is missing.
print(titanic_survival.dropna(subset=["Age", "Sex"], axis="index"))
# .loc[row_label, column_name] selects by index label and column name.
print(titanic_survival.loc[83])
row_index_83_age = titanic_survival.loc[83, "Age"]
print(row_index_83_age)

print(titanic_survival.loc[766])
# NOTE: despite the "1000" in its name, this variable holds the Pclass of the
# row labelled 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_1000_pclass)
# Sort by age (ascending is the default order).
new_titanic_survival = titanic_survival.sort_values(by="Age")
print(new_titanic_survival)
# .loc slices by index label, so on the sorted frame this runs from the first
# row down to the row labelled 10 rather than taking a fixed number of rows.
print(new_titanic_survival.loc[:10])
# Rebuild the index after sorting; drop=True discards the old labels instead
# of keeping them as a column.
itanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(itanic_reindexed.loc[:10])
# Return the hundredth item of a series.
def hundredth_row(column):
    """Return the hundredth item (position 99) of ``column``.

    The lookup is positional (``.iloc``), so the result is the hundredth item
    even when the index labels are not 0, 1, 2, ... (for example after
    sorting or filtering).  With a default integer index this is the same
    item that the label lookup ``.loc[99]`` returns.

    Args:
        column: a pandas Series, e.g. one column handed over by
            ``DataFrame.apply``.

    Returns:
        The value stored at position 99.

    Raises:
        IndexError: if ``column`` holds fewer than 100 items.
    """
    return column.iloc[99]
# DataFrame.apply runs a custom function on every column (axis=0).
# NOTE: the result is stored under the name `hundredth_row`, so after this
# line the name refers to the result and no longer to the function.
hundredth_row = titanic_survival.apply(hundredth_row, axis=0)
print(hundredth_row)
# Number of missing values in a column.
def not_null_count(column):
    """Return how many values in ``column`` are missing.

    Despite its name, this function counts the *null* entries.  The name is
    kept so existing callers continue to work.

    The count is taken from ``column`` itself, so the function does not
    depend on any module-level DataFrame and works for any Series.

    Args:
        column: a pandas Series (or other array-like accepted by
            ``pd.isnull``).

    Returns:
        int: the number of missing entries.
    """
    return int(pd.isnull(column).sum())
# Boolean frame of the same shape: True where a value is missing.
print(titanic_survival.isnull())
print(len(titanic_survival.isnull()))  # number of rows (original author's note: 891)
# apply works column by column by default (axis=0); axis=1 would go row by row.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
def which_class(row):
    """Return a readable label for the passenger class of ``row``.

    Args:
        row: a mapping-like object (e.g. a DataFrame row) with a "Pclass"
            entry.

    Returns:
        str: "First Class", "Second Class" or "Third Class" for the values
        1, 2 and 3.  "Unknown" is returned when the value is missing, and
        also for any other value, so the result is always a string rather
        than an implicit None.
    """
    pclass = row["Pclass"]
    # Check for a missing value first: NaN never compares equal to anything.
    if pd.isnull(pclass):
        return "Unknown"
    labels = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    return labels.get(pclass, "Unknown")
# Label every row: with axis="columns" the function receives one row at a time.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
def is_minor(row):
    """Return True when the passenger in ``row`` is under 18.

    The previous condition (``Age > 18``) was inverted: it reported adults as
    minors.  The threshold now matches ``generate_age_label``, which treats
    an age below 18 as "minor".

    Args:
        row: a mapping-like object (e.g. a DataFrame row) with an "Age" entry.

    Returns:
        bool: True if the age is below 18, otherwise False.  A missing age
        (NaN) yields False because NaN comparisons are always false.
    """
    return bool(row["Age"] < 18)
# Evaluate is_minor once per row.
ages = titanic_survival.apply(is_minor, axis="columns")
print(ages)
def generate_age_label(row):
    """Classify the passenger in ``row`` by age.

    Args:
        row: a mapping-like object (e.g. a DataFrame row) with an "Age" entry.

    Returns:
        str: "unknown" when the age is missing, "minor" when it is below 18,
        and "adult" otherwise.
    """
    years = row["Age"]
    # A missing age has to be caught before the numeric comparison.
    if pd.isnull(years):
        return "unknown"
    return "minor" if years < 18 else "adult"
# Build one age label per row and store the labels as a new column.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
titanic_survival["age_labels"] = age_labels
# Mean of Survived for each age label (the mean is pivot_table's default
# aggregation), i.e. the survival rate per group if Survived is a 0/1 flag.
age_group_survival = titanic_survival.pivot_table(values="Survived", index="age_labels")
print(age_group_survival)