import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


# Load the Titanic training set from a machine-specific absolute path.
df = pd.read_csv('C:/Users/sarac/Desktop/Projects/KagglePrivate/Titanic_Data/titanic/train.csv')
df  # bare expression: only has an effect as a notebook cell display


def missing_matrix(dataframe):
    """Summarise the columns of ``dataframe`` that contain missing values.

    Returns a frame with one row per incomplete column and three columns:
    ``column`` (the column name), ``count`` (number of nulls) and ``ratio``
    (nulls divided by the number of rows), ordered by descending ratio.
    Columns without any null are left out; the row labels are the positions
    of the columns in ``dataframe``.
    """
    summary = dataframe.isnull().sum().rename_axis("column").reset_index(name="count")
    summary["ratio"] = summary["count"] / len(dataframe.index)
    incomplete = summary["count"] > 0
    return summary[incomplete].sort_values(by=["ratio"], ascending=False)


# Notebook display: which training columns contain nulls, and how many.
missing_matrix(df)


# The test file has no label column; add an all-NaN "Survived" at position 1
# so that both frames can be stacked with the same column layout.
dftest = pd.read_csv('C:/Users/sarac/Desktop/Projects/KagglePrivate/Titanic_Data/titanic/test.csv')
dftest.insert(1, "Survived", np.nan)


# Stack train and test so every cleaning step below is applied to both.
dfcomb = pd.concat([df, dftest], ignore_index = True)
dfcomb


# NOTE: plain assignment, not a copy - dfclean and dfcomb are the same object.
dfclean = dfcomb


if "Title" not in dfclean.columns:
    dfclean.insert(3, "Title", dfclean["Name"]) #add the new column


# The title is the first run of letters that is directly followed by a dot.
# One vectorised extract is enough: the previous loop over the frame's columns
# recomputed this same assignment once per column. The pattern is a raw
# string so that "\." is a regex escape and not an invalid string escape.
dfclean["Title"] = dfclean["Name"].str.extract(r'([A-Za-z]+)\.', expand=False)

print(dfclean["Title"].unique())  #We took every string combination that came before the dot

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'Countess' 'Jonkheer' 'Dona']


def replace_titles(dataframe):
    """Collapse a passenger's raw title into one of the common titles.

    ``dataframe`` is a single row (anything indexable by ``"Title"`` and
    ``"Sex"``, e.g. a row passed by ``DataFrame.apply(..., axis=1)``).

    Male honorifics map to ``'Mr'``, female honorifics to ``'Mrs'``,
    ``'Mlle'``/``'Ms'`` to ``'Miss'``. ``'Dr'`` is resolved with the
    passenger's sex. Any other title (``'Mr'``, ``'Master'``, a missing
    value, ...) is returned unchanged.
    """
    title = dataframe["Title"]
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir'):
        return 'Mr'
    if title in ('Countess', 'Mme', 'Lady', 'Dona'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: an exact match against 'Male' never
        # matched lowercase 'male', which turned every doctor into 'Mrs'.
        is_male = str(dataframe['Sex']).strip().lower() == 'male'
        return 'Mr' if is_male else 'Mrs'
    return title


# Normalise the title of every row (row-wise, because the rule also reads Sex).
dfclean["Title"] = dfclean.apply(replace_titles, axis=1)
dfclean[["Name", "Title"]]


# Notebook display: mean of Survived for each title.
title_survival = dfclean[['Title', 'Survived']].groupby(['Title'], as_index=False)
title_survival.mean()


# Replace the four remaining titles by integer codes; astype(int) fails if
# any title is not one of the four keys (map would have produced NaN).
title_codes = {"Mr": 1, "Master": 2, "Mrs": 4, "Miss": 3}
dfclean['Title'] = dfclean['Title'].map(title_codes).astype(int)


import requests
from bs4 import BeautifulSoup
from pprint import pprint


# Download the Wikipedia passenger list. The timeout keeps the script from
# hanging forever if the server does not answer.
titanic_page = requests.get("https://en.wikipedia.org/wiki/Passengers_of_the_Titanic", timeout=30)
titanic_tree = BeautifulSoup(titanic_page.content, "html.parser")
column_names = ["NameWeb", "AgeWeb"]
NameW = []
AgeW = []

# One pass over every table row: the first <td> is collected as the name and
# the second <td> (when the row has one) as the age.
for info in titanic_tree.find_all("tr"):
    cells = info.find_all('td')
    if cells:
        NameW.append(cells[0].get_text().strip())
    if len(cells) > 1:
        AgeW.append(cells[1].get_text().strip())

last = NameW.index("Zimmermann, Mr. Leo") #the last element we need is number 1310
#pprint(NameW)

# The rows after "Zimmermann, Mr. Leo" hold information we do not need,
# so both lists are cut right after that entry.
Name = NameW[:last + 1]
Age = AgeW[:last + 1]

def num_there(s):
    """Return True when the string ``s`` contains at least one digit."""
    return any(ch.isdigit() for ch in s)


def list_cleaner(list):
    """Normalise scraped age entries in place and return the same list.

    For each string entry:
    * ``"–"`` (no age given) becomes the rounded mean of ``dfclean["Age"]``;
    * an entry containing ``"mo."`` (age in months) becomes the age in
      years, rounded to two decimals;
    * an entry without any digit is replaced by the entry just before it.
      The first entry has no predecessor and is left as it is (indexing
      ``ind - 1`` there would wrap around to the last element).
    Entries with digits are kept as strings. Entries that are not strings
    (e.g. values converted by an earlier call) are skipped, so the function
    can be applied twice.

    The parameter keeps its historical name ``list``; it shadows the builtin
    inside this function only.
    """
    fallback_age = None  # computed lazily, only if a "–" entry is met
    for ind, x in enumerate(list):
        if not isinstance(x, str):
            continue
        if x == "–":
            if fallback_age is None:
                fallback_age = round(dfclean["Age"].mean(), 0)
            list[ind] = fallback_age
        elif "mo." in x:
            months = ''.join(filter(str.isdigit, x))
            list[ind] = round(int(months) / 12, 2)
        elif not num_there(x) and ind > 0:
            list[ind] = list[ind - 1]
    return list

# Clean the scraped ages; list_cleaner modifies Age in place.
list_cleaner(Age)

# Some more manual cleaning because the HTML layout of the wiki is not perfect:
# an age is inserted at a fixed position and the last element is dropped
# afterwards, so the length of the list stays the same.
for position, age_text in [(19, "24"), (293, "48"), (442, "24")]:
    Age.insert(position, age_text)
    del Age[-1]

#pprint(Age)

#print(len(Age)) #  =1311 

# Write "Miss" as "Miss." in every scraped name; the second replace collapses
# the doubled dot produced when the dot was already there.
Name[:] = [
    n.replace("Miss", "Miss.").replace("Miss..", "Miss.") if "Miss" in n else n
    for n in Name
]

# Two-column frame with the scraped names and ages, in list order.
w = {'Name':Name,'Age':Age}
Webdf = pd.DataFrame(w)

Webdf
miss = missing_matrix(dfclean)
miss


# Fill missing ages with the age scraped from the web list.
# A passenger matches a web row when the dataset name is a substring of the
# web name. indiceslist collects, per filled passenger, the list of matching
# web row positions; count is the number of passengers filled.
indiceslist = []
count = 0
web_names = Webdf["Name"].tolist()

# The rows to fill are fixed up front, so the column is not iterated while
# it is being modified, and the real index labels are used for .at.
for indexMiss in dfclean.index[dfclean["Age"].isna()]:
    passenger_name = dfclean.at[indexMiss, "Name"]
    indice = [pos for pos, web_name in enumerate(web_names) if passenger_name in web_name]
    if not indice:
        continue
    indiceslist.append(indice)
    # If there is more than one match the first one is taken.
    # The scraped ages are a mix of strings and numbers; convert explicitly
    # instead of relying on an implicit cast when writing into the column.
    dfclean.at[indexMiss, "Age"] = float(Webdf.at[indice[0], "Age"])
    count += 1


def checkIfDuplicates(listOfElems):
    """Return True if any element occurs more than once in ``listOfElems``.

    Elements are compared with ``list.count`` (i.e. by equality), so they do
    not need to be hashable; an empty list yields False.
    """
    return any(listOfElems.count(item) > 1 for item in listOfElems)


# Notebook display: True if the same list of web positions was recorded for
# more than one passenger.
checkIfDuplicates(indiceslist)

from nltk import flatten
indices = flatten(indiceslist) #because we actually have a list of list


# Notebook display: nulls left after the web lookup.
missing_matrix(dfclean)


#{"Mr": 1, "Master": 2, "Mrs": 4, "Miss": 3}

# Every age that is still missing gets the rounded mean age of the passengers
# who share the same title code.
for title_code in (1, 4, 3, 2):
    same_title = dfclean.Title == title_code
    mean_age = round(dfclean.Age[same_title].mean(), 0)
    dfclean.loc[dfclean.Age.isnull() & same_title, "Age"] = mean_age

missing_matrix(dfclean)


# AgeRange starts as a copy of Age (column position 7) and is then
# overwritten by a code: 0 for <= 20, 1 for (20, 40], 2 for (40, 60],
# 3 for (60, 80]. The codes themselves are <= 20, so an already coded row
# never falls into a later interval.
dfclean.insert(7, "AgeRange", dfclean['Age'], allow_duplicates=True)


dfclean.loc[dfclean['AgeRange'] <= 20, 'AgeRange'] = 0
for code, (lower, upper) in enumerate([(20, 40), (40, 60), (60, 80)], start=1):
    in_interval = (dfclean['AgeRange'] > lower) & (dfclean['AgeRange'] <= upper)
    dfclean.loc[in_interval, 'AgeRange'] = code
# Notebook display: rows older than 80, which would have been left uncoded.
dfclean.loc[ dfclean['AgeRange'] > 80, 'AgeRange']

Series([], Name: AgeRange, dtype: float64)


#Let's create our "Family" variable

if "Family" not in dfclean.columns:
    # Family = siblings/spouses + parents/children travelling with the passenger.
    dfclean["Family"] = dfclean["SibSp"].add(dfclean["Parch"])
    #dfclean = dfclean.drop(["Parch"], axis="columns")
    #dfclean = dfclean.drop(["SibSp"], axis="columns")

#And our "Group" variable

# Number of passengers holding each ticket.
difftickets = dfclean['Ticket'].value_counts(dropna=False)

if "Group" not in dfclean.columns:
    dfclean.insert(11, "Group", 1)

for t, holders in difftickets.items():
    #we want to count "friends" only not the person itself
    dfclean.loc[dfclean["Ticket"].eq(t), "Group"] = holders - 1

pd.set_option("display.max_rows", 10)
#dfclean.sort_values(by=['Group', 'Ticket'], ascending=False)


# Alone = 1 when nobody else shares the passenger's ticket, 0 otherwise.
if 'Alone' not in dfclean.columns:
    dfclean.insert(13, "Alone", 0, allow_duplicates=True)
    dfclean.loc[dfclean['Group'].eq(0), 'Alone'] = 1


# Missing ports are filled with 'S', then the port letters are replaced by codes.
embark_codes = {'S': 0, 'C': 1, 'Q': 2}
dfclean['Embarked'] = dfclean['Embarked'].fillna('S').map(embark_codes).astype(int)


# Rows with a fare of exactly 0 (kept for inspection before they are replaced).
dffare0 = dfclean[dfclean["Fare"] == 0]

indexlist = dffare0.index

# A zero fare is replaced by the rounded mean fare of the passenger's class.
for pclass in (1, 2, 3):
    in_class = dfclean.Pclass == pclass
    class_mean = round(dfclean.Fare[in_class].mean(), 0)
    dfclean.loc[(dfclean.Fare == 0) & in_class, "Fare"] = class_mean

# The first row with a missing fare receives the rounded third-class mean.
ind = dfclean[dfclean['Fare'].isna()].index
ind
dfclean.at[ind[0],'Fare'] = round(dfclean.Fare[dfclean.Pclass==3].mean(),0)


import re

# Integer code per deck letter; "U" stands for the placeholder cabin "U0".
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

if "Cabin" in dfclean.columns:
    leading_letters = re.compile("([a-zA-Z]+)")
    dfclean["Cabin"] = dfclean["Cabin"].fillna("U0")
    # Deck = first run of letters in the cabin string, mapped to its code;
    # letters missing from the mapping end up as 0.
    deck_letter = dfclean['Cabin'].map(lambda cabin: leading_letters.search(cabin).group())
    dfclean["Deck"] = deck_letter.map(deck).fillna(0).astype(int)
    # Cabin is not needed once Deck exists.
    dfclean = dfclean.drop("Cabin", axis = 1)


# Fare divided by the number of people on the ticket (Group counts the others).
dfclean["FarePerCap"] = round(dfclean["Fare"]/(dfclean["Group"] + 1),2)


# Turn a numerically coded Sex column back into labels. The values are
# tested explicitly: "1 in series" looks at the index labels, not the values.
if dfclean["Sex"].isin([0, 1]).any():
    dfclean['Sex'] = dfclean['Sex'].replace([1, 0], ['female', 'male'])


# The first 891 rows are the training set, the rest is the test set.
# Independent copies, so later column assignments on either part are not
# chained assignments on a slice of dfclean.
dfcleanTrain = dfclean.iloc[:891, :].copy()
dfcleanTest = dfclean.iloc[891:, :].copy()


# Per-sex summary: passenger count, survivors, mean age and mean fare per head.
gender_summary = {
    "Count": ("Survived", 'count'),
    "Survivors": ("Survived", 'sum'),
    "Mean_age": ('Age', 'mean'),
    "Mean_fare": ('FarePerCap', 'mean'),
}
dfgender = dfcleanTrain.groupby('Sex').agg(**gender_summary).round(2)

survival_pct = (dfgender.Survivors / dfgender.Count * 100).round(2)
dfgender.insert(2, 'Surv %', survival_pct)

dfgender


from statsmodels.stats.weightstats import ztest


# Two-sided z-test comparing the Survived values of women and men.
female_survival = dfcleanTrain.loc[dfcleanTrain['Sex'] == "female", 'Survived']
male_survival = dfcleanTrain.loc[dfcleanTrain['Sex'] == "male", 'Survived']
test = ztest(x1=female_survival, x2=male_survival, alternative='two-sided')  #our n = 891 > 30
zstat = test[0].round(2)
pvalue = test[1] #we reject H0


# Same summary as per sex, now for every (sex, class) pair.
class_summary = {
    "Count": ("Survived", 'count'),
    "Survivors": ("Survived", 'sum'),
    "Mean_age": ('Age', 'mean'),
    "Mean_fare": ('FarePerCap', 'mean'),
}
dfclass = dfcleanTrain.groupby(['Sex', 'Pclass'], as_index=False).agg(**class_summary).round(2)

class_survival_pct = (dfclass.Survivors / dfclass.Count * 100).round(2)
dfclass.insert(2, '% Surv', class_survival_pct)

dfclass


import matplotlib.patches as mpatches

# Bar heights: survivors (S) and totals (T) for men (M) and women (F) per class.
yMS = dfclass.loc[(dfclass.Sex == 'male'), "Survivors"]
yFS = dfclass.loc[(dfclass.Sex == 'female'), "Survivors"]
yMT = dfclass.loc[(dfclass.Sex == 'male'), "Count"]
yFT = dfclass.loc[(dfclass.Sex == 'female'), "Count"]
# Survival percentages and survivor counts ordered class by class, men first.
yPerc = dfclass.sort_values(['Pclass', 'Sex'], ascending = [True, False])
yPerc = yPerc['% Surv']

yMSFS = dfclass.sort_values(['Pclass', 'Sex'], ascending = [True, False])
yMSFS = yMSFS['Survivors'].reset_index(drop=True)

X = np.arange(3)
fig = plt.figure(figsize=(7,5), dpi = 80)
ax = fig.add_axes([0,0,1,1])
ax.bar(X + 0.0, yMT, color = '#55AA5F', width = 0.2)
ax.bar(X + 0.2, yMS, color = '#A8D4AD', width = 0.2)
ax.bar(X + 0.45, yFT, color = '#AA55A0', width = 0.2)
ax.bar(X + 0.65, yFS, color = '#d7b0d3', width = 0.2)

xline = np.arange(0, 3, 3/6)
#plt.plot(xline, yPerc*3, color='#CCCCCC')

men = mpatches.Patch(color='#55AA5F', label='Men Total')
women = mpatches.Patch(color='#AA55A0', label='Women Total')
menS = mpatches.Patch(color='#A8D4AD', label='Men Survived')
womenS = mpatches.Patch(color='#d7b0d3', label='Women Survived')


# Write each percentage above its "survived" bar. The label is the same for
# every bar; only the horizontal step to the next label differs (0.55 after
# the 2nd and 4th label, 0.455 otherwise).
a = 0
for i, v in enumerate(yPerc):
    ax.text(0.12 + a, yMSFS[i] + 5, str(v) + '%', color='#4F4F4F', fontsize = 10)
    a += 0.55 if i in (1, 3) else 0.455


plt.legend(handles=[men, women,menS, womenS], loc = 'upper left', fontsize = 11)

# The on/off flag is passed positionally: its keyword was renamed from "b"
# to "visible" in Matplotlib, so the positional form works with both.
plt.grid(True, which='both', axis='y', color='lightgrey', linestyle='--', linewidth=0.5)
plt.title('Survival rate per gender and class\n', fontsize = 17)
plt.xticks(X + 0.325 , ['\n1st Class', '\n2nd Class', '\n3rd Class'], fontsize = 12)
plt.yticks(fontsize=12, rotation=0)
plt.tick_params(bottom = False)


plt.show()


# Encode Sex as 1 (female) / 0 (male). assign() returns a new frame with the
# column replaced at the same position, which avoids the chained-assignment
# warning raised when writing into a slice.
dfcleanTrain = dfcleanTrain.assign(
    Sex=dfcleanTrain['Sex'].map({'female': 1, 'male': 0}).astype(int)
)

# Correlate the numeric columns only; text columns such as Name or Ticket
# cannot be correlated and make corr() raise on recent pandas versions.
corr = dfcleanTrain.select_dtypes(include=['number', 'bool']).corr()

corr.round(2)


dfcleanTrain['Age'].describe()

count    891.000000
mean      29.577071
std       13.776625
min        0.420000
25%       21.000000
50%       28.000000
75%       37.000000
max       80.000000
Name: Age, dtype: float64


# Display names of the four age groups, in code order 0..3.
labels = ['0-20', '21-40', '41-60', '61-80']

if 'AgeRange' not in dfcleanTrain.columns:
    dfcleanTrain.insert(7, 'AgeRange', pd.cut(dfcleanTrain['Age'], bins=[0, 20, 40, 60, 80], labels = labels))


#dfcleanTrain


# Per age group: passengers, survivors, women (Sex is coded 1 for female
# at this point, so its sum counts them) and mean fare per head.
dfage = dfcleanTrain.groupby(['AgeRange'], as_index = False).aggregate(Count = ("Survived", 'count'),
                                                  Survivors = ("Survived", 'sum'),
                                                  Women = ('Sex', 'sum'),
                                                  Mean_fare = ('FarePerCap', 'mean')).round(2)

# Assign the result back instead of calling replace(inplace=True) on the
# column selection, which is not guaranteed to write through to the frame.
dfage['AgeRange'] = dfage['AgeRange'].replace([0,1,2,3], labels)

dfage.insert(3, '% Surv', (dfage.Survivors/dfage.Count * 100).round(2))

dfage.insert(4, '% Women', (dfage.Women/dfage.Count * 100).round(2))
dfage = dfage.drop("Women", axis = 'columns')


display(dfage)


# Per (age group, sex): passengers, survivors and mean fare per head.
dfagegender = dfcleanTrain.groupby(['AgeRange', 'Sex'], as_index = False).aggregate(Count = ("Survived", 'count'),
                                                  Survivors = ("Survived", 'sum'),
                                                  Mean_fare = ('FarePerCap', 'mean')).round(2)


# Turn the codes back into readable labels. The results are assigned back
# instead of using replace(inplace=True) on a column selection, which is
# not guaranteed to write through to the frame.
dfagegender['Sex'] = dfagegender['Sex'].replace([0,1],['male','female'])
dfagegender['AgeRange'] = dfagegender['AgeRange'].replace([0,1,2,3],['0-20', '21-40', '41-60', '61-80'])



dfagegender.insert(4, '% Surv', (dfagegender.Survivors/dfagegender.Count * 100).round(2))

display(dfagegender.sort_values(['Sex'], ascending = [False]))


import matplotlib.patches as mpatches

#creating our X and Y variables
y2MS = dfagegender.loc[(dfagegender.Sex == 'male'), "Survivors"]
y2FS = dfagegender.loc[(dfagegender.Sex == 'female'), "Survivors"]
y2MT = dfagegender.loc[(dfagegender.Sex == 'male'), "Count"]
y2FT = dfagegender.loc[(dfagegender.Sex == 'female'), "Count"]
y2Perc = dfagegender.sort_values(['Sex', 'AgeRange'], ascending = [False, True])
y2MPerc = y2Perc.loc[(y2Perc.Sex == 'male'), "% Surv"]
y2FPerc = y2Perc.loc[(y2Perc.Sex == 'female'), "% Surv"]

y2MSFS = dfagegender.sort_values(['Sex', 'AgeRange'], ascending = [False, True])
y2MSFS = y2MSFS['Survivors'].reset_index(drop=True)

fig2, (ax1, ax2) = plt.subplots(1,2, figsize=(15,7), dpi = 80)

X2 = np.arange(4)
Y2 = np.arange(5)

plt.sca(ax1)

#creating the bars + spaces between
ax1.bar(X2 + 0.0, y2MT, color = '#55AA5F', width = 0.2)
ax1.bar(X2 + 0.2, y2MS, color = '#A8D4AD', width = 0.2)
ax1.bar(X2 + 0.45, y2FT, color = '#AA55A0', width = 0.2)
ax1.bar(X2 + 0.65, y2FS, color = '#d7b0d3', width = 0.2)

#legend
# The handles are created before the legend that uses them, so this cell
# does not depend on patches left over from an earlier figure.
men = mpatches.Patch(color='#55AA5F', label='Men Total')
women = mpatches.Patch(color='#AA55A0', label='Women Total')
menS = mpatches.Patch(color='#A8D4AD', label='Men Survived')
womenS = mpatches.Patch(color='#d7b0d3', label='Women Survived')
ax1.legend(handles=[men, women,menS, womenS], loc = 'upper right', fontsize = 12)

#other parameters
# The on/off flag is passed positionally: its keyword was renamed from "b"
# to "visible" in Matplotlib, so the positional form works with both.
plt.grid(True, which='both', axis='y', color='lightgrey', linestyle='--', linewidth=0.5)
plt.title('Survivors and total per age group and gender\n', fontsize = 17)

plt.xticks(X2 + 0.325, labels, fontsize = 13)
plt.xlabel('\nAge Groups',  fontsize = 15)
plt.ylabel('',  fontsize = 15)
plt.yticks(fontsize=13, rotation=0)
ax1.tick_params(bottom = False)

#Second plot

plt.sca(ax2)
xline = np.arange(0, 4, 1)
ax2.plot(xline, y2MPerc, color='#55AA5F', linewidth = 4,  marker='o')
ax2.plot(xline, y2FPerc, color='#AA55A0', linewidth = 4, marker='o')
plt.title('Survival rate (%) per age group and gender\n', fontsize = 17)

plt.xticks(X2, labels, fontsize = 13)
plt.yticks(np.arange(0,110, 10), fontsize=13, rotation=0)
# Applies to the second axes; the first axes were already configured above.
ax2.tick_params(bottom = False)

ax2.legend(labels = ['Men','Women'], loc = 'upper left', fontsize = 12)
plt.grid(True, which='both', axis='both', color='lightgrey', linestyle='--', linewidth=0.5)

plt.tight_layout()
plt.show()


# Same object as dfcleanTrain until the concat below creates a new frame.
dfcleanTrain2 = dfcleanTrain
#dfcleanTrain2['Sex'].replace(['male','female'],[0,1], inplace=True)


if 'Pclass_1' not in dfcleanTrain2.columns:
    # One indicator column per passenger class: Pclass_1, Pclass_2, Pclass_3.
    classes = pd.get_dummies(dfcleanTrain2.Pclass, prefix='Pclass')
    dfcleanTrain2 = pd.concat([dfcleanTrain2, classes], axis=1)
    # 'Pclass' itself stays in the frame: the former drop(columns=['Pclass'])
    # call discarded its result, and later cells still select 'Pclass'.

# Correlate numeric and indicator columns only; text columns make corr()
# raise on recent pandas versions.
corr2 = dfcleanTrain2.select_dtypes(include=['number', 'bool']).corr()

corr2.round(2)


# Per (sex, age group): passengers in total and in each of the three classes.
dfageclass = dfcleanTrain2.groupby(['Sex','AgeRange'], as_index = False).aggregate(Total = ("Pclass_1", 'count'),
                                                Class1 = ("Pclass_1", 'sum'),
                                                Class2 = ("Pclass_2", 'sum'),
                                                Class3 = ("Pclass_3", 'sum'))

# Turn a numerically coded Sex column back into labels. The values are
# tested explicitly: "1 in series" looks at the index labels, not the values.
if dfageclass["Sex"].isin([0, 1]).any():
    dfageclass['Sex'] = dfageclass['Sex'].replace([1, 0], ['female', 'male'])

# Share of each class within the group, appended as columns 6, 7 and 8.
for i in range(1, 4):
    class_share = (dfageclass['Class' + str(i)]/dfageclass.Total * 100).round(2)
    dfageclass.insert(5 + i, '% Class' + str(i), class_share)

# Per (sex, class): number of known ages with their maximum, minimum and mean.
dfageclass2 = dfcleanTrain.groupby(['Sex','Pclass'], as_index = False).aggregate(Count = ("Age", 'count'),
                                                MaxAge = ("Age", 'max'),
                                                MinAge = ("Age", 'min'),
                                                AvgAge = ("Age", 'mean')).round(1)

display(dfageclass)
display(dfageclass2)

##boxplot and lines


# Turn the 0/1 codes back into labels for the plots below. The column is
# replaced as a whole: writing strings into an integer column cell by cell
# through .loc is deprecated in recent pandas versions.
dfcleanTrain["Sex"] = dfcleanTrain["Sex"].replace({0: 'male', 1: 'female'})


from matplotlib.lines import Line2D

# Share (%) of men (M) / women (F) travelling in class 1, 2, 3 per age group.
M1 = dfageclass.loc[(dfageclass.Sex == 'male'), "% Class1"]
F1 = dfageclass.loc[(dfageclass.Sex == 'female'), "% Class1"]
M2 = dfageclass.loc[(dfageclass.Sex == 'male'), "% Class2"]
F2 = dfageclass.loc[(dfageclass.Sex == 'female'), "% Class2"]
M3 = dfageclass.loc[(dfageclass.Sex == 'male'), "% Class3"]
F3 = dfageclass.loc[(dfageclass.Sex == 'female'), "% Class3"]

fig3, (ax3, ax4) = plt.subplots(1,2, figsize=(15,7), dpi = 80)

# Ages per box, in the order women 1st/2nd/3rd class, then men 1st/2nd/3rd.
databox= []
for i in ['female', 'male']:
    for j in [1,2,3]:
        f = dfcleanTrain.loc[(dfcleanTrain.Sex == str(i)) &(dfcleanTrain.Pclass == j),"Age"]
        databox.append(f)

#first plot
plt.sca(ax3)
plt.title('Boxplot for men and women in each class\n', fontsize = 15)
labelsbox=[' ','1st', '2nd', '3rd','1st', '2nd', '3rd']

flierprops = dict(marker='o', markerfacecolor='lightgrey', markersize=8,
                  linestyle=':', markeredgecolor='w')

medianprops = dict(linewidth=2, color='black')

box = ax3.boxplot(databox, widths=0.4, medianprops=medianprops, showfliers=False, patch_artist=True)

men = mpatches.Patch(color='#55AA5F', label='Men')
women = mpatches.Patch(color='#AA55A0', label='Women')

ax3.legend(handles = [men, women], loc = 'upper right', fontsize = 12)
# The on/off flag is passed positionally: its keyword was renamed from "b"
# to "visible" in Matplotlib, so the positional form works with both.
plt.grid(True, which='both', axis='both', color='lightgrey', linestyle='--', linewidth=0.5)
plt.xlabel('\nClasses',  fontsize = 13)
plt.ylabel('\nAge',  fontsize = 13)

plt.xticks(np.arange(7), labelsbox, fontsize = 12)
plt.yticks(fontsize = 12)

# First three boxes (women) in purple, last three (men) in green.
colors = ['#AA55A0','#AA55A0','#AA55A0', '#55AA5F','#55AA5F','#55AA5F']
for patch, color in zip(box['boxes'], colors):
        patch.set_facecolor(color)


#second plot
plt.sca(ax4)
plt.title('Share (%) of women and men in 1st and 3rd class\n', fontsize = 15)
xline = np.arange(0, 4, 1)

# Solid lines: 1st class, dash-dotted lines: 3rd class.
ax4.plot(xline, M1, color='#55AA5F', linewidth = 2)
ax4.plot(xline, F1, color='#AA55A0', linewidth = 2)
ax4.plot(xline, M3, color='#55AA5F', linewidth = 2, linestyle='-.')
ax4.plot(xline, F3, color='#AA55A0', linewidth = 2, linestyle='-.' )

ax4.text(0.1, 16.3, '1st Women', fontsize = 11, color='#AA55A0', rotation=25)
ax4.text(0.1, 4, '1st Men', fontsize = 11, color='#55AA5F', rotation=25)
ax4.text(0.1, 55, '3rd Women', fontsize = 11, color='#AA55A0', rotation=-33)
ax4.text(0.1, 73.5, '3rd Men', fontsize = 11, color='#55AA5F', rotation=-31)

plt.xlabel('\n Age Groups',  fontsize = 13)

plt.xticks(X2, labels, fontsize = 12)
plt.yticks(fontsize = 12)

plt.grid(True, which='both', axis='both', color='lightgrey', linestyle='--', linewidth=0.5)


# Expand the Deck code into one indicator column per code (Deck_0 ... Deck_8).
if 'Deck_0' not in dfcleanTrain.columns:
    dfcleanTrain = pd.get_dummies(dfcleanTrain, columns=['Deck'])


# Per (sex, survived): number of passengers in total and on every deck code.
deck_sums = {'Deck_' + str(code): ('Deck_' + str(code), 'sum') for code in range(9)}
dfdeck = dfcleanTrain.groupby(['Sex', 'Survived'], as_index=False).agg(
    Total=("Deck_0", 'count'), **deck_sums)

dfdeck


#Let's create groups because we don't have enough data for so many different group sizes.

labelsGroup = ['Alone', '1', '2', '3', '4+']

# GroupSize: number of ticket companions, with 4 or more merged into '4+'.
if 'GroupSize' not in dfcleanTrain.columns:
    group_size = pd.cut(dfcleanTrain['Group'], bins=[-1, 0, 1, 2, 3, 100],
                        labels=labelsGroup, right=True)
    dfcleanTrain.insert(12, 'GroupSize', group_size)

labelsGroup2 = ['Alone', 'not Alone']

# GroupSize2: the same information reduced to two categories.
if 'GroupSize2' not in dfcleanTrain.columns:
    group_size2 = pd.cut(dfcleanTrain['Group'], bins=[-1, 0, 100],
                         labels=labelsGroup2, right=True)
    dfcleanTrain.insert(12, 'GroupSize2', group_size2)


group_summary = {"Total": ("Survived", 'count'), "Survivors": ("Survived", 'sum')}

# NOTE: this per-class table is overwritten by the per-sex table right below.
dfgroup = dfcleanTrain.groupby(['Pclass', 'GroupSize'], as_index=False).agg(**group_summary)

dfgroup = dfcleanTrain.groupby(['Sex', 'GroupSize'], as_index=False).agg(**group_summary)

dfgroup.insert(3, '% Surv', (dfgroup.Survivors / dfgroup.Total * 100).round(2))

# Show every row of the table, ignoring the global max_rows setting.
with pd.option_context('display.max_rows', None):
    display(dfgroup)


# Survival percentage per group size for women (GF) and men (GM).
GF = dfgroup.loc[(dfgroup.Sex == 'female'), "% Surv"]
GM = dfgroup.loc[(dfgroup.Sex == 'male'), "% Surv"]


fig3, ax5 = plt.subplots(figsize=(7,5), dpi = 80)

xlineG = np.arange(0, 5, 1)
ax5.plot(xlineG, GM, color='#55AA5F', linewidth = 2,  marker='o')
ax5.plot(xlineG, GF, color='#AA55A0', linewidth = 2, marker='o')
plt.title('Survival rate (%) per gender and group size\n', fontsize = 15)

XG = np.arange(5)

plt.xticks(XG, labelsGroup, fontsize = 13)
plt.yticks(np.arange(0,110, 10), fontsize=13, rotation=0)
ax5.tick_params(bottom = False)

ax5.legend(labels = ['Men','Women'], loc = 'upper left', fontsize = 11)
# The on/off flag is passed positionally: its keyword was renamed from "b"
# to "visible" in Matplotlib, so the positional form works with both.
plt.grid(True, which='both', axis='both', color='lightgrey', linestyle='--', linewidth=0.5)

plt.xlabel('\nGroup Size',  fontsize = 13)

plt.show()


# Per (sex, port code): passengers, survivors and mean fare per head.
port_summary = {
    "Total": ("Survived", 'count'),
    "Survivors": ("Survived", 'sum'),
    "AvgFare": ("FarePerCap", 'mean'),
}
dfemb = dfcleanTrain.groupby(['Sex', 'Embarked'], as_index=False).agg(**port_summary)

port_survival_pct = (dfemb.Survivors / dfemb.Total * 100).round(2)
dfemb.insert(4, '% Surv', port_survival_pct)

dfemb


#TitanicTableau = dfcleanTrain[['PassengerId', 'Survived', 'Pclass', 'Title', 'Name', 'Sex', 'Age',
       #'AgeRange', 'SibSp', 'Parch', 'Ticket', 'Group', 'GroupSize2',
       #'GroupSize', 'Fare', 'Alone', 'Embarked', 'Family', 'FarePerCap']]

#TitanicTableau['Survived'] = TitanicTableau['Survived'].astype(int)
#TitanicTableau['Fare'] = TitanicTableau['Fare'].astype(float)
#TitanicTableau['Age'] = TitanicTableau['Age'].astype(float)


#TitanicTableau.to_csv('C:/Users/sarac/Desktop/Projects/KagglePrivate/Titanic_Data/titanic/TitanicTableau.csv', index=False)


from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE


dfcleanTrain2.columns  # notebook display: columns available for modelling

Index(['PassengerId', 'Survived', 'Pclass', 'Title', 'Name', 'Sex', 'Age',
       'AgeRange', 'SibSp', 'Parch', 'Ticket', 'Group', 'Fare', 'Alone',
       'Embarked', 'Family', 'Deck', 'FarePerCap', 'Pclass_1', 'Pclass_2',
       'Pclass_3'],
      dtype='object')


# Candidate features: everything except the label and the identifier/text columns.
X = dfcleanTrain2.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])
y = dfcleanTrain2["Survived"]


# Recursive feature elimination with a decision tree, removing one feature
# per step until five are left; the last line displays the kept columns.
rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5, step=1)
rfe_selector = rfe_selector.fit(X, y)
X.columns[rfe_selector.get_support()]

Index(['Title', 'Age', 'Fare', 'FarePerCap', 'Pclass_3'], dtype='object')


tree_features = ['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex', 'Alone']
X = dfcleanTrain2[tree_features]

# 5-fold grid search over the split criterion, the splitter and depths 5..9.
tree_grid = {
    "criterion": ['gini', 'entropy'],
    "splitter": ['best', 'random'],
    "max_depth": range(5, 10),
}
clf = GridSearchCV(DecisionTreeClassifier(random_state=0), tree_grid,
                   cv=5, return_train_score=False)

clf.fit(X, y)
clf.cv_results_

# NOTE: head(5) is applied before the sort, so only the first five parameter
# combinations of the grid are ranked here, not the five best ones.
dftree = pd.DataFrame(clf.cv_results_).head(5)
tree_report_columns = ["param_criterion", "param_max_depth",
                       "param_splitter", "mean_test_score"]
dftree[tree_report_columns].sort_values(by=['mean_test_score'], ascending=False)


from sklearn.linear_model import LogisticRegression

# Feature elimination with a logistic regression; 8 features are requested,
# so at most 8 of the columns of X are kept.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=8, step=1)
rfe_selector = rfe_selector.fit(X, y)
X.columns[rfe_selector.get_support()]

Index(['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex',
       'Alone'],
      dtype='object')


X = dfcleanTrain2[['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex', 'Alone']]

# 5-fold grid search over the regularisation strength C, the intercept and
# the penalty type. multi_class is no longer passed: it is deprecated in
# recent scikit-learn releases and its 'auto' value already resolved to the
# one-vs-rest scheme that liblinear uses.
clf2 = GridSearchCV(LogisticRegression(solver='liblinear'), {
    "C": range(5, 15),
    "fit_intercept": [True, False],
    "penalty": ['l1', 'l2']},
                   cv = 5, return_train_score = False)

clf2.fit(X, y)
clf2.cv_results_

# NOTE: head(5) is applied before the sort, so only the first five parameter
# combinations of the grid are ranked here.
dflog = pd.DataFrame(clf2.cv_results_).head(5)

dflog[["param_C", "param_fit_intercept", 'param_penalty'
       , "mean_test_score"]].sort_values(by=['mean_test_score'], ascending=False)


from sklearn.neural_network import MLPClassifier

# Feature elimination with a neural network; 8 features are requested,
# so at most 8 of the columns of X are kept.
rfe_selector = RFE(estimator=MLPClassifier(), n_features_to_select=8, step=1)
rfe_selector = rfe_selector.fit(X, y)
X.columns[rfe_selector.get_support()]

Index(['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex',
       'Alone'],
      dtype='object')


mlp_features = ['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex', 'Alone']
X = dfcleanTrain2[mlp_features]

# 5-fold grid search over two network layouts, two alpha values, the
# learning-rate schedule and the solver (tanh activation throughout).
mlp_grid = {
    "hidden_layer_sizes": [(50,50,50), (50,100,50)],
    'alpha': [0.0001, 0.05],
    'activation': ['tanh'],
    "learning_rate" : ['constant', 'adaptive'],
    "solver": ['sgd', 'adam'],
}
clf3 = GridSearchCV(MLPClassifier(random_state=0, max_iter=500), mlp_grid,
                    cv=5, return_train_score=False)

clf3.fit(X, y)
clf3.cv_results_

# NOTE: head(5) is applied before the sort, so only the first five parameter
# combinations of the grid are ranked here.
dfNN = pd.DataFrame(clf3.cv_results_).head(5)

mlp_report_columns = ["param_hidden_layer_sizes", "param_learning_rate", 'param_activation',
                      'param_alpha', 'param_solver', "mean_test_score"]
dfNN[mlp_report_columns].sort_values(by=['mean_test_score'], ascending=False)


from sklearn.ensemble import RandomForestClassifier

# Feature elimination with a random forest, keeping six of the columns of X.
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=6, step=1)
rfe_selector = rfe_selector.fit(X, y)
X.columns[rfe_selector.get_support()]


forest_features = ['Pclass', 'AgeRange', 'Fare', 'Embarked', 'Title', 'Sex', 'Alone', 'Deck']
X = dfcleanTrain2[forest_features]

# 5-fold grid search over the criterion, forest size, max_features and depth.
# NOTE(review): 'auto' is presumably rejected by recent scikit-learn
# releases; those fits would then only produce failed (NaN) scores - confirm.
forest_grid = {
    "criterion": ['gini', 'entropy'],
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    "max_depth": [5,10],
}
clf4 = GridSearchCV(RandomForestClassifier(random_state=0), forest_grid,
                    cv=5, return_train_score=False)

clf4.fit(X, y)
clf4.cv_results_

# NOTE: head(5) is applied before the sort, so only the first five parameter
# combinations of the grid are ranked here.
dfRF = pd.DataFrame(clf4.cv_results_).head(5)

forest_report_columns = ["param_criterion", "param_n_estimators", 'param_max_features',
                         'param_max_depth', "mean_test_score"]
dfRF[forest_report_columns].sort_values(by=['mean_test_score'], ascending=False)


from sklearn.svm import SVC

# Feature elimination needs per-feature weights from the estimator. An SVC
# only provides them (coef_) with a linear kernel, so the same kernel as in
# the grid search below is used; the default RBF kernel cannot rank features.
rfe_selector = RFE(estimator=SVC(kernel='linear'), n_features_to_select = 7, step = 1)
rfe_selector.fit(X, y)
X.columns[rfe_selector.get_support()]


X = dfcleanTrain2[['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex', 'Alone']]

# 5-fold grid search over gamma for a linear-kernel SVC.
# NOTE(review): gamma is presumably ignored by the linear kernel, in which
# case both candidates score the same - confirm against the SVC documentation.
clf5 = GridSearchCV(SVC(probability=True, kernel = 'linear' ),{
    "gamma": ['scale', 'auto']},
                   cv = 5, return_train_score = False)

clf5.fit(X, y)
clf5.cv_results_

# NOTE: this reuses the name dflog, replacing the logistic-regression table.
dflog = pd.DataFrame(clf5.cv_results_).head(5)

dflog[['param_gamma'
       , "mean_test_score"]].sort_values(by=['mean_test_score'], ascending=False)


#dfcleanTest['Sex'].replace(['male','female'],[0,1],inplace=True)


# Best decision tree of the grid search, scored with 5-fold cross-validation.
bestmodel = DecisionTreeClassifier(random_state=0, criterion = 'gini',
                                   splitter = 'best', max_depth = 5)

cross_val_score(bestmodel, X, y, cv=5).mean().round(4)


# Random forest candidate. max_features='sqrt' is what 'auto' meant for a
# classifier; 'auto' itself is no longer accepted by recent scikit-learn.
bestmodel2 = RandomForestClassifier(random_state=0, criterion = 'gini', n_estimators= 500,
    max_features = 'sqrt', max_depth = 5)

cross_val_score(bestmodel2, X, y, cv=5).mean().round(4)


bestmodel2.fit(X, y)


# Test features, in the same column order as the training matrix X.
Xpred = dfcleanTest[['Pclass', 'AgeRange', 'FarePerCap', 'Embarked', 'Title', 'Sex', 'Alone']].copy()
# The model was trained on Sex coded 1 (female) / 0 (male); the test rows
# still carry the text labels, which the model cannot consume.
if Xpred['Sex'].isin(['female', 'male']).any():
    Xpred['Sex'] = Xpred['Sex'].map({'female': 1, 'male': 0})
ypred = bestmodel2.predict(Xpred)


#pd.set_option("display.max_rows", None)
#Xpred


#np.where(np.isnan(Xpred))


# Replace the empty Survived column of the test set by the predictions.
# The column is dropped by keyword: the positional axis argument of drop()
# is no longer accepted by recent pandas versions.
dfcleanTest = dfcleanTest.drop(columns='Survived')

dfcleanTest['Survived'] = ypred


# Submission frame: passenger id and prediction. The copy makes Final an
# independent frame, so the dtype conversion below is not a chained
# assignment on a slice of dfcleanTest.
Final = dfcleanTest[['PassengerId','Survived']].copy()


Final['Survived'] = Final['Survived'].astype(int)
Final.dtypes


Final.to_csv('C:/Users/sarac/Desktop/Projects/KagglePrivate/Titanic_Data/titanic/Final4.csv', index=False)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0.0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1.0	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1.0	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1.0	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0.0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
1304	1305	NaN	3	Spector, Mr. Woolf	male	NaN	0	0	A.5. 3236	8.0500	NaN	S
1305	1306	NaN	1	Oliva y Ocana, Dona. Fermina	female	39.0	0	0	PC 17758	108.9000	C105	C
1306	1307	NaN	3	Saether, Mr. Simon Sivertsen	male	38.5	0	0	SOTON/O.Q. 3101262	7.2500	NaN	S
1307	1308	NaN	3	Ware, Mr. Frederick	male	NaN	0	0	359309	8.0500	NaN	S
1308	1309	NaN	3	Peter, Master. Michael J	male	NaN	1	1	2668	22.3583	NaN	C

	Name	Title
0	Braund, Mr. Owen Harris	Mr
1	Cumings, Mrs. John Bradley (Florence Briggs Th...	Mrs
2	Heikkinen, Miss. Laina	Miss
3	Futrelle, Mrs. Jacques Heath (Lily May Peel)	Mrs
4	Allen, Mr. William Henry	Mr
...	...	...
1304	Spector, Mr. Woolf	Mr
1305	Oliva y Ocana, Dona. Fermina	Mrs
1306	Saether, Mr. Simon Sivertsen	Mr
1307	Ware, Mr. Frederick	Mr
1308	Peter, Master. Michael J	Master

	Sex	Pclass	% Surv	Count	Survivors	Mean_age	Mean_fare
0	female	1	96.81	94	91.0	34.68	35.61
1	female	2	92.11	76	70.0	28.98	11.47
2	female	3	50.00	144	72.0	22.86	7.02
3	male	1	36.89	122	45.0	41.01	35.70
4	male	2	15.74	108	17.0	30.34	11.75
5	male	3	13.54	347	47.0	26.85	7.50

	PassengerId	Survived	Pclass	Title	Sex	Age	AgeRange	SibSp	Parch	Group	Fare	Alone	Embarked	Family	Deck	FarePerCap
PassengerId	1.00	-0.01	-0.04	-0.03	-0.04	0.04	0.03	-0.06	-0.00	-0.01	0.02	0.02	-0.03	-0.04	-0.03	0.03
Survived	-0.01	1.00	-0.34	0.56	0.54	-0.06	-0.06	-0.04	0.08	0.06	0.25	-0.25	0.11	0.02	-0.29	0.24
Pclass	-0.04	-0.34	1.00	-0.16	-0.13	-0.37	-0.32	0.08	0.02	-0.04	-0.56	0.27	0.05	0.07	0.75	-0.77
Title	-0.03	0.56	-0.16	1.00	0.93	-0.07	-0.05	0.17	0.30	0.22	0.18	-0.39	0.09	0.27	-0.13	0.09
Sex	-0.04	0.54	-0.13	0.93	1.00	-0.09	-0.07	0.11	0.25	0.18	0.17	-0.31	0.12	0.20	-0.11	0.08
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Alone	0.02	-0.25	0.27	-0.39	-0.31	0.13	0.12	-0.44	-0.47	-0.68	-0.44	1.00	0.02	-0.54	0.21	-0.18
Embarked	-0.03	0.11	0.05	0.09	0.12	-0.04	-0.04	-0.06	-0.08	-0.07	0.05	0.02	1.00	-0.08	-0.04	0.05
Family	-0.04	0.02	0.07	0.27	0.20	-0.29	-0.24	0.89	0.78	0.82	0.21	-0.54	-0.08	1.00	0.01	-0.09
Deck	-0.03	-0.29	0.75	-0.13	-0.11	-0.28	-0.23	0.04	-0.03	-0.06	-0.53	0.21	-0.04	0.01	1.00	-0.69
FarePerCap	0.03	0.24	-0.77	0.09	0.08	0.32	0.25	-0.09	-0.05	0.03	0.80	-0.18	0.05	-0.09	-0.69	1.00

Titanic data analysis and prediction¶

1. Data Cleaning¶

We can go through the variables to see which ones we can use, drop, or create for the prediction and analysis parts. But our intuition may be wrong, which is why we need the analysis part to confirm, reject, or discover relations.¶

2. Data Analysis¶

Gender¶

3. Prediction¶

Decision tree¶

Logistic regression¶

Neural Network¶

Random Forest¶

SVC¶

Let's predict with our best model¶

	column	count	ratio
11	Cabin	1014	0.774637
1	Survived	418	0.319328
6	Age	263	0.200917
12	Embarked	2	0.001528
10	Fare	1	0.000764

	AgeRange	Count	Survivors	% Surv	% Women	Mean_fare
0	0-20	205	87.0	42.44	38.54	10.06
1	21-40	521	196.0	37.62	35.70	15.07
2	41-60	141	54.0	38.30	32.62	21.26
3	61-80	24	5.0	20.83	12.50	22.83

	AgeRange	Sex	Count	Survivors	% Surv	Mean_fare
0	0-20	male	126	33.0	26.19	8.84
2	21-40	male	335	55.0	16.42	14.07
4	41-60	male	95	19.0	20.00	20.31
6	61-80	male	21	2.0	9.52	22.49
1	0-20	female	79	54.0	68.35	12.01
3	21-40	female	186	141.0	75.81	16.87
5	41-60	female	46	35.0	76.09	23.21
7	61-80	female	3	3.0	100.00	25.19

	Sex	AgeRange	Total	Class1	Class2	Class3	% Class1	% Class2	% Class3
0	male	0.0	126	7	21	98	5.56	16.67	77.78
1	male	1.0	335	56	66	213	16.72	19.70	63.58
2	male	2.0	95	47	18	30	49.47	18.95	31.58
3	male	3.0	21	12	3	6	57.14	14.29	28.57
4	female	0.0	79	14	16	49	17.72	20.25	62.03
5	female	1.0	186	54	46	86	29.03	24.73	46.24
6	female	2.0	46	24	14	8	52.17	30.43	17.39
7	female	3.0	3	2	0	1	66.67	0.00	33.33

	Sex	Pclass	Count	MaxAge	MinAge	AvgAge
0	0	1	122	80.0	0.9	41.0
1	0	2	108	70.0	0.7	30.3
2	0	3	347	74.0	0.4	26.9
3	1	1	94	63.0	2.0	34.7
4	1	2	76	57.0	2.0	29.0
5	1	3	144	63.0	0.8	22.9

	Sex	Survived	Total	Deck_0	Deck_1	Deck_2	Deck_3	Deck_4	Deck_5	Deck_6	Deck_7	Deck_8
0	female	0.0	81	0	0	0	3	0	1	0	2	75.0
1	female	1.0	233	0	1	27	24	18	14	5	2	142.0
2	male	0.0	468	1	8	12	21	8	7	5	0	406.0
3	male	1.0	109	0	6	8	11	7	10	3	0	64.0

	Sex	GroupSize	Total	% Surv	Survivors
0	female	Alone	103	70.87	73.0
1	female	1	92	80.43	74.0
2	female	2	55	89.09	49.0
3	female	3	24	95.83	23.0
4	female	4+	40	35.00	14.0
5	male	Alone	378	15.08	57.0
6	male	1	89	21.35	19.0
7	male	2	46	36.96	17.0
8	male	3	20	45.00	9.0
9	male	4+	44	15.91	7.0

	Sex	Embarked	Total	Survivors	% Surv	AvgFare
0	female	0	205	142.0	69.27	14.695366
1	female	1	73	64.0	87.67	26.154521
2	female	2	36	27.0	75.00	8.555278
3	male	0	441	77.0	17.46	13.157370
4	male	1	95	29.0	30.53	22.043789
5	male	2	41	3.0	7.32	8.087561

	param_criterion	param_max_depth	param_splitter	mean_test_score
0	gini	5	best	0.826031
2	gini	6	best	0.822685
4	gini	7	best	0.819308
3	gini	6	random	0.805819
1	gini	5	random	0.796849

	param_C	param_fit_intercept	param_penalty	mean_test_score
0	5	True	l1	0.808097
4	6	True	l1	0.808097
1	5	True	l2	0.806974
2	5	False	l1	0.793522
3	5	False	l2	0.792398

	Count	Survivors	Surv %	Mean_age	Mean_fare
Sex
female	314	233.0	74.20	27.88	16.66
male	577	109.0	18.89	30.50	14.26