In [ ]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
import pandas as pd
# Try to detect the delimiter automatically
df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=None, engine='python')
# If you know the delimiter, specify it directly
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=',')   # comma-separated
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=';')   # semicolon-separated
df.shape
Out[ ]:
(553, 52)
In [ ]:
# Encode categorical variables with LabelEncoder
# Note: the encoded NumHuevo and NumPicho values are stored in (and overwrite)
# the NomCien and Sitio columns, which is why those columns appear as integers below.
label_encoder_NumHuevo = LabelEncoder()
label_encoder_NumPicho = LabelEncoder()
label_encoder_Evid_Saque = LabelEncoder()
label_encoder_Depredado = LabelEncoder()
df['NomCien'] = label_encoder_NumHuevo.fit_transform(df['NumHuevo'])
df['Sitio'] = label_encoder_NumPicho.fit_transform(df['NumPicho'])
df['Evid_Saque'] = label_encoder_Evid_Saque.fit_transform(df['Evid_Saque'])
df['Depredado'] = label_encoder_Depredado.fit_transform(df['Depredado'])
print(df.head())
   ID CodArbol  NomCien    NomCom EspePsitac  Sitio   Año Temporada  \
0   1    PSSG1        4  Genizaro        LNA      0  2019   Oct-Ene
1   2    PSAP2        4    Panama        LNA      1  2019   Oct-Ene
2   4    PSAP3        2    Panama        LNA      4  2019   Oct-Ene
3   5    PHCJ4        1   Jabillo        LNA      4  2019   Oct-Ene
4   6    PBAO5        2    Ojoche        LNA      1  2019   Oct-Ene

  MesEscal Escalado  ... ProfEntCavid DAPArbol                     Clima  \
0      Nov       Si  ...           74      324            Soleado viento
1      Dic       Si  ...           85     13.2   50%nublado mucho viento
2      Nov       Si  ...           80     11.2   30%nublado mucho viento
3      Dic       Si  ...           70      4.6    20% nublado sin viento
4      Dic       Si  ...           29      3.7                   Soleado

  Nubosidad  Nubosid%  Soleado  Solead% Precip  Viento ActivoAno
0       NaN       NaN  Soleado      NaN    NaN    Poco        No
1   Nublado      50.0      NaN      NaN    NaN  Fuerte        Si
2   Nublado      30.0      NaN      NaN    NaN  Fuerte        Si
3   Nublado      20.0      NaN      NaN    NaN     NaN        Si
4       NaN       NaN  Soleado      NaN    NaN     NaN        Si

[5 rows x 52 columns]
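If the original categories ever need to be recovered, each fitted LabelEncoder keeps them in its classes_ attribute. A minimal sketch (assuming the encoder objects from the cell above are still in scope):

# Sketch: print the integer -> original value mapping learned by each encoder
for name, enc in {'Evid_Saque': label_encoder_Evid_Saque,
                  'Depredado': label_encoder_Depredado}.items():
    # classes_[i] is the original value that the integer code i stands for
    print(name, dict(enumerate(enc.classes_)))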
In [ ]:
df.head()
Out[ ]:
ID | CodArbol | NomCien | NomCom | EspePsitac | Sitio | Año | Temporada | MesEscal | Escalado | ... | ProfEntCavid | DAPArbol | Clima | Nubosidad | Nubosid% | Soleado | Solead% | Precip | Viento | ActivoAno | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | PSSG1 | 4 | Genizaro | LNA | 0 | 2019 | Oct-Ene | Nov | Si | ... | 74 | 324 | Soleado viento | NaN | NaN | Soleado | NaN | NaN | Poco | No |
1 | 2 | PSAP2 | 4 | Panama | LNA | 1 | 2019 | Oct-Ene | Dic | Si | ... | 85 | 13.2 | 50%nublado mucho viento | Nublado | 50.0 | NaN | NaN | NaN | Fuerte | Si |
2 | 4 | PSAP3 | 2 | Panama | LNA | 4 | 2019 | Oct-Ene | Nov | Si | ... | 80 | 11.2 | 30%nublado mucho viento | Nublado | 30.0 | NaN | NaN | NaN | Fuerte | Si |
3 | 5 | PHCJ4 | 1 | Jabillo | LNA | 4 | 2019 | Oct-Ene | Dic | Si | ... | 70 | 4.6 | 20% nublado sin viento | Nublado | 20.0 | NaN | NaN | NaN | NaN | Si |
4 | 6 | PBAO5 | 2 | Ojoche | LNA | 1 | 2019 | Oct-Ene | Dic | Si | ... | 29 | 3.7 | Soleado | NaN | NaN | Soleado | NaN | NaN | NaN | Si |
5 rows × 52 columns
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 52 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   ID            553 non-null    int64
 1   CodArbol      553 non-null    object
 2   NomCien       553 non-null    int64
 3   NomCom        553 non-null    object
 4   EspePsitac    553 non-null    object
 5   Sitio         553 non-null    int64
 6   Año           553 non-null    int64
 7   Temporada     548 non-null    object
 8   MesEscal      553 non-null    object
 9   Escalado      553 non-null    object
 10  CodNido       553 non-null    object
 11  ConCavi       533 non-null    object
 12  ConCavi_B     533 non-null    object
 13  ConCavi_R     533 non-null    object
 14  ConCavi_M     533 non-null    float64
 15  Humedo        533 non-null    object
 16  PocoHume      533 non-null    object
 17  Seca          533 non-null    object
 18  Evid_Activ    15 non-null     float64
 19  Evid_Saque    553 non-null    int64
 20  Depredado     553 non-null    int64
 21  CompPadre     471 non-null    object
 22  PresePadr     472 non-null    float64
 23  UbicPadr      148 non-null    object
 24  CompPadr      171 non-null    object
 25  NumHuevo      117 non-null    float64
 26  NumPicho      258 non-null    float64
 27  PichoSaque    53 non-null     object
 28  PichoMuerto   14 non-null     float64
 29  PichoVuelEx   202 non-null    float64
 30  PichoRehab    7 non-null      float64
 31  EstadPicho    203 non-null    object
 32  Edad1Dia      183 non-null    float64
 33  Edad2Dia      121 non-null    float64
 34  Edad3Dia      49 non-null     float64
 35  Edad4Dia      1 non-null      float64
 36  AltCavidad    494 non-null    float64
 37  DirCavidad    441 non-null    object
 38  DirGrado      382 non-null    float64
 39  DirPuntCard   404 non-null    object
 40  AltEntCavid   459 non-null    float64
 41  AncEntCavid   473 non-null    float64
 42  ProfEntCavid  469 non-null    object
 43  DAPArbol      490 non-null    object
 44  Clima         523 non-null    object
 45  Nubosidad     270 non-null    object
 46  Nubosid%      224 non-null    float64
 47  Soleado       211 non-null    object
 48  Solead%       85 non-null     float64
 49  Precip        11 non-null     object
 50  Viento        347 non-null    object
 51  ActivoAno     546 non-null    object
dtypes: float64(18), int64(6), object(28)
memory usage: 224.8+ KB
In [ ]:
df.describe()
Out[ ]:
ID | NomCien | Sitio | Año | ConCavi_M | Evid_Activ | Evid_Saque | Depredado | PresePadr | NumHuevo | ... | Edad1Dia | Edad2Dia | Edad3Dia | Edad4Dia | AltCavidad | DirGrado | AltEntCavid | AncEntCavid | Nubosid% | Solead% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 553.000000 | 553.000000 | 553.000000 | 553.000000 | 533.000000 | 15.000000 | 553.000000 | 553.000000 | 472.000000 | 117.000000 | ... | 183.000000 | 121.000000 | 49.000000 | 1.0 | 494.000000 | 382.000000 | 459.000000 | 473.000000 | 224.000000 | 85.000000 |
mean | 300.781193 | 3.471971 | 2.556962 | 2021.094033 | 0.120075 | 0.733333 | 1.115732 | 1.133816 | 0.792373 | 1.504274 | ... | 29.628415 | 27.884298 | 24.285714 | 60.0 | 14.214730 | 178.378796 | 29.188017 | 14.375264 | 55.553571 | 85.470588 |
std | 173.578703 | 1.111383 | 1.638783 | 1.902092 | 0.325355 | 0.457738 | 0.426889 | 0.450639 | 0.406039 | 0.961536 | ... | 15.321954 | 15.259527 | 13.604534 | NaN | 5.919842 | 106.788897 | 28.088252 | 9.143355 | 31.790186 | 25.208931 |
min | 1.000000 | 0.000000 | 0.000000 | 2017.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 2.000000 | 1.000000 | 0.000000 | 60.0 | 0.451389 | 0.000000 | 1.300000 | 4.000000 | 5.000000 | 20.000000 |
25% | 150.000000 | 4.000000 | 1.000000 | 2020.000000 | 0.000000 | 0.500000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 18.000000 | 15.000000 | 16.000000 | 60.0 | 10.600000 | 94.000000 | 14.000000 | 9.000000 | 20.000000 | 90.000000 |
50% | 300.000000 | 4.000000 | 4.000000 | 2021.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 30.000000 | 28.000000 | 21.000000 | 60.0 | 13.900000 | 180.000000 | 20.000000 | 12.000000 | 50.000000 | 100.000000 |
75% | 453.000000 | 4.000000 | 4.000000 | 2023.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | ... | 42.000000 | 40.000000 | 34.000000 | 60.0 | 17.285000 | 270.000000 | 33.000000 | 17.000000 | 90.000000 | 100.000000 |
max | 595.000000 | 4.000000 | 4.000000 | 2024.000000 | 1.000000 | 1.000000 | 3.000000 | 3.000000 | 1.000000 | 3.000000 | ... | 62.000000 | 60.000000 | 60.000000 | 60.0 | 100.000000 | 713.000000 | 240.000000 | 130.000000 | 100.000000 | 100.000000 |
8 rows × 24 columns
In [ ]:
bottle_df = bottle_df[:500]  # Limit to the first 500 rows for the regression calculation
bottle_df.head()
Out[ ]:
NumHuevo | NumPicho | |
---|---|---|
0 | 0.0 | 1.0 |
1 | 0.0 | 2.0 |
2 | 2.0 | 2.0 |
3 | 1.0 | 2.0 |
4 | 2.0 | 2.0 |
In [ ]:
# Extract the two columns 'NomCom' and 'Año' from the raw DataFrame 'bottle'
bottle_df = bottle[['NomCom','Año']]
# Keep the same column names (no-op rename)
bottle_df.columns = ['NomCom','Año']
In [ ]:
bottle_df = bottle_df[:500]  # Limit to the first 500 rows for the regression calculation
bottle_df.head()
Out[ ]:
NomCom | Año | |
---|---|---|
0 | Genizaro | 2019 |
1 | Panama | 2019 |
2 | Panama | 2019 |
3 | Jabillo | 2019 |
4 | Ojoche | 2019 |
In [ ]:
from matplotlib import pyplot as plt
import seaborn as sns
bottle_df.groupby('NomCom').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
In [ ]:
# @title Año
from matplotlib import pyplot as plt
bottle_df['Año'].plot(kind='line', figsize=(8, 4), title='Año')
plt.gca().spines[['top', 'right']].set_visible(False)
In [ ]:
# @title Año
from matplotlib import pyplot as plt
bottle_df['Año'].plot(kind='hist', bins=20, title='Año')
plt.gca().spines[['top', 'right',]].set_visible(False)
In [ ]:
# Extract the two columns 'Sitio' and 'Temporada' from the raw DataFrame 'bottle'
bottle_df = bottle[['Sitio','Temporada']]
# Keep the same column names (no-op rename)
bottle_df.columns = ['Sitio','Temporada']
In [ ]:
bottle_df = bottle_df[:500]  # Limit to the first 500 rows for the regression calculation
bottle_df.head()
Out[ ]:
Sitio | Temporada | |
---|---|---|
0 | Peña inculta | Oct-Ene |
1 | Peña inculta | Oct-Ene |
2 | Peña inculta | Oct-Ene |
3 | Peña inculta | Oct-Ene |
4 | Peña inculta | Oct-Ene |
In [ ]:
# @title Temporada
from matplotlib import pyplot as plt
import seaborn as sns
bottle_df.groupby('Temporada').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
plt.gca().spines[['top', 'right',]].set_visible(False)
Data¶
In [ ]:
# Separate the features and the target
X = df[['NumHuevo']]
y = df['NumPicho']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Separate the features and the target (note: this second pair overwrites the split above,
# so the model below is trained on Evid_Saque / Depredado)
X = df[['Evid_Saque']]
y = df['Depredado']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
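The Depredado target is strongly imbalanced (most nests fall into a single class, as the classification report below shows). A minimal sketch of a class-balance check, plus an optional stratified split; the stratify argument is an alternative, not what the cells below actually use:

# Sketch: inspect how imbalanced the target is
print(df['Depredado'].value_counts(normalize=True))

# Optional alternative: keep class proportions identical in train and test
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y)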
In [ ]:
# Create the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.94        97
           2       0.00      0.00      0.00        12
           3       1.00      1.00      1.00         1

    accuracy                           0.88       111
   macro avg       0.47      0.50      0.48       111
weighted avg       0.78      0.88      0.83       111

[[ 0  1  0  0]
 [ 0 97  0  0]
 [ 0 12  0  0]
 [ 0  0  0  1]]
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
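The UndefinedMetricWarning appears because classes 0 and 2 never receive any predictions. If desired, the behaviour can be made explicit with the zero_division parameter, e.g.:

# Sketch: same report, with an explicit value for metrics of never-predicted classes
print(classification_report(y_test, y_pred, zero_division=0))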
Scatter plot¶
In [ ]:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='NumHuevo', y='NumPicho', hue='Temporada', data=df)
plt.title('Num Huevos vs Num Pichones por Temporada')
plt.xlabel('NumHuevo')
plt.ylabel('NumPicho')
plt.legend()
plt.show()
In [ ]:
plt.figure(figsize=(15, 8))
sns.scatterplot(x='Evid_Saque', y='Depredado', hue='Temporada', data=df)
plt.title('Evidencia de Saqueo vs Depredados por Temporada')
plt.xlabel('Evid_Saque')
plt.ylabel('Depredado')
plt.legend()
plt.show()
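Because Evid_Saque and Depredado are discrete codes, many points in the scatter plot above overlap exactly. A hedged alternative sketch that counts the combinations instead:

# Sketch: how often each (Evid_Saque, Depredado) combination occurs
plt.figure(figsize=(8, 5))
sns.countplot(x='Evid_Saque', hue='Depredado', data=df)
plt.title('Evid_Saque vs Depredado (conteo)')
plt.show()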
Scatter plot 2¶
In [ ]:
# Separate the features and the targets
X = df[['NumPicho', 'NumHuevo']]
y_evid_saque = df['Evid_Saque']
y_depredado = df['Depredado']
# Encode the targets in case they are categorical
label_encoder_evid_saque = LabelEncoder()
label_encoder_depredado = LabelEncoder()
y_evid_saque = label_encoder_evid_saque.fit_transform(y_evid_saque)
y_depredado = label_encoder_depredado.fit_transform(y_depredado)
# Split the data into training and test sets (same random_state, so both splits use the same rows)
X_train, X_test, y_train_evid_saque, y_test_evid_saque = train_test_split(X, y_evid_saque, test_size=0.2, random_state=42)
X_train, X_test, y_train_depredado, y_test_depredado = train_test_split(X, y_depredado, test_size=0.2, random_state=42)
In [ ]:
# Handle missing values before training the model
X = df[['NumPicho', 'NumHuevo']].fillna({'NumPicho': 0, 'NumHuevo': 0})  # Replace NaN with 0 in 'NumPicho' and 'NumHuevo'
Y = df[['Evid_Saque', 'Depredado']].fillna({'Evid_Saque': "No", 'Depredado': "No"})  # Replace NaN with "No" in 'Evid_Saque' and 'Depredado'
In [ ]:
from sklearn.linear_model import LinearRegression

# Handle missing values in X_train and X_test
X_train[['NumPicho', 'NumHuevo']] = X_train[['NumPicho', 'NumHuevo']].fillna(0)  # Replace NaN with 0 in X_train
X_test[['NumPicho', 'NumHuevo']] = X_test[['NumPicho', 'NumHuevo']].fillna(0)  # Replace NaN with 0 in X_test
# Create the linear regression models
model_evid_saque = LinearRegression()
model_depredado = LinearRegression()
# Train the models
model_evid_saque.fit(X_train, y_train_evid_saque)
model_depredado.fit(X_train, y_train_depredado)
# Make predictions on the test set
y_pred_evid_saque = model_evid_saque.predict(X_test)
y_pred_depredado = model_depredado.predict(X_test)
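To quantify how well these two linear models fit the held-out data, their R² on the test targets can be computed, for example:

# Sketch: R^2 of each linear model on its test targets
from sklearn.metrics import r2_score

print('R2 Evid_Saque:', r2_score(y_test_evid_saque, y_pred_evid_saque))
print('R2 Depredado: ', r2_score(y_test_depredado, y_pred_depredado))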
In [ ]:
plt.figure(figsize=(5, 3))
plt.scatter(X_test['NumHuevo'], y_test_evid_saque, color='blue', label='Datos Reales')
plt.scatter(X_test['NumHuevo'], y_pred_evid_saque, color='orange', label='Predicciones')
plt.title('Regresión Lineal: Num de Huevos vs Evid_Saque')
plt.xlabel('NumHuevo')
plt.ylabel('Evid_Saque')
plt.legend()
plt.show()
plt.figure(figsize=(5, 3))
plt.scatter(X_test['NumPicho'], y_test_evid_saque, color='blue', label='Datos Reales')
plt.scatter(X_test['NumPicho'], y_pred_evid_saque, color='orange', label='Predicciones')
plt.title('Regresión Lineal: Evid_Saque vs Num de Pichones')
plt.xlabel('NumPicho')
plt.ylabel('Evid_Saque')
plt.legend()
plt.show()
Correlation plot¶
In [ ]:
# Select only the numeric columns for the correlation calculation
numeric_df = df.select_dtypes(include=['number'])
plt.figure(figsize=(10, 6))
correlation_matrix = numeric_df.corr()  # Correlation computed on numeric data only
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Mapa de Calor de Correlaciones')
plt.show()
In [ ]:
# Select the columns of interest
columnas_especificas = ['NumHuevo', 'NumPicho', 'Evid_Saque', 'Depredado']
df_especifico = df[columnas_especificas]
# Compute the correlation matrix
correlation_matrix = df_especifico.corr()
# Draw the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Mapa de Calor de Correlaciones (Columnas Específicas)')
plt.show()
Pie chart¶
In [ ]:
# Count the occurrences of each category in Evid_Saque
evid_saque_counts = df['Evid_Saque'].value_counts()
# Draw the pie chart with custom colors
colors = ['skyblue', 'pink']
plt.figure(figsize=(8, 8))
plt.pie(evid_saque_counts, labels=evid_saque_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribución de Evidencia de Saque')
plt.show()
In [ ]:
# Count the occurrences of each category in Depredado
depredado_counts = df['Depredado'].value_counts()
# Draw the pie chart
colors = ['skyblue', 'purple']
plt.figure(figsize=(8, 8))
plt.pie(depredado_counts, labels=depredado_counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
plt.title('Distribución de Depredado')
plt.show()
Linear regression model¶
In [ ]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
import operator
from google.colab import drive
drive.mount('/content/drive')
# Try to detect the delimiter automatically
df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=None, engine='python')
# If you know the delimiter, specify it directly
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=',')   # comma-separated
# df = pd.read_csv('/content/drive/MyDrive/DATOS_LNA.csv', sep=';')   # semicolon-separated
df.shape
bottle = df  # assumption: 'bottle' names the raw, unencoded DataFrame used by the cells above and below
bottle.head()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Out[ ]:
ID | CodArbol | NomCien | NomCom | EspePsitac | Sitio | Año | Temporada | MesEscal | Escalado | ... | ProfEntCavid | DAPArbol | Clima | Nubosidad | Nubosid% | Soleado | Solead% | Precip | Viento | ActivoAno | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | PSSG1 | Samanea saman | Genizaro | LNA | Peña inculta | 2019 | Oct-Ene | Nov | Si | ... | 74 | 324 | Soleado viento | NaN | NaN | Soleado | NaN | NaN | Poco | No |
1 | 2 | PSAP2 | Sterculia apetala | Panama | LNA | Peña inculta | 2019 | Oct-Ene | Dic | Si | ... | 85 | 13.2 | 50%nublado mucho viento | Nublado | 50.0 | NaN | NaN | NaN | Fuerte | Si |
2 | 4 | PSAP3 | Sterculia apetala | Panama | LNA | Peña inculta | 2019 | Oct-Ene | Nov | Si | ... | 80 | 11.2 | 30%nublado mucho viento | Nublado | 30.0 | NaN | NaN | NaN | Fuerte | Si |
3 | 5 | PHCJ4 | Hura crepitans | Jabillo | LNA | Peña inculta | 2019 | Oct-Ene | Dic | Si | ... | 70 | 4.6 | 20% nublado sin viento | Nublado | 20.0 | NaN | NaN | NaN | NaN | Si |
4 | 6 | PBAO5 | Brosimum alicastrum | Ojoche | LNA | Peña inculta | 2019 | Oct-Ene | Dic | Si | ... | 29 | 3.7 | Soleado | NaN | NaN | Soleado | NaN | NaN | NaN | Si |
5 rows × 52 columns
In [ ]:
bottle.describe()
Out[ ]:
ID | Año | ConCavi_M | Evid_Activ | PresePadr | NumHuevo | NumPicho | PichoMuerto | PichoVuelEx | PichoRehab | Edad1Dia | Edad2Dia | Edad3Dia | Edad4Dia | AltCavidad | DirGrado | AltEntCavid | AncEntCavid | Nubosid% | Solead% | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 553.000000 | 553.000000 | 533.000000 | 15.000000 | 472.000000 | 117.000000 | 258.000000 | 14.000000 | 202.000000 | 7.000000 | 183.000000 | 121.000000 | 49.000000 | 1.0 | 494.000000 | 382.000000 | 459.000000 | 473.000000 | 224.000000 | 85.000000 |
mean | 300.781193 | 2021.094033 | 0.120075 | 0.733333 | 0.792373 | 1.504274 | 1.906977 | 1.357143 | 1.891089 | 0.857143 | 29.628415 | 27.884298 | 24.285714 | 60.0 | 14.214730 | 178.378796 | 29.188017 | 14.375264 | 55.553571 | 85.470588 |
std | 173.578703 | 1.902092 | 0.325355 | 0.457738 | 0.406039 | 0.961536 | 0.803125 | 0.841897 | 0.796772 | 0.899735 | 15.321954 | 15.259527 | 13.604534 | NaN | 5.919842 | 106.788897 | 28.088252 | 9.143355 | 31.790186 | 25.208931 |
min | 1.000000 | 2017.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 0.000000 | 60.0 | 0.451389 | 0.000000 | 1.300000 | 4.000000 | 5.000000 | 20.000000 |
25% | 150.000000 | 2020.000000 | 0.000000 | 0.500000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 18.000000 | 15.000000 | 16.000000 | 60.0 | 10.600000 | 94.000000 | 14.000000 | 9.000000 | 20.000000 | 90.000000 |
50% | 300.000000 | 2021.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 2.000000 | 1.000000 | 30.000000 | 28.000000 | 21.000000 | 60.0 | 13.900000 | 180.000000 | 20.000000 | 12.000000 | 50.000000 | 100.000000 |
75% | 453.000000 | 2023.000000 | 0.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 1.750000 | 2.000000 | 1.500000 | 42.000000 | 40.000000 | 34.000000 | 60.0 | 17.285000 | 270.000000 | 33.000000 | 17.000000 | 90.000000 | 100.000000 |
max | 595.000000 | 2024.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 | 4.000000 | 3.000000 | 4.000000 | 2.000000 | 62.000000 | 60.000000 | 60.000000 | 60.0 | 100.000000 | 713.000000 | 240.000000 | 130.000000 | 100.000000 | 100.000000 |
In [ ]:
# Extract the two columns 'NumHuevo' and 'NumPicho'
bottle_df = bottle[['NumHuevo','NumPicho']]
# Keep the same column names (no-op rename)
bottle_df.columns = ['NumHuevo', 'NumPicho']
In [ ]:
bottle_df = bottle_df[:500]
bottle_df.head()
Out[ ]:
NumHuevo | NumPicho | |
---|---|---|
0 | NaN | 1.0 |
1 | NaN | 2.0 |
2 | 2.0 | NaN |
3 | 1.0 | NaN |
4 | 2.0 | 2.0 |
In [ ]:
# Visualize the relationship with a scatter/regression plot
# sns.lmplot(x="NumHuevo", y="NumPicho", data=bottle_df, order=2, ci=None);
sns.pairplot(bottle_df, kind="reg")
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7b1021d40d00>
In [ ]:
# see how many null values we have
bottle_df.isnull().sum()
Out[ ]:
0 | |
---|---|
NumHuevo | 391 |
NumPicho | 266 |
In [ ]:
# Fill missing values by carrying the last valid observation forward
# (fillna(method='ffill') is deprecated in recent pandas, hence the warning below)
bottle_df.fillna(method='ffill', inplace=True)
#bottle_df.isnull().sum()
<ipython-input-196-5497b30dd99e>:3: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  bottle_df.fillna(method='ffill', inplace=True)
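As the FutureWarning says, the non-deprecated equivalent is DataFrame.ffill(); a one-line sketch:

# Sketch: forward-fill without the deprecated method= argument
bottle_df = bottle_df.ffill()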
In [ ]:
# Choose the feature (X) and target (y) arrays
X = np.array(bottle_df['NumHuevo']).reshape(-1, 1)
y = np.array(bottle_df['NumPicho']).reshape(-1, 1)
In [ ]:
# Split the data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
Linear regression plot¶
In [ ]:
# Fill missing values by carrying the last valid observation forward
bottle_df.fillna(method='ffill', inplace=True)
# Choose the feature (X) and target (y) arrays
X = np.array(bottle_df['NumHuevo']).reshape(-1, 1)
y = np.array(bottle_df['NumPicho']).reshape(-1, 1)
# Split the data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# Check for NaNs in the training and testing sets and replace them with 0
X_train = np.nan_to_num(X_train, nan=0)
y_train = np.nan_to_num(y_train, nan=0)
X_test = np.nan_to_num(X_test, nan=0)
y_test = np.nan_to_num(y_test, nan=0)
#Fit the model
from sklearn.linear_model import LinearRegression
lin_df = LinearRegression()
lin_df.fit(X_train, y_train)
<ipython-input-207-52cba31ba30a>:2: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.
  bottle_df.fillna(method='ffill', inplace=True)
Out[ ]:
LinearRegression()
In [ ]:
y_pred = lin_df.predict(X_test)  # Predict with the linear model
accuracy_score = lin_df.score(X_test, y_test)  # For a regressor, .score() returns R² (not classification accuracy)
print("Linear Regression Model Accuracy Score: " + "{:.1%}".format(accuracy_score))
Linear Regression Model Accuracy Score: -0.5%
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
print("R2 Score: " + "{:.3}".format(r2_score(y_test, y_pred)))
R2 Score: -0.00496
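Besides R², error metrics on the same test predictions are often easier to read on the original scale (number of chicks); a minimal sketch:

# Sketch: absolute and root-mean-squared error of the linear model on the test split
from sklearn.metrics import mean_absolute_error

print("MAE:  {:.3f}".format(mean_absolute_error(y_test, y_pred)))
print("RMSE: {:.3f}".format(np.sqrt(mean_squared_error(y_test, y_pred))))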
In [ ]:
# Plot the fitted model against the test data
plt.scatter(X_test, y_test, color='Skyblue')
plt.plot(X_test, y_pred, color='purple')
plt.show()
Polynomial regression¶
In [ ]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the single feature into degree-4 polynomial terms and fit a linear model on them
poly_df = PolynomialFeatures(degree=4)
transform_poly = poly_df.fit_transform(X_train)
linreg2 = LinearRegression()
linreg2.fit(transform_poly, y_train)
polynomial_predict = linreg2.predict(transform_poly)
In [ ]:
rmse = np.sqrt(mean_squared_error(y_train, polynomial_predict))
r2 = r2_score(y_train, polynomial_predict)
print("RMSE Score for the training set: " + "{:.2}".format(rmse))
print("R2 Score for the training set: " + "{:.2}".format(r2))
RMSE Score for the training set: 0.81
R2 Score for the training set: 0.024
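The RMSE and R² above are computed on the training data. A minimal sketch of the same metrics on the held-out test set, reusing the fitted poly_df and linreg2 objects:

# Sketch: evaluate the degree-4 polynomial fit on the test split
transform_poly_test = poly_df.transform(X_test)
poly_pred_test = linreg2.predict(transform_poly_test)
print("RMSE (test): {:.2f}".format(np.sqrt(mean_squared_error(y_test, poly_pred_test))))
print("R2 (test):   {:.3f}".format(r2_score(y_test, poly_pred_test)))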
In [ ]:
plt.scatter(X_train, y_train, s=50)
# sort the values of x before line plot
sort_axis = operator.itemgetter(0)
sorted_zip = sorted(zip(X_train,polynomial_predict), key=sort_axis)
X_train, polynomial_predict = zip(*sorted_zip)
plt.plot(X_train, polynomial_predict, color='m')
plt.show()
In [ ]: