!pip install numpy
!pip install pandas
!pip install seaborn
!pip install matplotlib

Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.25.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.0.3)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.4)
Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)
Requirement already satisfied: numpy>=1.21.0 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.25.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: seaborn in /usr/local/lib/python3.10/dist-packages (0.13.1)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /usr/local/lib/python3.10/dist-packages (from seaborn) (1.25.2)
Requirement already satisfied: pandas>=1.2 in /usr/local/lib/python3.10/dist-packages (from seaborn) (2.0.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /usr/local/lib/python3.10/dist-packages (from seaborn) (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.53.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.0)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.2->seaborn) (2023.4)
Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.2->seaborn) (2024.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.53.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.5)
Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.25.2)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Mount Google Drive; the data files read below live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

### Data loading ###

# All input files sit in the same Drive folder.
DATA_DIR = "/content/drive/MyDrive/Proyecto curso python"

## Sample metadata table
dataset = f"{DATA_DIR}/SraRunTable_CEU.csv"

df = pd.read_csv(dataset)
df

## Eigenvectors for the PCA (space-separated file without a header row)
eigenvec = f"{DATA_DIR}/CEU-2m-pruned_topca.eigenvec"
pca_data = pd.read_csv(eigenvec, sep=" ", header=None)

## Second data set (the one that carries the 'Usage' column)
dataset2 = f"{DATA_DIR}/41467_2019_9135_MOESM4_ESM.xlsx"
df2 = pd.read_excel(dataset2)

## Preview the three data frames
for frame in (df, pca_data, df2):
    print(frame.head())

          Run Assay Type  AvgSpotLen          Bases   BioProject  \
0          S1        WGS         300  not collected          NaN   
1          S2        WGS         300  not collected          NaN   
2          S3        WGS         300  not collected          NaN   
3          S4        WGS         300  not collected          NaN   
4  SRR5891597        WGS         300     7284698100  PRJNA393611   

      BioSample BioSampleModel         Bytes                     Center Name  \
0           NaN          Plant           NaN                       CINVESTAV   
1           NaN          Plant           NaN                       CINVESTAV   
2           NaN          Plant           NaN                       CINVESTAV   
3           NaN          Plant           NaN                       CINVESTAV   
4  SAMN07341438          Plant  2.808682e+09  YUNNAN AGRICULTURAL UNIVERSITY   

  Consent  ... LibrarySource Number        Organism  Platform  \
0  public  ...       GENOMIC  10231  Vitis vinifera  ILLUMINA   
1  public  ...       GENOMIC  10232  Vitis vinifera  ILLUMINA   
2  public  ...       GENOMIC  10233  Vitis vinifera  ILLUMINA   
3  public  ...       GENOMIC  10234  Vitis vinifera  ILLUMINA   
4  public  ...       GENOMIC  10235  Vitis vinifera  ILLUMINA   

            ReleaseDate Sample Name  SRA Study Tissue           create_date  \
0  2020-03-12T00:00:00Z         E12        NaN   leaf  2023-03-05T09:00:00Z   
1  2020-03-12T00:00:00Z        E11A        NaN   leaf  2023-03-05T09:00:00Z   
2  2020-03-12T00:00:00Z          E1        NaN   leaf  2023-03-05T09:00:00Z   
3  2020-03-12T00:00:00Z         E15        NaN   leaf  2023-03-05T09:00:00Z   
4  2019-03-22T00:00:00Z      TA-298  SRP114740   leaf  2017-08-03T05:56:00Z   

  version  
0       1  
1       1  
2       1  
3       1  
4       1  

[5 rows x 33 columns]
      0      1         2         3         4         5         6         7   \
0    E12    E12 -0.014686 -0.016190  0.053925 -0.074761 -0.014066 -0.070570   
1   E11A   E11A -0.014524 -0.015733  0.054707 -0.076237 -0.014130 -0.071402   
2     E1     E1 -0.011885 -0.027222  0.073832 -0.020151  0.062249 -0.057693   
3    E15    E15 -0.014885 -0.023335  0.060087 -0.004371  0.019015 -0.029426   
4  TA-29  TA-29  0.044947 -0.065460  0.070841 -0.055787 -0.104528  0.265947   

         8         9   ...        12        13        14        15        16  \
0 -0.001969 -0.043521  ...  0.003277 -0.092974 -0.006528 -0.002712 -0.068238   
1 -0.002164 -0.043265  ...  0.003843 -0.094429 -0.009198 -0.004783 -0.068545   
2 -0.002218 -0.077083  ...  0.059356 -0.056765 -0.021118  0.022149 -0.038081   
3  0.001844 -0.074204  ...  0.013128 -0.005396 -0.008955  0.007440 -0.020285   
4  0.002019 -0.030907  ... -0.141452  0.167367 -0.139908  0.324391  0.077352   

         17        18        19        20        21  
0  0.074481  0.059498 -0.001976 -0.061126 -0.166349  
1  0.074812  0.060030 -0.001830 -0.060826 -0.167435  
2  0.049547  0.063412 -0.034344 -0.080176 -0.112781  
3  0.060897  0.003371 -0.046752 -0.030797 -0.011192  
4  0.100560  0.076298 -0.233575  0.158972  0.071994  

[5 rows x 22 columns]
  Database ID\n(U: US-National Plant Germplasm System\nV: VitisInternational Variety Catalogue)  \
0                                                NaN                                              
1                                                NaN                                              
2                                                NaN                                              
3                                                NaN                                              
4                                            V. 2624                                              

       Latin Name Cultivar Name\n(Common Name) Sample ID  \
0  Vitis vinifera                Listan Prieto        E1   
1  Vitis vinifera                Rosa del Peru      E11A   
2  Vitis vinifera                Rosa del Peru       E12   
3  Vitis vinifera                     Palomino       E15   
4  Vitis vinifera               Christmas Rose     TA-10   

  Taxonomic Comment \nAccording to \nPhylogenetic Tree  Country of Origin  \
0                                                NaN               Mexico   
1                                                NaN               Mexico   
2                                                NaN               Mexico   
3                                                NaN               Mexico   
4                                                NaN        United States   

  Cultivation\n(W: Wild;  C: Cultivated)  Usage Genetic Classification  \
0                                      C   Wine                    NaN   
1                                      C   Wine                    NaN   
2                                      C   Wine                    NaN   
3                                      C   Wine                    NaN   
4                                      C  Table   Intraspecific Hybrid   

                      Parent 1                          Parent 2 Ploidy\n(N)  \
0                          NaN                               NaN           2   
1                          NaN                               NaN           2   
2                          NaN                               NaN           2   
3                          NaN                               NaN           2   
4  (Hunisa X Emperor) X Nocera  (Hunisa X Emperor) X Pirovano 75           2   

  Fruit Skin Color  Sex of Flower  Berry Weight\n(g)  Sample Source Note  
0            Rouge  Hermaphrodite                NaN         Mexico  NaN  
1            Rouge  Hermaphrodite                NaN         Mexico  NaN  
2            Rouge  Hermaphrodite                NaN         Mexico  NaN  
3            Blanc  Hermaphrodite                NaN         Mexico  NaN  
4            Rouge  Hermaphrodite              8.648  United States  NaN

## Columns of interest
columnas = ['Sample Name', 'BioSample', 'Center Name', 'Cultivar', 'geo_loc_name_country',
            'geo_loc_name_country_continent', 'Number', 'Organism', 'Tissue']
## Columns of df before the selection
columnas_antes = df.columns
print(columnas_antes)

## Columns of df after the selection
# .copy() makes df an independent frame, so later column assignments on it
# do not raise pandas' SettingWithCopyWarning.
df = df[columnas].copy()
print(df.columns)

Index(['Run', 'Assay Type', 'AvgSpotLen', 'Bases', 'BioProject', 'BioSample',
       'BioSampleModel', 'Bytes', 'Center Name', 'Consent', 'Cultivar',
       'DATASTORE filetype', 'DATASTORE provider', 'DATASTORE region',
       'dev_stage', 'Experiment', 'geo_loc_name_country',
       'geo_loc_name_country_continent', 'geo_loc_name', 'Instrument',
       'Library Name', 'LibraryLayout', 'LibrarySelection', 'LibrarySource',
       'Number', 'Organism', 'Platform', 'ReleaseDate', 'Sample Name',
       'SRA Study', 'Tissue', 'create_date', 'version'],
      dtype='object')
Index(['Sample Name', 'BioSample', 'Center Name', 'Cultivar',
       'geo_loc_name_country', 'geo_loc_name_country_continent', 'Number',
       'Organism', 'Tissue'],
      dtype='object')

# Add a "Usage" column to df by looking up each 'Sample Name' in df2's
# 'Sample ID' column. Samples with no match, or with a missing Usage value,
# are labelled "Unknown".
#
# A single vectorised lookup replaces the former row-by-row loop, which
# indexed with df.loc[i] for i in range(len(df)) and therefore only worked
# while df still had its default 0..n-1 index.
usage_by_sample = (
    df2.dropna(subset=["Sample ID"])                      # a NaN key must never match
    .drop_duplicates(subset="Sample ID", keep="first")    # first match wins, as before
    .set_index("Sample ID")["Usage"]
)
df["Usage"] = df["Sample Name"].map(usage_by_sample).fillna("Unknown").astype(str)

fields = ["Sample Name", "Usage"]
df.loc[:,fields]

df["Usage"].head(20)

0                 Unknown
1                 Unknown
2                 Unknown
3                 Unknown
4                   Table
5                    Wine
6                   Table
7                    Wine
8                   Table
9                   Table
10                  Table
11            Wine, Table
12            Wine, Table
13                   Wine
14                  Table
15          Table, Raisin
16                  Table
17                  Table
18          Table, Raisin
19    Wine, Table, Raisin
Name: Usage, dtype: object

# Remove the samples whose country of origin is recorded as "uncalculated",
# printing the per-country counts before and after the filter.
# value_counts must be *called*; printing the bare attribute only shows the
# bound-method repr instead of the counts.
print(df["geo_loc_name_country"].value_counts())
condicion = df["geo_loc_name_country"] == "uncalculated"

# ~ is the boolean-mask negation operator; .copy() keeps df an independent
# frame so later assignments on it do not raise SettingWithCopyWarning.
df = df[~condicion].copy()

print(df["geo_loc_name_country"].value_counts())

<bound method IndexOpsMixin.value_counts of 0            Mexico
1            Mexico
2            Mexico
3            Mexico
4             Japan
           ...     
219           Japan
220    uncalculated
221    uncalculated
222    uncalculated
223           Japan
Name: geo_loc_name_country, Length: 224, dtype: object>
<bound method IndexOpsMixin.value_counts of 0       Mexico
1       Mexico
2       Mexico
3       Mexico
4        Japan
        ...   
216    Lebanon
217    Moldova
218    Moldova
219      Japan
223      Japan
Name: geo_loc_name_country, Length: 129, dtype: object>

# Build the column names for the principal components: PC1, PC2, ...
# Two of pca_data's columns are not principal components, so they are
# excluded from the count.
n_components = len(pca_data.columns) - 2
PC_columns = [f"PC{number}" for number in range(1, n_components + 1)]
PC_columns

['PC1',
 'PC2',
 'PC3',
 'PC4',
 'PC5',
 'PC6',
 'PC7',
 'PC8',
 'PC9',
 'PC10',
 'PC11',
 'PC12',
 'PC13',
 'PC14',
 'PC15',
 'PC16',
 'PC17',
 'PC18',
 'PC19',
 'PC20']

# Name the columns: the two identifier columns first, then the PC columns.
pca_data.columns = ['Sample Name', 'Sample ID', *PC_columns]
pca_data.columns
pca_data

## Attach the country column from df, joining on the shared 'Sample Name' column.
## No `how` is given, so pandas' default join type applies.
country_cols = df[['Sample Name', 'geo_loc_name_country']]
pca_data = pca_data.merge(country_cols, on='Sample Name')
pca_data

## Attach the Usage column from df2, joining on the shared 'Sample ID' column
usage_cols = df2[['Sample ID', 'Usage']]
pca_data = pca_data.merge(usage_cols, on='Sample ID')
pca_data

# Country groupings, one list of country names per region code
mex = ['Mexico']
nor_am = ['USA']
sou_am = ['Brazil']
eur_occ = ['United Kingdom', 'Spain', 'France', 'Italy', 'Switzerland', 'Germany']
eur_cen = ['Albania', 'Austria', 'Bulgaria', 'Georgia', 'Greece', 'Hungary', 'Moldova', 'Romania', 'Russia', 'Turkey']
africa = ['Algeria']
asia_or = ['Japan', 'Korea', 'China', 'Uzbekistan', 'Azerbaijan', 'Armenia', 'Lebanon']


# Categorisation function
def asignar_region(fila):
    """Return the region code for a row, based on its country.

    Args:
        fila: A mapping-like row (e.g. a DataFrame row) exposing the key
            'geo_loc_name_country'.

    Returns:
        One of "eur_occ", "eur_cen", "asia_or", "nor_am", "sou_am",
        "africa" or "mex", or None when the country is in none of the
        groupings.
    """
    pais = fila['geo_loc_name_country']
    # Membership tests cover every entry of each list. The previous index
    # loops stopped at len(lista) - 1 and so never matched the last country
    # of eur_occ ('Germany') or eur_cen ('Turkey').
    regiones = (
        ("eur_occ", eur_occ),
        ("eur_cen", eur_cen),
        ("asia_or", asia_or),
        ("nor_am", nor_am),
        ("sou_am", sou_am),
        ("africa", africa),
        ("mex", mex),
    )
    for codigo, paises in regiones:
        if pais in paises:
            return codigo
    return None

# Run asignar_region on every row and store the result in a new "region" column
pca_data["region"] = pca_data.apply(asignar_region, axis=1)

# Result
pca_data[['Sample Name','region']]

## Remove the rows that got no region because their country matched no grouping
region_NAindex = pca_data.index[pca_data['region'].isna()]
pca_data = pca_data.drop(index=region_NAindex)
pca_data[['Sample Name','region']]

# The 'Sample Name' column is no longer needed; 'Sample ID' stays as identifier
pca_data = pca_data.drop(columns="Sample Name")
pca_data.columns

Index(['Sample ID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8',
       'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17',
       'PC18', 'PC19', 'PC20', 'geo_loc_name_country', 'Usage', 'region'],
      dtype='object')

# Clustering
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt

# Compute the squared error (distortion, KMeans' inertia_) for several values of k
pca_data.dropna(inplace=True)
non_feature_cols = ["Sample ID","region","geo_loc_name_country","Usage"]
data2 = pca_data.drop(columns=non_feature_cols).to_numpy()

distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, init='random', n_init=10,
                max_iter=300, tol=1e-04, random_state=0)
    # fit() returns the fitted estimator, so inertia_ can be read right away
    distortions.append(km.fit(data2).inertia_)

# Plot distortion against the number of clusters
plt.plot(range(1, 11), distortions, marker='o')
plt.ylabel('Distorción')
plt.xlabel('Número de agrupamientos')
plt.show()

# Number of samples per Usage category
frequency = df["Usage"].value_counts()

# Bar chart: one bar per Usage category
plt.figure(figsize=(15, 5))
plt.bar(x=frequency.index, height=frequency.values, color='lightblue')
plt.title('Uso de la Uva')
plt.ylabel('Frecuencia')
plt.xlabel('Usage')
plt.show()

# Number of samples per region
frequency = pca_data["region"].value_counts()

# Display label for each region code. Labels are looked up per code instead
# of being listed positionally: value_counts() orders by count and only
# contains the regions actually present, so a fixed 7-item list could label
# bars wrongly or fail when fewer regions remain.
region_labels = {
    "asia_or": "Asia Oriental",
    "eur_occ": "Europa occidental",
    "eur_cen": "Europa central",
    "nor_am": "Norte América",
    "sou_am": "Sur América",
    "africa": "África",
    "mex": "México",
}
bar_labels = [region_labels.get(code, code) for code in frequency.index]

# Create bar plot
plt.figure(figsize=(15, 5))
plt.bar(bar_labels, frequency.values, color='green')
plt.xlabel('Continente')
plt.ylabel('Frecuencia')
plt.title('Origen de la Uva')
plt.show()

# Spell out "USA" in full, then count the samples per country
con = df["geo_loc_name_country"].eq("USA")
df.loc[con, "geo_loc_name_country"] = "United States of America"
frequency = df["geo_loc_name_country"].value_counts()
frequency

geo_loc_name_country
China                       29
Japan                       24
United States of America    15
France                      15
Italy                        5
Romania                      4
Hungary                      3
Greece                       3
Uzbekistan                   3
Germany                      3
Georgia                      3
Lebanon                      2
Moldova                      2
Armenia                      2
Azerbaijan                   2
Spain                        1
Switzerland                  1
Brazil                       1
Russia                       1
United Kingdom               1
Bulgaria                     1
Algeria                      1
Albania                      1
Austria                      1
Turkey                       1
Name: count, dtype: int64

import geopandas as gpd

# Draw a world map coloured by the number of samples per continent.

# Two-column table (country name, sample count) built from `frequency`
data = {
    'Country': frequency.index,
    'Frequency': frequency.values
}
data = pd.DataFrame(data)

# Load world shapefile from geopandas datasets
# NOTE(review): this call emits a FutureWarning saying geopandas.dataset is
# deprecated and will be removed in GeoPandas 1.0.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Merge your data with the world GeoDataFrame
# Left join on the map's 'name' column: every map row is kept.
world = world.merge(data, how='left', left_on='name', right_on='Country')

# Aggregate data by continent
continent_freq = world.groupby('continent').agg({'Frequency': 'sum'}).reset_index()

# Country outlines, used as the base layer of the figure
# NOTE(review): drawn before the Antarctica filter below, so the outline
# layer still includes Antarctica - confirm whether that is intended.
base = world.boundary.plot(edgecolor='gray', figsize=(20, 15))

# Merge the continent frequency data back to the world dataframe for mapping
# The continent total is stored in the new column 'Frequency_continent'.
world = world.merge(continent_freq, on='continent', suffixes=('', '_continent'))

# Exclude Antarctica from the coloured layer
world = world[world['continent'] != "Antarctica"]

# Plot
world.plot(column='Frequency_continent', ax=base, legend=True,
           legend_kwds={'label': "Frequency by Continent"},
           cmap='OrRd')  # Color map can be changed according to preference

plt.title('Continent Frequency Map')
plt.show()

<ipython-input-117-17dd33b5ead8>:10: FutureWarning: The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.
  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# PC1 vs PC2, points coloured by region
colors_PC23 = [
    "#3366CC", "#DC3912", "#FF9900", "#109618",
    "#990099", "#0099C6", "#DD4477",
]
plt.figure(figsize=(12, 6))
sns.scatterplot(x="PC1", y="PC2", hue="region", palette=colors_PC23, data=pca_data)

<Axes: xlabel='PC1', ylabel='PC2'>

# PC2 vs PC3, points coloured by region (with a different colour order)
colors_PC23 = [
    "#990099", "#DC3912", "#DE9ED6", "#FF9900",
    "#0099C6", "#3366CC", "#109618",
]
plt.figure(figsize=(12, 6))
pca_plot = sns.scatterplot(x="PC2", y="PC3", hue="region", palette=colors_PC23, data=pca_data)

# PC1 vs PC2, points coloured by Usage (default palette)
plt.figure(figsize=(12, 6))
sns.scatterplot(x="PC1", y="PC2", hue="Usage", data=pca_data)

<Axes: xlabel='PC1', ylabel='PC2'>

# PC1 vs PC3, points coloured by Usage (default palette)
plt.figure(figsize=(12, 6))
sns.scatterplot(x="PC1", y="PC3", hue="Usage", data=pca_data)

<Axes: xlabel='PC1', ylabel='PC3'>

# Cluster generation

# Complete-linkage hierarchical clustering of the PC matrix, flattened into
# at most max_clusters clusters.
Z = linkage(data2, method='complete')
max_clusters = 6
clusters = fcluster(Z, max_clusters, criterion='maxclust')
plt.figure(figsize=(10, 7))
dendrogram(Z)

# Add a horizontal line showing where the tree is cut.
# The last rows of Z are the highest merges: Z[-(max_clusters-1)] is the merge
# that would reduce max_clusters clusters to max_clusters-1, and
# Z[-max_clusters] is the one just below it. The cut that yields max_clusters
# clusters lies between those two heights, so the line is drawn at their
# midpoint rather than exactly on top of a merge.
# Requires Z to have at least max_clusters rows (more samples than clusters).
max_d = (Z[-max_clusters, 2] + Z[-(max_clusters - 1), 2]) / 2
plt.axhline(y=max_d, c='k')

plt.title("Dendrograma con corte en {} clusters".format(max_clusters))
plt.xlabel("Muestras")
plt.ylabel("Distancia")
plt.show()

# Store each sample's cluster assignment next to its metadata
pca_data["cluster"] = clusters

# Cross-tabulate cluster membership against region
cross_tab = pd.crosstab(pca_data['cluster'], pca_data['region'])

# Show the cross table
print(cross_tab)

region   africa  asia_or  eur_cen  eur_occ  nor_am  sou_am
cluster                                                   
1             0        2        0        0       0       0
2             1       58       18       22      15       1
3             0        1        0        0       0       0
4             0        1        0        0       0       0
5             0        0        0        1       0       0
6             0        0        1        0       0       0

Análisis de cuatro variedades mexicanas de Vitis vinifera y su relación con variedades de vid cultivadas en Europa, Asia y Norte América

Integrantes:

Descripción y justificación

Antecedentes

Descripción del problema y objetivos

Descripción del conjunto de datos

Procesamiento y visualización de los datos

Instalación de librerías

Importar los datos

Procesamiento set de datos

Añadir columna "Usage"

Eliminar filas donde la columna 'country' tiene valores "uncalculated".

Procesamiento datos para PCA

Procesamiento datos para K-means a partir de variables del PCA

Resultados

Gráficos descriptivos de los datasets utilizados

Uso de las Uvas analizadas

Origen de la Uva Analizada

Mapa con las frecuencias de las uvas según el país de origen

Principal Component Analysis

Resultados del K-means

Conclusiones

Referencias

	Sample Name	Sample ID	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	...	PC11	PC12	PC13	PC14	PC15	PC16	PC17	PC18	PC19	PC20
0	E12	E12	-0.014686	-0.016190	0.053925	-0.074761	-0.014066	-0.070570	-0.001969	-0.043521	...	0.003277	-0.092974	-0.006528	-0.002712	-0.068238	0.074481	0.059498	-0.001976	-0.061126	-0.166349
1	E11A	E11A	-0.014524	-0.015733	0.054707	-0.076237	-0.014130	-0.071402	-0.002164	-0.043265	...	0.003843	-0.094429	-0.009198	-0.004783	-0.068545	0.074812	0.060030	-0.001830	-0.060826	-0.167435
2	E1	E1	-0.011885	-0.027222	0.073832	-0.020151	0.062249	-0.057693	-0.002218	-0.077083	...	0.059356	-0.056765	-0.021118	0.022149	-0.038081	0.049547	0.063412	-0.034344	-0.080176	-0.112781
3	E15	E15	-0.014885	-0.023335	0.060087	-0.004371	0.019015	-0.029426	0.001844	-0.074204	...	0.013128	-0.005396	-0.008955	0.007440	-0.020285	0.060897	0.003371	-0.046752	-0.030797	-0.011192
4	TA-29	TA-29	0.044947	-0.065460	0.070841	-0.055787	-0.104528	0.265947	0.002019	-0.030907	...	-0.141452	0.167367	-0.139908	0.324391	0.077352	0.100560	0.076298	-0.233575	0.158972	0.071994
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
219	TA-283	TA-283	-0.017452	-0.016879	0.036911	-0.088469	-0.053740	0.047101	-0.000239	-0.009757	...	-0.017475	-0.022053	0.013440	0.009233	-0.030485	0.052002	-0.022209	0.049497	0.012616	-0.075352
220	TA-209	TA-209	-0.025009	0.012676	-0.037408	-0.002282	-0.065865	-0.003752	-0.014067	0.075457	...	-0.011797	-0.045304	-0.083485	0.083388	0.011899	-0.047824	-0.032725	0.019612	0.007137	-0.003353
221	TA-204	TA-204	-0.020803	0.007552	-0.035712	0.015591	-0.085044	-0.007878	-0.012688	0.084355	...	-0.025399	-0.033390	-0.060646	0.097038	0.014038	-0.081245	-0.044973	0.037149	-0.005514	0.022293
222	TA-205	TA-205	-0.019320	0.004735	-0.035951	0.006743	-0.088148	0.001385	-0.007459	0.077632	...	-0.018559	-0.034754	-0.052400	0.100654	0.013964	-0.063865	-0.048938	0.044306	-0.003964	0.005114
223	TA-88	TA-88	-0.015556	-0.012067	0.029640	-0.100086	-0.015574	0.063558	0.002726	0.001665	...	-0.064565	-0.032204	0.044884	-0.019625	-0.070008	-0.024771	-0.028793	0.079466	0.078044	-0.031704

	Sample Name	region
0	E12	mex
1	E11A	mex
2	E1	mex
3	E15	mex
4	TA-29	asia_or
...	...	...
219	TA-283	asia_or
220	TA-209	None
221	TA-204	None
222	TA-205	None
223	TA-88	asia_or