diff --git a/codigo/Tarea8.ipynb b/codigo/Tarea8.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..37d038018e26cc802361b74a8660fa9d945da43d --- /dev/null +++ b/codigo/Tarea8.ipynb @@ -0,0 +1,559 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook de datos\n", + "# Tarea Clase 8\n", + "## @britod" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Importamos las librerÃas básicas\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creamos el camini hacia los datos:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "path_data = './data/data_surveys.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>record_id</th>\n", + " <th>month</th>\n", + " <th>day</th>\n", + " <th>year</th>\n", + " <th>plot_id</th>\n", + " <th>species_id</th>\n", + " <th>sex</th>\n", + " <th>hindfoot_length</th>\n", + " <th>weight</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>7</td>\n", + " <td>16</td>\n", + " <td>1977</td>\n", + " <td>2</td>\n", + " <td>NL</td>\n", + " <td>M</td>\n", + " <td>32.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " 
<td>7</td>\n", + " <td>16</td>\n", + " <td>1977</td>\n", + " <td>3</td>\n", + " <td>NL</td>\n", + " <td>M</td>\n", + " <td>33.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>7</td>\n", + " <td>16</td>\n", + " <td>1977</td>\n", + " <td>2</td>\n", + " <td>DM</td>\n", + " <td>F</td>\n", + " <td>37.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>7</td>\n", + " <td>16</td>\n", + " <td>1977</td>\n", + " <td>7</td>\n", + " <td>DM</td>\n", + " <td>M</td>\n", + " <td>36.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " <td>7</td>\n", + " <td>16</td>\n", + " <td>1977</td>\n", + " <td>3</td>\n", + " <td>DM</td>\n", + " <td>M</td>\n", + " <td>35.0</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35544</td>\n", + " <td>35545</td>\n", + " <td>12</td>\n", + " <td>31</td>\n", + " <td>2002</td>\n", + " <td>15</td>\n", + " <td>AH</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35545</td>\n", + " <td>35546</td>\n", + " <td>12</td>\n", + " <td>31</td>\n", + " <td>2002</td>\n", + " <td>15</td>\n", + " <td>AH</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35546</td>\n", + " <td>35547</td>\n", + " <td>12</td>\n", + " <td>31</td>\n", + " <td>2002</td>\n", + " <td>10</td>\n", + " <td>RM</td>\n", + " <td>F</td>\n", + " <td>15.0</td>\n", + " <td>14.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35547</td>\n", + " <td>35548</td>\n", + " <td>12</td>\n", + " <td>31</td>\n", + " <td>2002</td>\n", + " <td>7</td>\n", + " <td>DO</td>\n", + " <td>M</td>\n", + " 
<td>36.0</td>\n", + " <td>51.0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>35548</td>\n", + " <td>35549</td>\n", + " <td>12</td>\n", + " <td>31</td>\n", + " <td>2002</td>\n", + " <td>5</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>35549 rows × 9 columns</p>\n", + "</div>" + ], + "text/plain": [ + " record_id month day year plot_id species_id sex hindfoot_length \\\n", + "0 1 7 16 1977 2 NL M 32.0 \n", + "1 2 7 16 1977 3 NL M 33.0 \n", + "2 3 7 16 1977 2 DM F 37.0 \n", + "3 4 7 16 1977 7 DM M 36.0 \n", + "4 5 7 16 1977 3 DM M 35.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "35544 35545 12 31 2002 15 AH NaN NaN \n", + "35545 35546 12 31 2002 15 AH NaN NaN \n", + "35546 35547 12 31 2002 10 RM F 15.0 \n", + "35547 35548 12 31 2002 7 DO M 36.0 \n", + "35548 35549 12 31 2002 5 NaN NaN NaN \n", + "\n", + " weight \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... 
def mean_values_hw(data_frame):
    """Print and return the mean weight and mean hindfoot length.

    Missing values (NaN) in either column are ignored, so each mean is
    the sum of the non-null entries divided by the number of non-null
    entries.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table with numeric ``'weight'`` and ``'hindfoot_length'`` columns.
        Any index is accepted (it does not have to be 0..n-1).

    Returns
    -------
    tuple of float, or None
        ``(mean_weight, mean_hindfoot_length)`` for a DataFrame input.
        A mean is ``nan`` when its column has no non-null value.
        ``None`` when the argument is not a DataFrame (a message asking
        for a DataFrame is printed instead of computing anything).

    Raises
    ------
    KeyError
        If the DataFrame lacks one of the two required columns.
    """
    # Validate first and stop: computing on a non-DataFrame can only fail.
    if not isinstance(data_frame, pd.DataFrame):
        print('Por favor ingresar un data frame')
        print('Empieza otra vez.')
        return None
    print('OK, Data frame aceptado')

    # Series.mean() skips NaN by default, which is exactly
    # sum / (rows - null rows) without a per-row Python loop and without
    # depending on the index labels being 0..n-1.
    mean_value_weight = float(data_frame['weight'].mean())
    mean_value_hindfoot = float(data_frame['hindfoot_length'].mean())

    print('Aquí tiene el promedio del peso', mean_value_weight)
    print('Aquí tiene el promedio del talón', mean_value_hindfoot)

    # Return the numbers themselves so callers can reuse them
    # (returning the result of print() would only yield None values).
    return mean_value_weight, mean_value_hindfoot
# Mean weight per survey year, then the average of those yearly means.
#
# groupby('year') puts every record in its own year's group, so the last
# record of each year and the whole final year are included, and
# mean() skips records whose weight is NaN. A year in which no animal
# was weighed yields NaN for that year (and therefore for the final
# average) rather than a made-up number.
yearly_mean_weight = surveys_data.groupby('year')['weight'].mean()

mean_value_list = []  # one mean weight per year, in ascending year order
for year, year_mean_value in yearly_mean_weight.items():
    print('Promedio del año ', year, year_mean_value)
    mean_value_list.append(year_mean_value)

# Number of years covered, followed by the average of the yearly means
# (left as the cell's last expression so the notebook displays it).
print(len(mean_value_list))
sum(mean_value_list) / len(mean_value_list)
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}