Este notebook es para limpiar la base de datos que estoy usando para que ROOT pueda leer las columnas de strings.
import pandas as pd
# -- Load the raw meteorite-landings table and print a per-column summary
#    (dtype and non-null count) to see what needs cleaning.
datao = pd.read_csv(filepath_or_buffer='data-used/meteorite-landings.csv')
datao.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45716 entries, 0 to 45715 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 45716 non-null object 1 id 45716 non-null int64 2 nametype 45716 non-null object 3 recclass 45716 non-null object 4 mass 45585 non-null float64 5 fall 45716 non-null object 6 year 45428 non-null float64 7 reclat 38401 non-null float64 8 reclong 38401 non-null float64 9 GeoLocation 38401 non-null object dtypes: float64(4), int64(1), object(5) memory usage: 3.5+ MB
# -- Keep every column except 'GeoLocation'.
datao = datao[['name', 'id', 'nametype', 'recclass', 'mass', 'fall',
               'year', 'reclat', 'reclong']]

# -- Drop records flagged as incorrect in the dataset documentation:
#    years outside [860, 2016] and coordinates recorded as exactly 0.
#    Rows with a missing year fail the range test and are dropped here too.
datao = datao[datao['year'].between(860, 2016)]
datao = datao[(datao['reclong'] != 0) & (datao['reclat'] != 0)]

# NOTE: missing coordinates are stored as NaN, and an element-wise
# `!= None` comparison is True for every row, so it cannot filter them.
# Those rows are removed by dropna() below instead.
datao = datao.reset_index(drop=True)

# -- Drop every row that still has a NaN in any column (mass, reclat,
#    reclong). The index is reset *before* this step, so the surviving
#    index labels keep gaps where rows were dropped.
datao = datao.dropna()

# -- Replace commas and spaces with underscores in all string columns so
#    the values do not clash with the ',' delimiter when the CSV is parsed.
#    A ", " pair collapses to a single "_"; any other comma or space becomes
#    its own "_" (same result as replacing ', ', then ',', then ' ').
datao = datao.replace(r', |[, ]', '_', regex=True)

# -- Encode the two-valued string columns as integers.
#    The mapping is applied only to the columns it belongs to: a frame-wide
#    substring replace would also rewrite any `name` or `recclass` value
#    that happens to contain 'Found', 'Fell', 'Relict' or 'Valid'.
#    An unexpected label maps to NaN and makes astype(int) raise instead of
#    being silently kept.
datao['nametype'] = datao['nametype'].map({'Valid': 0, 'Relict': 1}).astype(int)
datao['fall'] = datao['fall'].map({'Fell': 0, 'Found': 1}).astype(int)

# -- Display the cleaned table.
datao
name | id | nametype | recclass | mass | fall | year | reclat | reclong | |
---|---|---|---|---|---|---|---|---|---|
0 | Aachen | 1 | 0 | L5 | 21.0 | 0 | 1880.0 | 50.77500 | 6.08333 |
1 | Aarhus | 2 | 0 | H6 | 720.0 | 0 | 1951.0 | 56.18333 | 10.23333 |
2 | Abee | 6 | 0 | EH4 | 107000.0 | 0 | 1952.0 | 54.21667 | -113.00000 |
3 | Acapulco | 10 | 0 | Acapulcoite | 1914.0 | 0 | 1976.0 | 16.88333 | -99.90000 |
4 | Achiras | 370 | 0 | L6 | 780.0 | 0 | 1902.0 | -33.16667 | -64.95000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
39010 | Zillah_002 | 31356 | 0 | Eucrite | 172.0 | 1 | 1990.0 | 29.03700 | 17.01850 |
39011 | Zinder | 30409 | 0 | Pallasite_ungrouped | 46.0 | 1 | 1999.0 | 13.78333 | 8.96667 |
39012 | Zlin | 30410 | 0 | H4 | 3.3 | 1 | 1939.0 | 49.25000 | 17.66667 |
39013 | Zubkovsky | 31357 | 0 | L6 | 2167.0 | 1 | 2003.0 | 49.78917 | 41.50460 |
39014 | Zulu_Queen | 30414 | 0 | L3.7 | 200.0 | 1 | 1976.0 | 33.98333 | -115.68333 |
31705 rows × 9 columns
# -- Check the dtype and non-null count of each column after cleaning
#    ('nametype' and 'fall' were cast to int above).
datao.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 31705 entries, 0 to 39014 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 31705 non-null object 1 id 31705 non-null int64 2 nametype 31705 non-null int64 3 recclass 31705 non-null object 4 mass 31705 non-null float64 5 fall 31705 non-null int64 6 year 31705 non-null float64 7 reclat 31705 non-null float64 8 reclong 31705 non-null float64 dtypes: float64(4), int64(3), object(2) memory usage: 2.4+ MB
# -- Save the cleaned table as a CSV file.
#    The DataFrame index is written as the first (unnamed) column; the
#    commented-out ROOT macro below reads that column as `entry`.
datao.to_csv(path_or_buf='data-used/meteorite-landings_clean.csv', index=True)
# https://github.com/artfisica/hackathon2017bigdata/blob/master/analysis_notebooks/notebook_analysis_agro_equipment_CVS_to_ROOT_cpp_hackathon_01.ipynb
# http://opendata.atlas.cern/release/2020/documentation/datasets/intro.html
# https://cds.cern.ch/record/2707171/files/ANA-OTRC-2019-01-PUB-updated.pdf
# TString dir = gSystem->UnixPathName(__FILE__);
# dir.ReplaceAll("meteorite.C","");
# dir.ReplaceAll("/./","/");
# TFile *f = new TFile("meteorite.root","RECREATE");
# TTree *tree = new TTree("data","data from csv file");
# tree->ReadFile("meteorite-landings_clean.csv","entry/I:name/C:id/I:nametype/I:recclass/C:mass/F:fall/I:year/F:reclat/F:reclong/F",',');
# #tree->ReadFile("meteorite-landings_clean.csv","entry/I:id/I:nametype/I:mass/F:fall/I:year/F:reclat/F:reclong/F",',');
# f->Write();
# TFile *_file0 = TFile::Open("meteorite.root");