February 26, 2013

Summarizing Data


Summarizing data is useful for finding missing values, values outside the expected range, values recorded in different units, and misidentified or misclassified variables. The following code for summarizing data was copied from the Coursera MOOC Data Analysis (2013), lecture "Summarizing Data".


# 4-7 Summarizing Data

# downloading data directly from URL: Earthquake data
fileUrl <- "http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt"

# saving data as .csv
download.file(fileUrl,destfile="earthquakeData.csv")

# date downloaded
dateDownloaded <- date()
dateDownloaded
[1] "Mon Jun 10 09:45:08 2013"

eData <- read.csv("earthquakeData.csv")
head(eData)
  Src     Eqid Version                           Datetime   Lat    Lon
1  ak 10733493       1 Monday, June 10, 2013 01:28:45 UTC 64.40 -147.6
2  nc 72005295       0 Monday, June 10, 2013 01:14:03 UTC 38.83 -122.8
3  nc 72005290       0 Monday, June 10, 2013 01:01:02 UTC 38.57 -122.6
4  ak 10733481       1 Monday, June 10, 2013 00:37:07 UTC 63.32 -151.2
5  nc 72005280       0 Monday, June 10, 2013 00:29:48 UTC 36.56 -121.1
6  ci 15357673       2 Monday, June 10, 2013 00:29:31 UTC 32.97 -115.6
  Magnitude Depth NST              Region
1       1.1   2.1   8      Central Alaska
2       1.0   3.8  22 Northern California
3       1.7   7.1  13 Northern California
4       2.3   0.0  22      Central Alaska
5       1.4   7.9   9  Central California
6       1.6   4.4  18 Southern California

# looking at data 
# names of the data frame
names(eData)
 [1] "Src"       "Eqid"      "Version"   "Datetime"  "Lat"      
 [6] "Lon"       "Magnitude" "Depth"     "NST"       "Region"   

# dimensions: number of rows and columns
# rows
nrow(eData)
[1] 1115

# columns
ncol(eData)
[1] 10

# rows and columns
dim(eData)
[1] 1115   10

# quantiles
quantile(eData$Lat)
    0%    25%    50%    75%   100% 
-61.16  33.24  38.81  58.26  74.78 
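
quantile() also takes a probs argument for specific percentiles; a quick sketch (not part of the course code):

# custom percentiles, e.g. the 5th and the 95th (sketch)
quantile(eData$Lat, probs = c(0.05, 0.95))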

# descriptive statistics
summary(eData)
      Src            Eqid         Version   
 ak     :363   00414081:   1   2      :396  
 ci     :266   00414206:   1   1      :205  
 nc     :185   00414228:   1   0      :143  
 us     :109   00414229:   1   4      :143  
 pr     : 50   00414235:   1   3      : 92  
 hv     : 42   00414239:   1   5      : 42  
 (Other):100   (Other) :1109   (Other): 94  
                                Datetime         Lat             Lon      
 Friday, June  7, 2013 14:20:00 UTC :   2   Min.   :-61.2   Min.   :-180  
 Tuesday, June  4, 2013 09:55:35 UTC:   2   1st Qu.: 33.2   1st Qu.:-150  
 Friday, June  7, 2013 00:03:53 UTC :   1   Median : 38.8   Median :-121  
 Friday, June  7, 2013 00:15:24 UTC :   1   Mean   : 40.4   Mean   :-111  
 Friday, June  7, 2013 00:16:51 UTC :   1   3rd Qu.: 58.3   3rd Qu.:-116  
 Friday, June  7, 2013 00:37:08 UTC :   1   Max.   : 74.8   Max.   : 180  
 (Other)                            :1107                                 
   Magnitude       Depth            NST       
 Min.   :1.0   Min.   :  0.0   Min.   :  0.0  
 1st Qu.:1.3   1st Qu.:  3.0   1st Qu.: 12.0  
 Median :1.6   Median :  9.7   Median : 19.0  
 Mean   :2.0   Mean   : 27.8   Mean   : 25.6  
 3rd Qu.:2.3   3rd Qu.: 33.1   3rd Qu.: 30.0  
 Max.   :6.1   Max.   :621.4   Max.   :481.0  

                                         Region   
 Southern California                        :177  
 Central Alaska                             :138  
 Northern California                        :132  
 Southern Alaska                            : 98  
 Central California                         : 82  
 Andreanof Islands, Aleutian Islands, Alaska: 38  
 (Other)                                    :450  
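
summary() would also report the number of NA values per column if there were any; a more direct missing-value check (a sketch using base R, not part of the course code):

# counting missing values in each column (sketch)
colSums(is.na(eData))

# TRUE if the data frame contains any missing value at all
any(is.na(eData))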

# checking type of data
class(eData)
[1] "data.frame"

# looking at the class of each individual column
sapply(eData[1,], class)
      Src      Eqid   Version  Datetime       Lat       Lon Magnitude 
 "factor"  "factor"  "factor"  "factor" "numeric" "numeric" "numeric" 
    Depth       NST    Region 
"numeric" "integer"  "factor" 

# looking at values of each variable
unique(eData$Src)
 [1] ak nc ci uw pr us nn hv mb se uu nm
Levels: ak ci hv mb nc nm nn pr se us uu uw

# length of unique values
length(unique(eData$Src))
[1] 12

# table of the qualitative variable
table(eData$Src)

 ak  ci  hv  mb  nc  nm  nn  pr  se  us  uu  uw 
363 266  42   9 185   1  39  50   3 109  19  29 
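
Sorting the table makes it easier to see which sources dominate (a small sketch):

# frequency table in decreasing order (sketch)
sort(table(eData$Src), decreasing = TRUE)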

# looking at relationships
table(eData$Src,eData$Version)

       0   1   2   3   4   5   6   7   8   9   A   B   C   D
  ak   0  84 251  28   0   0   0   0   0   0   0   0   0   0
  ci   0   1  83  39 107  14  16   3   3   0   0   0   0   0
  hv   0  14  18   0   8   1   0   0   0   0   1   0   0   0
  mb   0   2   7   0   0   0   0   0   0   0   0   0   0   0
  nc  93  52  17  14   4   3   0   1   0   0   1   0   0   0
  nm   0   0   0   0   0   0   0   0   0   0   1   0   0   0
  nn   0  39   0   0   0   0   0   0   0   0   0   0   0   0
  pr  50   0   0   0   0   0   0   0   0   0   0   0   0   0
  se   0   0   0   0   0   0   0   0   0   0   3   0   0   0
  us   0   0   0   4  16  24  24  13  11   7   3   2   2   3
  uu   0   0  10   1   8   0   0   0   0   0   0   0   0   0
  uw   0  13  10   6   0   0   0   0   0   0   0   0   0   0
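
prop.table() turns these counts into proportions, which can make the relationship easier to read (a sketch, not in the original code):

# row proportions: share of each Version within each Src (sketch)
prop.table(table(eData$Src, eData$Version), margin = 1)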

# looking at the first 10 values of "Lat"
eData$Lat[1:10]
 [1] 64.40 38.83 38.57 63.32 36.56 32.97 33.05 33.48 57.97 36.87

# which values are greater than 40
eData$Lat[1:10] > 40
 [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE

# checking if any of the values are greater than 40
any(eData$Lat[1:10] > 40)
[1] TRUE

# Are all the values greater than 40?
all(eData$Lat[1:10] > 40)
[1] FALSE
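
The same idea catches values outside the expected range mentioned at the top; for latitude, anything outside [-90, 90] would be suspect (a sketch):

# sanity check: all latitudes should fall within the valid range (sketch)
all(eData$Lat >= -90 & eData$Lat <= 90)

# which() returns the indices of the TRUE values
which(eData$Lat[1:10] > 40)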

# looking at subsets: rows where both Lat and Lon are greater than zero (showing only those two columns)
head(eData[eData$Lat > 0 & eData$Lon > 0,c("Lat", "Lon")])
       Lat    Lon
42  54.362 161.41
75  36.237  69.51
122 36.649  77.05
145 10.723 126.93
156  5.898 126.84
168 10.616 126.81

# either the latitude or the longitude is greater than zero
head(eData[eData$Lat > 0 | eData$Lon > 0,c("Lat", "Lon")])
    Lat    Lon
1 64.40 -147.6
2 38.83 -122.8
3 38.57 -122.6
4 63.32 -151.2
5 36.56 -121.1
6 32.97 -115.6
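
subset() is an equivalent, arguably more readable way to write the same filter (an alternative sketch, not from the course):

# same AND filter written with subset() (sketch)
head(subset(eData, Lat > 0 & Lon > 0, select = c(Lat, Lon)))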







