Resumir datos es útil
para encontrar valores perdidos, valores fuera del rango esperado, valores con
diferentes unidades y variables mal identificadas o clasificadas. Los siguientes códigos para resumir datos los
copié del MOOC de Coursera: Data Analysis
(2003)—Summarizing Data.
# 4-7 Summarizing Data
# downloading data directly from URL: Earthquake data
fileUrl <- "http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt"
# saving data as .csv
download.file(fileUrl,destfile="earthquakeData.csv")
# date downloaded
dateDownloaded <- date()
dateDownloaded
->->
[1] "Mon Jun 10 09:45:08 2013"
eData <- read.csv("earthquakeData.csv")
head(eData)
->
Src Eqid Version Datetime Lat Lon
1 ak 10733493 1 Monday, June 10, 2013 01:28:45 UTC 64.40 -147.6
2 nc 72005295 0 Monday, June 10, 2013 01:14:03 UTC 38.83 -122.8
3 nc 72005290 0 Monday, June 10, 2013 01:01:02 UTC 38.57 -122.6
4 ak 10733481 1 Monday, June 10, 2013 00:37:07 UTC 63.32 -151.2
5 nc 72005280 0 Monday, June 10, 2013 00:29:48 UTC 36.56 -121.1
6 ci 15357673 2 Monday, June 10, 2013 00:29:31 UTC 32.97 -115.6
Magnitude Depth NST Region
1 1.1 2.1 8 Central Alaska
2 1.0 3.8 22 Northern California
3 1.7 7.1 13 Northern California
4 2.3 0.0 22 Central Alaska
5 1.4 7.9 9 Central California
6 1.6 4.4 18 Southern California
# looking at data
# names of the data frame
names(eData)
[1] "Src" "Eqid" "Version" "Datetime" "Lat"
[6] "Lon" "Magnitude" "Depth" "NST" "Region"
# dimension of rows and columns
# rows
nrow(eData)
[1] 1115
# columns
ncol(eData)
[1] 10
# rows and columns
dim(eData)
[1] 1115 10
# quantiles
quantile(eData$Lat)
0% 25% 50% 75% 100%
-61.16 33.24 38.81 58.26 74.78
# descriptive statistics
summary(eData)
Src Eqid Version
ak :363 00414081: 1 2 :396
ci :266 00414206: 1 1 :205
nc :185 00414228: 1 0 :143
us :109 00414229: 1 4 :143
pr : 50 00414235: 1 3 : 92
hv : 42 00414239: 1 5 : 42
(Other):100 (Other) :1109 (Other): 94
Datetime Lat Lon
Friday, June 7, 2013 14:20:00 UTC : 2 Min. :-61.2 Min. :-180
Tuesday, June 4, 2013 09:55:35 UTC: 2 1st Qu.: 33.2 1st Qu.:-150
Friday, June 7, 2013 00:03:53 UTC : 1 Median : 38.8 Median :-121
Friday, June 7, 2013 00:15:24 UTC : 1 Mean : 40.4 Mean :-111
Friday, June 7, 2013 00:16:51 UTC : 1 3rd Qu.: 58.3 3rd Qu.:-116
Friday, June 7, 2013 00:37:08 UTC : 1 Max. : 74.8 Max. : 180
(Other) :1107
Magnitude Depth NST
Min. :1.0 Min. : 0.0 Min. : 0.0
1st Qu.:1.3 1st Qu.: 3.0 1st Qu.: 12.0
Median :1.6 Median : 9.7 Median : 19.0
Mean :2.0 Mean : 27.8 Mean : 25.6
3rd Qu.:2.3 3rd Qu.: 33.1 3rd Qu.: 30.0
Max. :6.1 Max. :621.4 Max. :481.0
Region
Southern California :177
Central Alaska :138
Northern California :132
Southern Alaska : 98
Central California : 82
Andreanof Islands, Aleutian Islands, Alaska: 38
(Other) :450
# checking type of data
class(eData)
[1] "data.frame"
# looking at the class of each individual column
sapply(eData[1,], class)
Src Eqid Version Datetime Lat Lon Magnitude
"factor" "factor" "factor" "factor" "numeric" "numeric" "numeric"
Depth NST Region
"numeric" "integer" "factor"
# looking at values of each variable
unique(eData$Src)
[1] ak nc ci uw pr us nn hv mb se uu nm
Levels: ak ci hv mb nc nm nn pr se us uu uw
# length of unique values
length(unique(eData$Src))
[1] 12
# table of the qualitative variable
table(eData$Src)
ak ci hv mb nc nm nn pr se us uu uw
363 266 42 9 185 1 39 50 3 109 19 29
# looking at relationships
table(eData$Src,eData$Version)
0 1 2 3 4 5 6 7 8 9 A B C D
ak 0 84 251 28 0 0 0 0 0 0 0 0 0 0
ci 0 1 83 39 107 14 16 3 3 0 0 0 0 0
hv 0 14 18 0 8 1 0 0 0 0 1 0 0 0
mb 0 2 7 0 0 0 0 0 0 0 0 0 0 0
nc 93 52 17 14 4 3 0 1 0 0 1 0 0 0
nm 0 0 0 0 0 0 0 0 0 0 1 0 0 0
nn 0 39 0 0 0 0 0 0 0 0 0 0 0 0
pr 50 0 0 0 0 0 0 0 0 0 0 0 0 0
se 0 0 0 0 0 0 0 0 0 0 3 0 0 0
us 0 0 0 4 16 24 24 13 11 7 3 2 2 3
uu 0 0 10 1 8 0 0 0 0 0 0 0 0 0
uw 0 13 10 6 0 0 0 0 0 0 0 0 0 0
# looking at the first 10 values of "Lat"
eData$Lat[1:10]
[1] 64.40 38.83 38.57 63.32 36.56 32.97 33.05 33.48 57.97 36.87
# which values are greater than 40
eData$Lat[1:10] > 40
[1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
# checking if any of the values are greater than 40
any(eData$Lat[1:10] > 40)
[1] TRUE
# Are all the values greater than 40?
eData$Lat[1:10] > 40
[1] TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE
all(eData$Lat[1:10] > 40)
[1] FALSE
# looking at subsets: rows where both latitude and longitude are greater than zero (showing only the "Lat" and "Lon" columns)
head(eData[eData$Lat > 0 & eData$Lon > 0,c("Lat", "Lon")])
Lat Lon
42 54.362 161.41
75 36.237 69.51
122 36.649 77.05
145 10.723 126.93
156 5.898 126.84
168 10.616 126.81
# either the latitude or the longitude is greater than zero
head(eData[eData$Lat > 0 | eData$Lon > 0,c("Lat", "Lon")])
Lat Lon
1 64.40 -147.6
2 38.83 -122.8
3 38.57 -122.6
4 63.32 -151.2
5 36.56 -121.1
6 32.97 -115.6
No hay comentarios.:
Publicar un comentario