Análisis: febrero 2013

# Scientific and technical journal publications, 1981 to 2009, for countries
# of the Americas.
# Source: World Bank data http://data.worldbank.org/indicator/IP.JRN.ARTC.SC
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# the script only runs as-is where that file exists.
ePubli <- read.csv("C:/Users/Administrator/Desktop/publicaciones.csv")

# Print the data frame
ePubli

                 pais   total
1           Argentina   59598
2              Belize      37
3             Bolivia     666
4              Brazil  144295
5              Canada  602876
6               Chile   27867
7            Colombia    6305
8          Costa Rica    1783
9                Cuba    4470
10 Dominican Republic     169
11            Ecuador     839
12        El Salvador      78
13          Guatemala     469
14              Haiti      91
15           Honduras     158
16             Mexico   61203
17          Nicaragua     214
18             Panama    1085
19           Paraguay     175
20               Peru    2163
21      United States 4940670
22            Uruguay    3134
23      Venezuela, RB   10460


# Transpose the data: drop the first column (country names), turn the
# remaining values into a single row, and use the country names as the
# column names of the result.
# as.character() makes the names explicit whether the first column was read
# as a factor or as character.
ePubliT <- setNames(data.frame(t(ePubli[, -1])), as.character(ePubli[, 1]))

# Print the transposed data
ePubliT

  Argentina Belize Bolivia Brazil Canada Chile Colombia Costa Rica Cuba
1     59598     37     666 144295 602876 27867     6305       1783 4470
  Dominican Republic Ecuador El Salvador Guatemala Haiti Honduras Mexico
1                169     839          78       469    91      158  61203
  Nicaragua Panama Paraguay Peru United States Uruguay Venezuela, RB
1       214   1085      175 2163       4940670    3134         10460

Resumir datos es útil para encontrar valores perdidos, valores fuera del rango esperado, valores con diferentes unidades, y variables mal identificadas o clasificadas. Los siguientes códigos para resumir datos los copié del MOOC de Coursera: Data Analysis (2013)—Summarizing Data.

# 4-7 Summarizing Data

# Address of the earthquake catalog used in this section
# (presumably the USGS 7-day feed, judging by the URL -- confirm)
fileUrl <- "http://earthquake.usgs.gov/earthquakes/catalogs/eqs7day-M1.txt"

# Fetch the file into the working directory under a .csv name
download.file(url = fileUrl, destfile = "earthquakeData.csv")

# Record when the snapshot was downloaded and echo it
(dateDownloaded <- date())

[1] "Mon Jun 10 09:45:08 2013"


# Read the downloaded catalog into a data frame.
# stringsAsFactors = TRUE keeps text columns (Src, Eqid, Version, Datetime,
# Region) as factors, which the factor-based outputs of the later steps
# (summary() level counts, unique() levels, sapply(..., class)) rely on;
# since R 4.0.0 read.csv() would otherwise return them as character.
eData <- read.csv("earthquakeData.csv", stringsAsFactors = TRUE)

# Show the first rows
head(eData)

  Src     Eqid Version                           Datetime   Lat    Lon
1  ak 10733493       1 Monday, June 10, 2013 01:28:45 UTC 64.40 -147.6
2  nc 72005295       0 Monday, June 10, 2013 01:14:03 UTC 38.83 -122.8
3  nc 72005290       0 Monday, June 10, 2013 01:01:02 UTC 38.57 -122.6
4  ak 10733481       1 Monday, June 10, 2013 00:37:07 UTC 63.32 -151.2
5  nc 72005280       0 Monday, June 10, 2013 00:29:48 UTC 36.56 -121.1
6  ci 15357673       2 Monday, June 10, 2013 00:29:31 UTC 32.97 -115.6
  Magnitude Depth NST              Region
1       1.1   2.1   8      Central Alaska
2       1.0   3.8  22 Northern California
3       1.7   7.1  13 Northern California
4       2.3   0.0  22      Central Alaska
5       1.4   7.9   9  Central California
6       1.6   4.4  18 Southern California


# Inspect the data
# Column names of the data frame
names(eData)

 [1] "Src"       "Eqid"      "Version"   "Datetime"  "Lat"      
 [6] "Lon"       "Magnitude" "Depth"     "NST"       "Region"


# Dimensions of the data frame
# Number of rows
nrow(eData)

[1] 1115


# Number of columns
ncol(eData)

[1] 10


# Number of rows and number of columns, in that order
dim(eData)

[1] 1115   10


# Quantiles of the latitude column; with no extra arguments quantile()
# reports the 0%, 25%, 50%, 75% and 100% points
quantile(eData$Lat)

    0%    25%    50%    75%   100% 
-61.16  33.24  38.81  58.26  74.78


# Descriptive statistics per column: level counts for factor columns, and
# min / quartiles / mean / max for numeric columns
summary(eData)

      Src            Eqid         Version   
 ak     :363   00414081:   1   2      :396  
 ci     :266   00414206:   1   1      :205  
 nc     :185   00414228:   1   0      :143  
 us     :109   00414229:   1   4      :143  
 pr     : 50   00414235:   1   3      : 92  
 hv     : 42   00414239:   1   5      : 42  
 (Other):100   (Other) :1109   (Other): 94  
                                Datetime         Lat             Lon      
 Friday, June  7, 2013 14:20:00 UTC :   2   Min.   :-61.2   Min.   :-180  
 Tuesday, June  4, 2013 09:55:35 UTC:   2   1st Qu.: 33.2   1st Qu.:-150  
 Friday, June  7, 2013 00:03:53 UTC :   1   Median : 38.8   Median :-121  
 Friday, June  7, 2013 00:15:24 UTC :   1   Mean   : 40.4   Mean   :-111  
 Friday, June  7, 2013 00:16:51 UTC :   1   3rd Qu.: 58.3   3rd Qu.:-116  
 Friday, June  7, 2013 00:37:08 UTC :   1   Max.   : 74.8   Max.   : 180  
 (Other)                            :1107                                 
   Magnitude       Depth            NST       
 Min.   :1.0   Min.   :  0.0   Min.   :  0.0  
 1st Qu.:1.3   1st Qu.:  3.0   1st Qu.: 12.0  
 Median :1.6   Median :  9.7   Median : 19.0  
 Mean   :2.0   Mean   : 27.8   Mean   : 25.6  
 3rd Qu.:2.3   3rd Qu.: 33.1   3rd Qu.: 30.0  
 Max.   :6.1   Max.   :621.4   Max.   :481.0  

                                         Region   
 Southern California                        :177  
 Central Alaska                             :138  
 Northern California                        :132  
 Southern Alaska                            : 98  
 Central California                         : 82  
 Andreanof Islands, Aleutian Islands, Alaska: 38  
 (Other)                                    :450


# Class of the object as a whole
class(eData)

[1] "data.frame"


# Class of every individual column: class() is applied to each column of
# the data frame, giving one class name per column
sapply(eData, class)

      Src      Eqid   Version  Datetime       Lat       Lon Magnitude 
 "factor"  "factor"  "factor"  "factor" "numeric" "numeric" "numeric" 
    Depth       NST    Region 
"numeric" "integer"  "factor"


# Distinct values that appear in the Src column
unique(eData$Src)

 [1] ak nc ci uw pr us nn hv mb se uu nm
Levels: ak ci hv mb nc nm nn pr se us uu uw


# Number of distinct values in Src
length(unique(eData$Src))

[1] 12


# Frequency table of the qualitative variable Src: how many rows carry
# each of its values
table(eData$Src)


 ak  ci  hv  mb  nc  nm  nn  pr  se  us  uu  uw 
363 266  42   9 185   1  39  50   3 109  19  29


# Relationship between two variables: cross-tabulation with Src values as
# rows and Version values as columns
table(eData$Src, eData$Version)


       0   1   2   3   4   5   6   7   8   9   A   B   C   D
  ak   0  84 251  28   0   0   0   0   0   0   0   0   0   0
  ci   0   1  83  39 107  14  16   3   3   0   0   0   0   0
  hv   0  14  18   0   8   1   0   0   0   0   1   0   0   0
  mb   0   2   7   0   0   0   0   0   0   0   0   0   0   0
  nc  93  52  17  14   4   3   0   1   0   0   1   0   0   0
  nm   0   0   0   0   0   0   0   0   0   0   1   0   0   0
  nn   0  39   0   0   0   0   0   0   0   0   0   0   0   0
  pr  50   0   0   0   0   0   0   0   0   0   0   0   0   0
  se   0   0   0   0   0   0   0   0   0   0   3   0   0   0
  us   0   0   0   4  16  24  24  13  11   7   3   2   2   3
  uu   0   0  10   1   8   0   0   0   0   0   0   0   0   0
  uw   0  13  10   6   0   0   0   0   0   0   0   0   0   0


# First 10 values of "Lat"
eData$Lat[1:10]

 [1] 64.40 38.83 38.57 63.32 36.56 32.97 33.05 33.48 57.97 36.87


# Element-wise test: which of the first 10 latitudes are greater than 40
eData$Lat[1:10] > 40

 [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE


# Is at least one of the first 10 latitudes greater than 40?
any(eData$Lat[1:10] > 40)

[1] TRUE


# Are all of the first 10 latitudes greater than 40?
# Start from the element-wise comparison
eData$Lat[1:10] > 40

 [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE

# all() is TRUE only when every element of the comparison is TRUE
all(eData$Lat[1:10] > 40)

[1] FALSE


# Subsetting rows and columns at once: keep only the Lat and Lon columns of
# the rows where both latitude and longitude are greater than zero
head(with(eData, eData[Lat > 0 & Lon > 0, c("Lat", "Lon")]))

       Lat    Lon
42  54.362 161.41
75  36.237  69.51
122 36.649  77.05
145 10.723 126.93
156  5.898 126.84
168 10.616 126.81


# Same two columns, but for rows where at least one of latitude or
# longitude is greater than zero
head(with(eData, eData[Lat > 0 | Lon > 0, c("Lat", "Lon")]))

    Lat    Lon
1 64.40 -147.6
2 38.83 -122.8
3 38.57 -122.6
4 63.32 -151.2
5 36.56 -121.1
6 32.97 -115.6

Análisis

Páginas

febrero 28, 2013

Muestreo en R

febrero 27, 2013

Transponiendo datos en R

febrero 26, 2013

Resumiendo datos

febrero 25, 2013

Gestión de archivos

febrero 24, 2013

Peldaños en análisis de datos

Diagrama de dispersión y series temporales