====== SAS / R / Gretl ======
===== Zdroje dat =====
* https://www.quandl.com
===== Import dat =====
DATA fitness
INPUT VEK VAHA DOBA_CV POC_PULZ CV_PULZ MAX_PULZ SPOTR_KYSL SKUPINA POHLAVI $ ID;
DATALINES;
57 73.37 12.63 58 174 176 39.407 2 M 1
54 79.38 11.17 62 156 165 46.08 2 M 2
52 76.32 9.63 48 164 166 45.441 2 M 3
50 70.87 8.92 48 146 155 54.625 2 M 4
51 67.25 11.08 48 172 172 45.118 2 M 5
54 91.63 12.88 44 168 172 39.203 2 M 6
51 73.71 10.47 59 186 188 45.79 2 M 7
57 59.08 9.93 49 148 155 50.545 2 M 8
49 76.32 9.4 56 186 188 48.673 2 M 9
48 61.24 11.5 52 170 176 47.92 2 M 10
52 82.78 10.5 53 170 172 47.467 2 M 11
44 73.03 10.13 45 168 168 50.541 1 M 12
45 87.66 14.03 56 186 192 37.388 1 M 13
45 66.45 11.12 51 176 176 44.754 1 M 14
47 79.15 10.6 47 162 164 47.273 1 M 15
54 83.12 10.33 50 166 170 51.855 1 M 16
49 81.42 8.95 44 180 185 49.156 1 M 17
51 69.63 10.95 57 168 172 40.836 1 M 18
51 77.91 10 48 162 168 46.672 1 Z 19
48 91.63 10.25 48 162 164 46.774 1 Z 20
49 73.37 10.08 76 168 168 50.388 1 Z 21
44 89.47 11.37 62 178 182 44.609 0 Z 22
40 75.07 10.07 62 185 185 45.313 0 Z 23
44 85.84 8.65 45 156 168 54.297 0 Z 24
42 68.15 8.17 40 166 172 59.571 0 Z 25
38 89.02 9.22 55 178 180 49.874 0 Z 26
47 77.45 11.63 58 176 176 44.811 0 Z 27
40 75.98 11.95 70 176 180 45.681 0 Z 28
43 81.19 10.85 64 162 170 49.091 0 Z 29
44 81.42 13.08 63 174 176 39.442 0 Z 30
38 81.87 8.63 48 170 186 60.055 0 Z 31
;
PROC PRINT; RUN;
p<0,05 - zamitame H0\\
H0: ma normal
===== Data mining =====
velke soubory
data work.prestupky2;
proc univariate data=work.prestupky2 normal plot;
histogram pokuta/kernel normal;
qqplot pokuta/normal (mu=est sigma=est);
var pokuta;
run;
proc boxplot data=work.prestupky2;
plot pokuta*pohlavi;
plot pokuta*pohlavi / boxstyle=schematic;
plot pokuta*pohlavi / notches;
run;
proc univariate data=work.prestupky2 winsor=2;
var pokuta;
run;
proc means data=work.prestupky2 mean clm maxdec=2;
var pokuta;
run;
proc univariate data=work.prestupky2 cibasic;
var pokuta;
run;
proc univariate data=work.prestupky2 cibasic mu0=1335;
var pokuta;
run;
male soubory
data stat;
input vyuka @@;
datalines
;
90 85 98 87 65 88 93 85 97 103
;
proc univariate data=stat normal plot;
histogram vyuka/kernel normal;
qqplot vyuka/normal (mu=est sigma=est);
var vyuka;
run;
3. cv
proc reg data=fitness
model spotr_kysl = doba_cv;
plot spotr_kysl*doba_cv;
plot r.*p.;
symbol v=star;
run;
proc reg data=work.fitness;
model spotr_kysl = doba_cv/r influence spec;
plot spotr_kysl*doba_cv;
plot r.*p.;
symbol v=star;
run;
proc reg data=work.fitness;
model spotr_kysl = doba_cv vek/r vif influence spec;
plot r.*p.;
symbol v=star;
run;
4. cv
proc boxplot data=work.teploty;
plot vysledky*mesic;
run;
proc glm data=work.teploty;
class mesic;
model vysledky=mesic;
means mesic/hovtest tukey scheffe lsd sidak;
run;
proc npar1way data=work.teploty wilcoxon;
class mesic;
var vysledky;
run;
5. cv
## pearson chi squared
data souhlas;
input vzdelani $ prirazka $ pocet @@;
datalines;
ano ano 50 ano ne 7 ano nevim 11
ne ano 14 ne ne 23 ne nevim 20
;
proc freq data=souhlas;
tables vzdelani*prirazka /expected chisq measures norow nocol nopercent;
weight pocet;
run;
## Fischer (pokud pearson varuje, ze 33% cetnosti je < 5)
data zakon;
input zmena $ nakup $ pocet @@;
datalines;
ano denne 27 ano nekolikrat_t 79 ano jednou_t 13 ano jednou_14 2
ne denne 38 ne nekolikrat_t 79 ne jednou_t 24 ne jednou_14 3
;
proc freq data=zakon;
tables zmena*nakup/expected chisq measures norow nocol nopercent exact;
weight pocet;
run;
chisq p-value H0: neexistuje zavislost
lambda asymetric C|R = %
data zkouska;
input skola $ splneno $ pocet @@;
datalines;
gympl ano 45 ss ano 22 uc ano 7
gympl ne 7 ss ne 10 uc ne 9
;
proc freq data=zkouska;
tables skola*splneno/expected chisq measures norow nocol nopercent;
weight pocet;
run;
data semena;
input osetreno $ vyklicilo $ pocet @@;
datalines;
ne ano 70 ne ne 30
ano ano 130 ano ne 130
;
proc freq data=semena;
tables osetreno*vyklicilo /expected chisq measures norow nocol nopercent;
weight pocet;
run;
t-test
proc ttest data=mesta h0=1335;
run;
==== Procedury ke zkousce ====
=== 1. cv ===
data work.prestupky2;
proc univariate data= work.prestupky2 normal plot;
histogram body/kernel normal;
qqplot body/normal (mu= est sigma= est);
var body;
run;
data work.prestupky2;
proc boxplot data= work.prestupky2;
plot body*pohlavi/boxstyle=schematic;
plot body*pohlavi/notches;
run;
data work prestupky2;
proc univariate data= work.prestupky2 trimmed=2;
var body;
run;
data work prestupky2;
proc univariate data= work.prestupky2 winsorized=2;
var body;
run;
data work prestupky2;
proc means data= work.prestupky2 mean cv clm maxdec=2;
var body;
title "Interval spolehlivosti pro průměr";
run;
data work prestupky2;
proc univariate data= work.prestupky2 cibasic;
var body;
title "Basic Confidence Limits Assuming Normality - IS pro základní popisné statistiky";
run;
data stat;
input vyuka @@;
datalines;
98 79 88 64 80 92 67 88 90 60 63 67
;
proc univariate data= stat normal plot;
histogram vyuka/kernel normal;
qqplot vyuka/normal (mu= est sigma= est);
var vyuka;
run;
=== 2. cv ===
data fitness;
input vek doba_cv spotr_kysl;
datalines;
57 12.63 39.407
54 11.17 46.08
52 9.63 45.441
50 8.92 54.625
51 11.08 45.118
54 12.88 39.203
51 10.47 45.79
57 9.93 50.545
49 9.4 48.673
48 11.5 47.92
52 10.5 47.467
44 10.13 50.541
45 14.03 37.388
45 11.12 44.754
47 10.6 47.273
54 10.33 51.855
49 8.95 49.156
51 10.95 40.836
51 10 46.672
48 10.25 46.774
49 10.08 50.388
44 11.37 44.609
40 10.07 45.313
44 8.65 54.297
42 8.17 59.571
38 9.22 49.874
47 11.63 44.811
40 11.95 45.681
43 10.85 49.091
44 13.08 39.442
38 8.63 60.055
;
proc means data= fitness n mean cv median min max std skewness kurtosis max maxdec= 2;
var vek doba_cv spotr_kysl;
run;
kurt: 1+ = spicaty (light tail), -1- = placaty (heavy tail); +-3 silne\\
skew: 0.8 +vpravo -vlevo
proc univariate data= fitness normal plot;
var vek doba_cv spotr_kysl;
run;
proc corr data= fitness plots= matrix (histogram);
run;
proc corr data= fitness nosimple fisher;
run;
proc corr data= fitness nosimple;
var doba_cv spotr_kysl;
partial vek;
run;
proc corr data= fitness nosimple;
var vek spotr_kysl;
partial doba_cv;
run;
proc corr data= fitness nosimple pearson spearman;
run;
proc reg data= fitness;
model spotr_kysl = doba_cv;
plot spotr_kysl*doba_cv;
symbol v=star;
run;
proc reg data= fitness;
model spotr_kysl = doba_cv/r influence spec;
plot spotr_kysl*doba_cv;
plot r.*p.;
symbol v= star;
run;
=== 3. cv ===
proc reg data= fitness;
model spotr_kysl = doba_cv/r influence spec;
plot spotr_kysl*doba_cv;
plot r.*p.;
symbol v= star;
run;
=== 3.+4. cv ===
proc reg data= work.fitness;
model spotr_kysl = doba_cv/r influence spec;
plot spotr_kysl*doba_cv;
plot r.*p.;
symbol v= star;
run;
proc reg data= work.fitness;
model spotr_kysl = doba_cv vek/r vif influence spec;
plot r.*p.;
symbol v= star;
run;
quit;
proc reg data= work.fitness;
model spotr_kysl = doba_cv vek vaha/r vif influence spec;
plot r.*p.;
symbol v= star;
run;
quit;
proc reg data= work.fitness;
Forward:model spotr_kysl = vek vaha doba_cv max_pulz/selection=f;
Backward:model spotr_kysl = vek vaha doba_cv max_pulz/selection=b;
Stepwise:model spotr_kysl = vek vaha doba_cv max_pulz/selection=stepwise;
R_square:model spotr_kysl = vek vaha doba_cv max_pulz/selection=rsquare;
run;
quit;
proc reg data= work.fitness;
Forward:model spotr_kysl = vek vaha doba_cv max_pulz/selection=f details= summary;
Backward:model spotr_kysl = vek vaha doba_cv max_pulz/selection=b details= summary;
Stepwise:model spotr_kysl = vek vaha doba_cv max_pulz/selection=stepwise details= summary;
R_square:model spotr_kysl = vek vaha doba_cv max_pulz/selection=rsquare details= summary;
run;
quit;
=== Analyza rozptylu 1 ===
data teplota;
input mesic $ vysledky @@;
datalines;
leden 1.00 leden 0.64 leden 1.22 leden 1.19 leden 0.62 leden 0.87 leden 1.23 leden 0.96 leden 0.92 leden 1.11
unor 1.06 unor 0.88 unor 1.04 unor 1.66 unor 1.06 unor 1.07 unor 0.87 unor 0.97 unor 2.00 unor 1.09
brezen 1.19 brezen 1.77 brezen 1.46 brezen 1.58 brezen 1.55 brezen 1.22 brezen 1.64 brezen 1.35 brezen 1.29 brezen 1.41
;
proc boxplot data= teplota;
plot vysledky*mesic/boxstyle = schematic;
plot vysledky*mesic/notches;
run;
proc means data= teplota n mean median min max std cv skewness kurtosis maxdec=2;
class mesic;
var vysledky;
run;
proc univariate data= teplota noprint;
class mesic;
histogram vysledky/normal;
qqplot vysledky/normal (mu= est sigma= est)nrows =3;
run;
proc glm data= teplota;
class mesic;
model vysledky= mesic;
means mesic/hovtest tukey;
run;
proc npar1way data= teplota wilcoxon;
class mesic;
var vysledky;
run;
=== Analyza rozptylu 2 ===
proc boxplot data= work.trzby;
plot trzby*prodejna/boxstyle = schematic;
plot trzby*prodejna/notches;
run;
proc means data= work.trzby n mean median min max std cv skewness kurtosis maxdec=2;
class prodejna;
var trzby;
run;
proc univariate data= work.trzby noprint;
class prodejna;
histogram trzby/normal;
qqplot trzby/normal (mu= est sigma= est)nrows =3;
run;
proc glm data= work.trzby;
class prodejna;
model trzby= prodejna;
means prodejna/hovtest tukey;
run;
proc npar1way data= work.trzby wilcoxon;
class prodejna;
var trzby;
run;
=== 5. cv ===
1. prikad
data souhlas;
input vzdelani $ prirazka $ pocet @@;
datalines;
ano ano 50 ano ne 7 ano nevim 11
ne ano 14 ne ne 23 ne nevim 20
;
proc freq data = souhlas;
tables vzdelani*prirazka/expected chisq measures norow nocol nopercent;
weight pocet;
run;
2. priklad
data zakon;
input zmena $ nakup $ pocet @@;
datalines;
ano denne 27 ano nekolik_t 79 ano jednou_t 13 ano jednou_14 2
ne denne 38 ne nekolik_t 79 ne jednou_t 24 ne jednou_14 3
;
proc freq data = zakon;
tables zmena*nakup/expected chisq measures norow nocol nopercent exact;
weight pocet;
run;
3. priklad
data zkouska;
input slozeni_zk $ skola $ pocet @@;
datalines;
ano gymnazium 45 ano stredni_od 22 ano uciliste 7
ne gymnazium 7 ne stredni_od 10 ne uciliste 9
;
proc freq data = zkouska;
tables slozeni_zk*skola/expected chisq measures norow nocol nopercent exact;
weight pocet;
run;
=== 6. cv ===
data work.kriminalita;
proc princomp data= work.kriminalita out= components plots=score (ellipse ncomp=2);
var kriminalita_celkem obecna_kriminalita hospodarska_kriminalita loupeze vloupani vrazdy;
id kraj;
run;
proc gplot data=components;
plot prin2*prin1/vref=0 href=0;
symbol v=star pointlabel= (j=r position=middle "#kraj");
run;
proc cluster data= work.kriminalita method=ave std;
id kraj;
run;
===== Ekonometrie =====
> setwd("C:/Users/C24-11/Desktop")
> read.csv("mzdy.csv");
rok.mzda
1 1992;4644
2 1993;5904
3 1994;7004
4 1995;8307
5 1996;9825
6 1997;10802
7 1998;11801
8 1999;12797
9 2000;13219
10 2001;14378
11 2002;15524
12 2003;16430
13 2004;17466
14 2005;18344
15 2006;19546
16 2007;20957
17 2008;22592
18 2009;23344
19 2010;23797
20 2011;24319
> data = read.csv("mzdy.csv");
> View(data)
> View(data)
> data = read.csv("mzdy.csv", header=T, sep=";");
> y = data$mzda;
> x = data$rok;
> View(data)
> View(y);
> boxplot(y);
> plot(y~x)
> summary(data)
rok mzda
Min. :1992 Min. : 4644
1st Qu.:1997 1st Qu.:10558
Median :2002 Median :14951
Mean :2002 Mean :15050
3rd Qu.:2006 3rd Qu.:19899
Max. :2011 Max. :24319
> summary(y)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4644 10560 14950 15050 19900 24320
> install.packages("outliers")
Installing package into ‘C:/Users/C24-11/Documents/R/win-library/3.2’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/3.2/outliers_0.14.zip'
Content type 'application/zip' length 52663 bytes (51 KB)
downloaded 51 KB
package ‘outliers’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\C24-11\AppData\Local\Temp\Rtmpw9vBez\downloaded_packages
> outliers(y)
Error: could not find function "outliers"
> outliers::outlier(y)
> y[y<4645] <-NA
> data$dummy <- ifelse(data$rok > 2003, c(1), c(0))
# zpožděné proměnné
shift<-function(x,shift_by){
stopifnot(is.numeric(shift_by))
stopifnot(is.numeric(x))
if (length(shift_by)>1)
return(sapply(shift_by,shift, x=x))
out<-NULL
abs_shift_by=abs(shift_by)
if (shift_by > 0 )
out<-c(tail(x,-abs_shift_by),rep(NA,abs_shift_by))
else if (shift_by < 0 )
out<-c(rep(NA,abs_shift_by), head(x,-abs_shift_by))
else
out<-x
out
}
setwd("C:/Users/C24-21.ADDS.002/Desktop")
data = read.csv("mzdy.csv", header=T, sep=";");
y = data$mzda;
x = data$rok;
data <- data.frame(y,x)
data$yl <- shift(data$y, -1)
data$yl3 <- shift(data$y, -3)
data$yd <- (data$y - data$yl)
data$yr <- (data$y / data$yl)
spotreba = read.csv("spotreba.csv", header=T, sep=";");
y = spotreba$Sp_VM
x1 = spotreba$SpC_VM
x2 = spotreba$SpC_HM
x3 = spotreba$SpC_DM
x4 = spotreba$Prijem
x0 = matrix(1,11,1)
X = cbind(x0, x1, x2, x3, x4)
A = t(X)%*%X
B = solve(A)
C = t(X)%*%y
coef = B%*%C
View(coef)
fit = lm(formula = y~x1+x2+x3+x4, data = spotreba)
summary(fit)
#LRM verifikace
data = read.csv("spotreba.csv", header = T, sep = ";")
data$konst <- c(1)
y = data$Sp_VM
x0 = data$konst
x1 = data$SpC_VM
x2 = data$SpC_HM
x3 = data$SpC_DM
x4 = data$Prijem
X = cbind(x0, x1, x2, x3, x4)
K = solve(t(X)%*%X)
coef = K%*%t(X)%*%y
View (coef)
teor = X%*%coef # vektor teoretickych hodnot zavisle promenne
res = y - teor # vektor rezidui
RSS = sum(res*res)
a = y-mean(y)
TSS = sum(a*a)
KD = 1-(RSS/TSS)
n = length(y)
k = length(coef)
KKD = 1 - (1 - KD) * ((n - 1)*(n - k))
KRR = RSS/(n-k)
Rg1 = KRR*K[1,1]
SCHg1 = sqrt(Rg1)
tg1 = coef[1,1]/SCHg1
fit = lm(y~x1+x2+x3+x4)
summary(fit)
tseries::jarque.bera.test(res)
lmtest::dwtest(y~x1+x2+x3+x4)
#Dalsi triky s R:
print(cbind(rbind(1,2,3),rbind(4,5,6)))
print(rbind(cbind(1,2,3),cbind(4,5,6)))
#rbind() = sklada prvky pod sebe (rows)
#cbind() = sklada prvky vedle sebe (columns)
#takze lze jejich kombinaci zadat matici po sloupcich i po radkach podle potreby..
#print() = jako View() ale vypisuje do stavajici konzole misto otvirani novyho okna
cvicebnice do strany 28
#odhad logaritmickyho modelu ala gretl
setwd("C:/Users/C24-17.ADDS.001/Desktop")
data = read.csv("spotreba.csv", header = T, sep = ";")
data$konst <- c(1)
ly = log(data$Sp_VM)
x0 = data$konst
lx1 = log(data$SpC_VM)
lx2 = log(data$SpC_HM)
lx3 = log(data$Prijem)
#prvni zpusob vypoctu odhadu
fit = lm(ly~lx1+lx2+lx3)
summary(fit)
coef = fit$coefficients
gama0 = exp(coef[1])
#druhej zpusob vypoctu odhadu
X = cbind(x0, lx1, lx2, lx3)
A = t(X)%*%X
B = solve(A)
C = t(X)%*%ly
coef2 = B%*%C
View(coef2)
#nejsem si jistej co to pocita, ale dela se to takhle :-D
setwd("C:/Users/C24-17.ADDS.001/Desktop")
data = read.csv("tq.csv", header = T, sep = ";")
data$konst <- c(1)
y = 1/data$y1
x = 1/data$x1
#c = data$konst
fit = lm(y~x)
summary(fit)
#vypocet parametru
coef = fit$coefficients
gama0 = 1/coef[1]
gama1 = coef[2]*gama0
#testy
#install.packages("lmtest") #staci nainstalovat jednou
lmtest::reset(y~x) #ramseyuv RESET test
lmtest::bptest(y~x) #Breusch–Pagan test - vypocet heteroskedasticity
===== Prognostika =====
==== Jak prevyst data z GRETLU do R ====
Gretl -> nastroje -> spustit GNU R.
otevre se vam novy okno s R a jsou v nem prednacteny data z gretlu v objektu ''gretldata'' prvni promenna z gretlu je tam teda jako ''gretldata[,1]'', dalsi jako ''gretldata[,2]'', atd... muzete si to zkusit vypsat/vykreslit
* ''print(gretldata);''
* ''plot(gretldata);''
* ''print(gretldata[,1]);''
* ''plot(gretldata[,1]);''
==== Sezonni ocisteni dat ====
dejme tomu, ze chcem sezonne ocistit prvni promennou z gretlu. pouzijem tyhle prikazy:
fit <- stl(gretldata[,1], s.window=12)
print(fit)
plot(fit)
dev.copy(png,'myplot.png')
#dev.copy(svg,'myplot.svg')
dev.off()
write.csv2(fit$time.series, file="vystup.csv")
* ''gretldata[,1]'' je vstupni vektor (nactenej z gretlu), ''s.window=12'' udava, ze mame 12 udaju rocne (data po mesicich).
* Pokud chcete ulozit graf do png prikazy ''dev....'', tak nesmite zavrit okno s grafem
* Pokud chceme data z objektu ulozit do csv, musime pristupovat primo ke komponente casovych rad, ktera je dostupna jako ''fit$time.series''
* write.csv2 oddeluje desetiny cisla carkama, write.csv je oddeluje teckama. zvolte co je pro vas lepsi.