HSAUR3/MD5
c9f54f30f9fdddb0b601516ec092d548 *DESCRIPTION
08c7c9e8acb0266eb09c9bf07a1104b5 *NAMESPACE
e037b0b716fc388cc2cfbded5b1300a4 *R/Rwelcome.R
37b36983346f76358360694e26d29ffa *R/citations.R
8963a5301ce00df40992a68de11bbae0 *R/isi2bibtex.R
ab7787d3daf0f58ff87020f01176a6c9 *R/tables.R
70f58684583c52bfa5a1b529d7bc17c6 *R/tools.R
26fdc3be860fdaceee40445cc8fb188e *build/vignette.rds
c2f6e0311f75e562b4b5b9894ad50ae7 *cleanup
df8649b68d6bf843721bc45b41b00af0 *data/BCG.rda
c82373ad9d5a787d954946e6eef6bc79 *data/BtheB.rda
5ae72d63282fbce488116970cffa81d4 *data/CHFLS.rda
69bae06cee8564b8fc3c9b86cdcafcdb *data/CYGOB1.rda
37298bce9f0070ac1c2ed4d2b1a60a31 *data/EFT.rda
abda8c8ace1989df8932bbfa0f8fe758 *data/Forbes2000.rda
44ffc0c2017114639ae5fe3279fa6e53 *data/GHQ.rda
fe6f3b873d9316c47b62dc2f91889042 *data/Lanza.rda
7ca4bc98f164865c66fc53a8593b1ba1 *data/Smoking_DollHill1950.rda
2dba6678dfb073393c9feb6c3c25f347 *data/Smoking_Mueller1940.rda
a21c54fc743de7a24706a608eaa6c1be *data/Smoking_SchairerSchoeniger1944.rda
75605f856874f562a8fd79c6dcdbedcc *data/Smoking_Wassink1945.rda
702a4fa288f4faf6b2f656113cb410ea *data/USairpollution.rda
d3c9e4da1933ae8dbdbca61cb4673eed *data/USmelanoma.rda
3770935a11dc4d03caefa87726a4f22e *data/USstates.rda
39c28a87cd54cb3f7bcf679fbb193dce *data/UStemp.rda
6b8a2c608e761158fd477bbe5192e808 *data/agefat.rda
79ef95e2fc7acd6bd1cc9fa0bdd0f862 *data/aspirin.rda
1aa0f055c3c3dc4b12b685de841b2d27 *data/backpain.rda
7b8d9071228ccbbad1358015a167b3a5 *data/birds.rda
2f848c3bf90cc8f3df4bcfec2fcc6a29 *data/birthdeathrates.rda
62b85263756f5b1f6fe23d94ffd22ed2 *data/bladdercancer.rda
3d8c3a976f06de590dc1ec83dab5422a *data/bp.rda
d797dc92ee9380bc3f7b9ecbb73692b5 *data/clouds.rda
242f3cfc57b2221add80d615ba0f115b *data/epilepsy.rda
54b09296243597c425377f708ae2bca3 *data/foster.rda
05e0585e480d1ea0f5e52d8cbaadf9da *data/gardenflowers.rda
b12591b2df301668f696d0e7e62dda3e *data/heptathlon.rda
ff375252bcdcfe1f8d062111419c2347 *data/household.rda
91c64feed83438aaf12df6903a807dab *data/mastectomy.rda
bb84090c1aed38d71eac51289e787261 *data/men1500m.rda
2bfe58e4f9fea1a68d0df4187d96caee *data/meteo.rda
175e1008cf118422a9c446893e56332a *data/orallesions.rda
1e6d6a71ad43c52f326bf3038c976ec1 *data/phosphate.rda
4c11830d296714a2825e426114d20fe9 *data/pistonrings.rda
cf7d0f6e8fc98270121c646ea7e07b0b *data/planets.rda
011081ee1f5e341db75fcfd1ddea02fc *data/plasma.rda
0c4884d475d6ecb3a52ba2a39dfbd01d *data/polyps.rda
feee75f3feb0c729a4c349bc56565c45 *data/polyps3.rda
451793a58c3becfec50361003b6a3427 *data/pottery.rda
799a6aaae1a298bb5e02404f65c73ea7 *data/rearrests.rda
dbf9e96c0dc321ef85c8b6490f018d4f *data/respiratory.rda
a9de191caeebe385f24b38e8be4ee11a *data/roomwidth.rda
7878cc76d27e0136ccc3dcc1647025a8 *data/schizophrenia.rda
9b18489b28b3fc9647442bd7b3ba258d *data/schizophrenia2.rda
9c2bb25a8d9819bee49647e4c20ce2c2 *data/schooldays.rda
70aa798e309696fc34f92b92adb79185 *data/skulls.rda
65330010da3e58d360dd985ee03eb6a5 *data/smoking.rda
09e1451a7c53a8b6c35c80e6d656bd05 *data/students.rda
4de30be86172b245ee0f161b63fa89da *data/suicides.rda
cd05b12e304910229e2641df194e1830 *data/suicides2.rda
de83514e94506b964c5425372ea73de8 *data/toenail.rda
e14fc311f2b36ba3128aa4dc7ed9bed8 *data/toothpaste.rda
748d06879c649eb69c117c52f59914f6 *data/voting.rda
090fa9d5ce11b9f2e230494a76f0ecd2 *data/water.rda
0dbf6f420ce54d4da559567db7107f4a *data/watervoles.rda
ed4c085f3b18f439263e5ce0ecfc11dc *data/waves.rda
616f5636cc18a08aa585710a12cd8fae *data/weightgain.rda
b5551cfd36a91a26caeb1c1448082164 *data/womensrole.rda
dc1dc7889a7d17b495fc08427577bde2 *inst/LaTeXBibTeX/HSAUR.bib
b49cfcbd98e9d5acfe8536cb4eb37f0e *inst/LaTeXBibTeX/refstyle.bst
06e4b368936e0fbb227b42494c985873 *inst/LaTeXBibTeX/setup.Rnw
1fbd75eb3542d43da5e3220d8644c18f *inst/NEWS
8c85522a291e4492cdbd6919215d5ad3 *inst/cache/DE-bootpara.rda
0383f1729295c85c3869deaf5e0724d9 *inst/doc/Ch_analysing_longitudinal_dataI.R
0814ae2054014967781371ef549e9c8d *inst/doc/Ch_analysing_longitudinal_dataI.Rnw
6cba093a49696e7e5a303a6129aeeea6 *inst/doc/Ch_analysing_longitudinal_dataI.pdf
bd577c486b60681dff86e5e22bb01f8a *inst/doc/Ch_analysing_longitudinal_dataII.R
6697f18d269facd65cfe2bf87e6a4861 *inst/doc/Ch_analysing_longitudinal_dataII.Rnw
0b016281b375894ae0b3789dc00f4eaf *inst/doc/Ch_analysing_longitudinal_dataII.pdf
3e13fd9c07f69d124a61ca93a731c570 *inst/doc/Ch_analysis_of_variance.R
f3fb91a7a06195cc19b7e751f4c779d8 *inst/doc/Ch_analysis_of_variance.Rnw
42221fc14be43cfe9acfd3fffaeaa502 *inst/doc/Ch_analysis_of_variance.pdf
d34e7caae766d91f920c820d3cbd93c6 *inst/doc/Ch_bayesian_inference.R
6ac6083e89b50a65aea2dd47aa426198 *inst/doc/Ch_bayesian_inference.Rnw
0494f1a5b962da4465f82885015deeb3 *inst/doc/Ch_bayesian_inference.pdf
a4199159472a94041fd516a51bda54b4 *inst/doc/Ch_cluster_analysis.R
5498a10bd6c199daea66dff32806aaa3 *inst/doc/Ch_cluster_analysis.Rnw
967bd55a38d26ff3b7330eb8e258c030 *inst/doc/Ch_cluster_analysis.pdf
ccf2420a35cac3611521f005a49e3235 *inst/doc/Ch_conditional_inference.R
229afa54a67c3de1c0e3d3a0d4001a3d *inst/doc/Ch_conditional_inference.Rnw
1ad794d26914789db60af258ce94c401 *inst/doc/Ch_conditional_inference.pdf
acb595c7bb23ecac06ba6d19ac3708fb *inst/doc/Ch_density_estimation.R
8ca53fb1b54170e4a7263c43ff6b02ac *inst/doc/Ch_density_estimation.Rnw
95f68dac486e5dac50d00114fb88489b *inst/doc/Ch_density_estimation.pdf
549534b89fc35c849bedddb98c96c50e *inst/doc/Ch_errata.R
08dd8a45ccab98efb3c80a5ba3459f02 *inst/doc/Ch_errata.Rnw
e6461ba69386cff8477ed4380bf57fd5 *inst/doc/Ch_errata.pdf
c7ed7e6df3e0a65e0e22f5547f5de42c *inst/doc/Ch_gam.R
a0684b721d8b22867cc4d2be11afda7c *inst/doc/Ch_gam.Rnw
edce7a3ef5736ffa1e53de85a3aece46 *inst/doc/Ch_gam.pdf
a8279615ff821847948273b06a90791f *inst/doc/Ch_graphical_display.R
255e1df7e7e21aa35cedde119741dcd3 *inst/doc/Ch_graphical_display.Rnw
b37130698e4cd5ee6ac81e6e6bc2d600 *inst/doc/Ch_graphical_display.pdf
de088bed10856840b8ed2529927a6875 *inst/doc/Ch_introduction_to_R.R
f983af71fd2bc668a5e2edf2ae226eb3 *inst/doc/Ch_introduction_to_R.Rnw
ee66838fdd447d16965bdc8f3055655b *inst/doc/Ch_introduction_to_R.pdf
8adc888262955dce7d7075b45df6d6b0 *inst/doc/Ch_logistic_regression_glm.R
cbf07d7ca92481bde11ba98077955e8b *inst/doc/Ch_logistic_regression_glm.Rnw
9d82277ff5529590952fbc54b2170625 *inst/doc/Ch_logistic_regression_glm.pdf
81e50e1464d6d9e97bb65b32bb386037 *inst/doc/Ch_meta_analysis.R
7495a1d40e34166b09835abfa91d82b8 *inst/doc/Ch_meta_analysis.Rnw
59b2eaf967c4321732de502ef76e0238 *inst/doc/Ch_meta_analysis.pdf
a5a0c1194a76b18012f596c471ee8378 *inst/doc/Ch_missing_values.R
b20231ae22cdc7610b80adfe259f7144 *inst/doc/Ch_missing_values.Rnw
436079434dfcd41a3c38ad021e9c3af6 *inst/doc/Ch_missing_values.pdf
0e51a696bedad62c09e708caeed2240c *inst/doc/Ch_multidimensional_scaling.R
c56bf8040fe4b91e83877830e1a756f6 *inst/doc/Ch_multidimensional_scaling.Rnw
92e0a584b9c4dc72896f32fc8529f646 *inst/doc/Ch_multidimensional_scaling.pdf
dc89da0d9cdf1819fb0e86157b68cca2 *inst/doc/Ch_multiple_linear_regression.R
6b093667c08316669a41dc3a7f6958d1 *inst/doc/Ch_multiple_linear_regression.Rnw
63567342817545527fce14771c0e62e8 *inst/doc/Ch_multiple_linear_regression.pdf
854ece30828d8c5ac04ffc42c5e41fab *inst/doc/Ch_principal_components_analysis.R
4d4ff5d0699d23a7c3840f10e72e5106 *inst/doc/Ch_principal_components_analysis.Rnw
a8c07e111552a9d900c388fdd6234274 *inst/doc/Ch_principal_components_analysis.pdf
cdee07aa026b67bf5f47d68fdfaa37e9 *inst/doc/Ch_quantile_regression.R
51086648b3c1409837a0a818a30395d6 *inst/doc/Ch_quantile_regression.Rnw
4dd95131150efda050d0fbe74078248d *inst/doc/Ch_quantile_regression.pdf
2c99bb333ea59dd03a917fd837f27a4c *inst/doc/Ch_recursive_partitioning.R
80800ff360858c127a29e92210a79d3a *inst/doc/Ch_recursive_partitioning.Rnw
41140ff53f5b159f6aec08caf5649acf *inst/doc/Ch_recursive_partitioning.pdf
8a51b6ed00cb6d2e5472470c73af57ab *inst/doc/Ch_simple_inference.R
5bdcfcdb82c367d215615331bc49cb43 *inst/doc/Ch_simple_inference.Rnw
522281a6f884b37e89bcc08049e0ddfa *inst/doc/Ch_simple_inference.pdf
46a7cef0348cad008ac807736efb59c5 *inst/doc/Ch_simultaneous_inference.R
1762dc48aa1b5e44e1a8a34d979f3d56 *inst/doc/Ch_simultaneous_inference.Rnw
937b5df83eb3c4fcff6a8c934827de2b *inst/doc/Ch_simultaneous_inference.pdf
c519d01e44f0bdc0331976c176edfd27 *inst/doc/Ch_survival_analysis.R
b50eada6d5ffbe95546081208016bf42 *inst/doc/Ch_survival_analysis.Rnw
49597ea84cad1b78bc87e52497aa87d4 *inst/doc/Ch_survival_analysis.pdf
3f3f012255088e103a4539b3f5288389 *inst/slides/Ch_analysing_longitudinal_dataI.Rnw
eb1c173570420b948643b9c658a81f7a *inst/slides/Ch_analysing_longitudinal_dataII.Rnw
871aa5d22de1ce9ad777aaca19fb289f *inst/slides/Ch_analysis_of_variance.Rnw
473a113cae53bea78d56fd1da83b3917 *inst/slides/Ch_cluster_analysis.Rnw
15620d7175e48656ecbc2fad998a8325 *inst/slides/Ch_conditional_inference.Rnw
7c7a9b69e5b55829848373d96e3d688c *inst/slides/Ch_density_estimation.Rnw
187e9a3c6a9b6549573de38bc9736747 *inst/slides/Ch_graphical_display.Rnw
204305320bd3e31a2ff28abc9a23c529 *inst/slides/Ch_introduction_to_R.Rnw
33e517f46472c6743c24fe4750fb27d7 *inst/slides/Ch_logistic_regression_glm.Rnw
a7c5a4d66dd22fafc6e4dbacc003d40d *inst/slides/Ch_multiple_linear_regression.Rnw
0a89260d948c4971e6af97fbd463ee3d *inst/slides/Ch_recursive_partitioning.Rnw
1e16bbc277e8be5423cad3db2c4cf650 *inst/slides/Ch_simple_inference.Rnw
707663774a2995d62d2f583146052444 *inst/slides/Ch_survival_analysis.Rnw
592ae1055a5c7af852ba5b6faed2f650 *inst/slides/HSAUR3_slides_4up.tex
9a5b5d322f69b1a652245a6d5cd0be5f *inst/slides/HSAUR_title.Rnw
b587cb5f1561e6a8e7c45650df5ac148 *inst/slides/beamerthemeHSAUR.sty
53e42ba8f0222b5225ffd8f194c832dc *inst/slides/definitions.tex
d4334ab5d760854952312ef3346ecade *inst/slides/graphics/HSAUR.jpg
aa9a5d45b7d56c6c09b2de17b2799fc3 *inst/slides/graphics/Rlogo.jpg
8f38990c110066034cd66e04e2d724b6 *inst/slides/setup.R
8445b9caeb489ee7f9b11cc4116aacfb *inst/slides/tables/CA_perm.tex
b6509f4091b26915ba508a422de52774 *inst/slides/tables/CI_rtimesc.tex
4f4809f5350f593c0a7748030dfab90a *inst/slides/tables/Lanza.tex
4ddd561262f148ffda425a71fa1cf548 *inst/slides/tables/MLR-ANOVA-tab.tex
fe83d92b9c60b59260f25f5e1d649746 *inst/slides/tables/MLR-Xtab.tex
3c9802a4e3b16cbd51adb92b195b0d39 *inst/slides/tables/PCA_tab.tex
3e45eab10b07e5f68353d73eca518705 *inst/slides/tables/SI_mcnemar.tex
41e395ca7cf9dca56e527521d3f2f112 *inst/slides/tables/SI_rtimesc.tex
32013c89e73d1cc57325c172da41df2f *inst/slides/tables/exMDS.tex
ad5988ab88c5fccb7ae8e00ced1fed5d *inst/slides/tables/rec.tex
6bc333031969a6a3e2642e358a4ad336 *inst/slides/title_UZH.tex
f9ba0408a428d4b8b6bc7587e352c0d6 *man/BCG.Rd
83a780cac81101abe73e97cef7af269f *man/BtheB.Rd
379776bb9a7844c3e22b64543269b007 *man/CHFLS.Rd
e51aa354de3c1169e0cd47a6e7482a5d *man/CYGOB1.Rd
a0944f758c1cdba392ae83bc87abcfa3 *man/EFT.Rd
c3428fec98ec7de3085ae5c77629e9f6 *man/Forbes2000.Rd
6ec1adf517271eea87056e836db81eba *man/GHQ.Rd
ae9256d8d7a60e88133fe22731a7e36a *man/HSAURtable.Rd
7f1c2336e25535d1e384eb821316bf5c *man/Lanza.Rd
c54a25e18269a7dfda6a59f4d35f20a2 *man/Smoking_DollHill1950.Rd
d9464c06b71705f979cbb553c30d032a *man/Smoking_Mueller1940.Rd
00ca6dee070fdaed2b333cad631d075a *man/Smoking_SchairerSchoeniger1944.Rd
f415d2b6de8223beeaca670991ad00fe *man/Smoking_Wassink1945.Rd
dff06334210986f77a36a044fb572b35 *man/USairpollution.Rd
a9689f3398363fc241e747c3abeeaf0d *man/USmelanoma.Rd
4d5e320117cfbe6e00d67c51d69870c2 *man/USstates.Rd
0a13a0c6b2e9d72a1a053a8c2999d7d6 *man/UStemp.Rd
b7d2f083161dfd796b4ef128182e9ff9 *man/agefat.Rd
823606625813acebabc08b0671ebe2e8 *man/aspirin.Rd
a1b1bab8e7da52b6f26123776602dbc2 *man/backpain.Rd
a9f6790ab5f99fcefcf6896914489faa *man/birds.Rd
9de00ee110f9a5568fb0de5d58235acd *man/birthdeathrates.Rd
27b5151aa085e50ea74734c87c6c2fbf *man/bladdercancer.Rd
0b7591d442d5f7878f0db0ff4c3db44d *man/bp.Rd
246b944215ade8b536c986fc5715f8c2 *man/clouds.Rd
83e1004438d51d9f8cf2ab25fea545e2 *man/epilepsy.Rd
c51b5ec6fa0db139705350fe9d5578a6 *man/foster.Rd
2f98524fb276e3e06ad62cdb0f913fe6 *man/gardenflowers.Rd
ffb04966dd8cc6b3fd8913c9165453cf *man/heptathlon.Rd
a21345831ad55584895c988b9663e743 *man/household.Rd
2aff2e93e618b11c058d93e8a7d7e05b *man/mastectomy.Rd
39266b6b7317624e4c45e5b33910c4c4 *man/men1500m.Rd
289ef07b173e4fa85458a2d143c67380 *man/meteo.Rd
5c2f1e8f2f8e51063856652c790fafb3 *man/orallesions.Rd
6c9a990e72933be538f041fa163585e3 *man/phosphate.Rd
1569cc6868cd96f46589df8fff508fed *man/pistonrings.Rd
01f973eba2b876c59446646cdd0f3374 *man/planets.Rd
cc81da706e7fe538678a4baefca4f40c *man/plasma.Rd
1eb4458aa3000bc3b3caa2c81df27c9a *man/polyps.Rd
a417e3f3f85f2a378f4dbb4a45e6df98 *man/polyps3.Rd
4532e74368374d84f95b5dc4fc8bd41b *man/pottery.Rd
c63298442837c6081f72dfb65c722a1b *man/rearrests.Rd
02e1d715f434389a8c690981f708e297 *man/respiratory.Rd
883a98c90fcec8a5000a2d9ccf0a33d9 *man/roomwidth.Rd
84b1ad81a1a15eb9e300a7be821e5ee6 *man/schizophrenia.Rd
41ce7503dfb0eba06386f60786d801e0 *man/schizophrenia2.Rd
74e12e9a00327160c43c9581f0b74ed7 *man/schooldays.Rd
68d188277c29586ca5ab6c8b997eba11 *man/skulls.Rd
1f66ee2d9a676eaf63edbbbdb079f22c *man/smoking.Rd
bb2199ecde8e220413749c4452d5dad1 *man/students.Rd
07381875ae4b87ee7191c7e7fa7dfc2b *man/suicides.Rd
0c294da81d8904adb8c6813d691555ff *man/suicides2.Rd
f8b1d69e7c4e6272de4112cf97295ff3 *man/toenail.Rd
335d80e2167024443a91769bc70d512c *man/toothpaste.Rd
fdcb5fbb6718fe3af1f7424e95a50afa *man/voting.Rd
a795f58cf6cee55792c56a84dbcd2797 *man/water.Rd
a70fff4537501e7a79575b9562cce449 *man/watervoles.Rd
399ec85c35a2dedcfd0c46e9342a8ba4 *man/waves.Rd
3656ef7811f97cf8cf153e597e5ea1b1 *man/weightgain.Rd
9d04e1d5053b38bd900f4c0b2a48584d *man/womensrole.Rd
0814ae2054014967781371ef549e9c8d *vignettes/Ch_analysing_longitudinal_dataI.Rnw
6697f18d269facd65cfe2bf87e6a4861 *vignettes/Ch_analysing_longitudinal_dataII.Rnw
f3fb91a7a06195cc19b7e751f4c779d8 *vignettes/Ch_analysis_of_variance.Rnw
6ac6083e89b50a65aea2dd47aa426198 *vignettes/Ch_bayesian_inference.Rnw
5498a10bd6c199daea66dff32806aaa3 *vignettes/Ch_cluster_analysis.Rnw
229afa54a67c3de1c0e3d3a0d4001a3d *vignettes/Ch_conditional_inference.Rnw
8ca53fb1b54170e4a7263c43ff6b02ac *vignettes/Ch_density_estimation.Rnw
08dd8a45ccab98efb3c80a5ba3459f02 *vignettes/Ch_errata.Rnw
a0684b721d8b22867cc4d2be11afda7c *vignettes/Ch_gam.Rnw
255e1df7e7e21aa35cedde119741dcd3 *vignettes/Ch_graphical_display.Rnw
f983af71fd2bc668a5e2edf2ae226eb3 *vignettes/Ch_introduction_to_R.Rnw
cbf07d7ca92481bde11ba98077955e8b *vignettes/Ch_logistic_regression_glm.Rnw
7495a1d40e34166b09835abfa91d82b8 *vignettes/Ch_meta_analysis.Rnw
b20231ae22cdc7610b80adfe259f7144 *vignettes/Ch_missing_values.Rnw
c56bf8040fe4b91e83877830e1a756f6 *vignettes/Ch_multidimensional_scaling.Rnw
6b093667c08316669a41dc3a7f6958d1 *vignettes/Ch_multiple_linear_regression.Rnw
4d4ff5d0699d23a7c3840f10e72e5106 *vignettes/Ch_principal_components_analysis.Rnw
51086648b3c1409837a0a818a30395d6 *vignettes/Ch_quantile_regression.Rnw
80800ff360858c127a29e92210a79d3a *vignettes/Ch_recursive_partitioning.Rnw
5bdcfcdb82c367d215615331bc49cb43 *vignettes/Ch_simple_inference.Rnw
1762dc48aa1b5e44e1a8a34d979f3d56 *vignettes/Ch_simultaneous_inference.Rnw
b50eada6d5ffbe95546081208016bf42 *vignettes/Ch_survival_analysis.Rnw
dc1dc7889a7d17b495fc08427577bde2 *vignettes/LaTeXBibTeX/HSAUR.bib
b49cfcbd98e9d5acfe8536cb4eb37f0e *vignettes/LaTeXBibTeX/refstyle.bst
06e4b368936e0fbb227b42494c985873 *vignettes/LaTeXBibTeX/setup.Rnw
5a132c6c9e7994b2c29f42b4e3b68901 *vignettes/chapman.cls
a0e5e5d1fb2b8adb4e400728239b4fb4 *vignettes/graphics/Rlogo_bw.png
8445b9caeb489ee7f9b11cc4116aacfb *vignettes/tables/CA_perm.tex
b6509f4091b26915ba508a422de52774 *vignettes/tables/CI_rtimesc.tex
51c3707ffceeebaf14994b7806f079c1 *vignettes/tables/Lanza.tex
0e8b881930d42ea4ecabbd14b209e417 *vignettes/tables/MA_table.tex
4ddd561262f148ffda425a71fa1cf548 *vignettes/tables/MLR-ANOVA-tab.tex
fe83d92b9c60b59260f25f5e1d649746 *vignettes/tables/MLR-Xtab.tex
3c9802a4e3b16cbd51adb92b195b0d39 *vignettes/tables/PCA_tab.tex
d0af86886c7e0333dbf0392f94171b61 *vignettes/tables/PCA_tab1.tex
7b1e1bfa03dd8ccfa67b4b1ddce7e91d *vignettes/tables/SI_mcnemar.tex
b6509f4091b26915ba508a422de52774 *vignettes/tables/SI_rtimesc.tex
32013c89e73d1cc57325c172da41df2f *vignettes/tables/exMDS.tex
d8ddaea43747ef1481c2116af519ceba *vignettes/tables/rec.tex
HSAUR3/R/tables.R
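### helpers used to typeset the book's data sets as LaTeX tables:
### `isep' collapses a character vector into a single `&'-separated table row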
isep <- function(x)
paste(paste(x[-length(x)], "&", collapse = " "),
x[length(x)], collapse = " ")
caption <- function(xname, label, caption, pkg = NULL) {
RET <- paste("\\caption{\\Robject{", xname, "} data",
sep = "", collapse = "")
if (!is.null(pkg))
RET <- paste(RET, " (package \\Rpackage{", pkg, "})",
sep = "", collapse = "")
RET <- paste(RET, ". ", caption, sep = "", collapse = "")
RET <- paste(RET, paste("\\label{", label, "}}",
sep = "", collapse = ""))
return(RET)
}
HSAURtable <- function(object, ...)
UseMethod("HSAURtable")
HSAURtable.data.frame <- function(object, xname = deparse(substitute(object)),
pkg = NULL, nrows = NULL,...) {
digits <- 0:6
table <- matrix("0", nrow = nrow(object), ncol = ncol(object))
xcc <- object[complete.cases(object),]
for (i in 1:ncol(object)) {
if (is.numeric(xcc[[i]])) {
d <- min(which(sapply(digits,
function(d)
max(abs(xcc[[i]] - round(xcc[[i]], d))) <
sqrt(.Machine$double.eps))))
table[,i] <- formatC(object[[i]], digits = digits[d], format = "f")
} else {
table[,i] <- as.character(object[[i]])
}
}
if (!is.null(nrows)) table <- rbind(table[1:nrows,,drop = FALSE], "$\\vdots$")
RET <- list(xname = gsub("_", "\\\\_", xname),
pkg = pkg,
varnames = colnames(object),
rownames = rownames(object),
data = table)
class(RET) <- "dftab"
return(RET)
}
HSAURtable.table <- function(object, xname = deparse(substitute(object)),
pkg = NULL,...) {
xtab <- matrix(as.character(object), nrow = nrow(object),
ncol = ncol(object))
RET <- list(xname = gsub("_", "\\\\_", xname),
pkg = pkg,
varnames = names(dimnames(object)),
data = rbind(c(" ", dimnames(object)[[2]]),
cbind(dimnames(object)[[1]], xtab)))
class(RET) <- "tabtab"
return(RET)
}
toLatex.tabtab <- function(object, caption = NULL, label = NULL,
topcaption = TRUE, index = TRUE, ...) {
RET <- c()
nc <- ncol(object$data)
object$varnames <- gsub("_", "\\\\_", object$varnames)
if (index)
RET <- c(RET, paste("\\index{", object$xname, " data@\\Robject{",
object$xname, "} data}", sep = ""))
RET <- c(RET, "\\begin{center}")
RET <- c(RET, paste("\\begin{longtable}",
paste("{", paste(rep("r", nc + 1), collapse = ""), "}")))
if (topcaption)
RET <- c(RET, caption(object$xname, label, caption, object$pkg),
"\\\\")
RET <- c(RET, paste(" & & \\multicolumn{", nc - 1, "}{c}{\\Robject{",
object$varnames[2], "}} \\\\", collapse = ""))
object$data <- cbind(c(paste("\\Robject{", object$varnames[1], "}",
collapse = ""),
rep(" ", nrow(object$data) - 1)), object$data)
RET <- c(RET, apply(object$data, 1, function(x) paste(isep(x), "\\\\")))
if (!topcaption)
RET <- c(RET, caption(object$xname, label, caption, object$pkg))
RET <- c(RET, "\\end{longtable}")
RET <- c(RET, "\\end{center}")
class(RET) <- "Latex"
return(RET)
}
toLatex.dftab <- function(object, pcol = 1, caption = NULL,
label = NULL, rownames = FALSE, topcaption = TRUE, index = TRUE, ...) {
nc <- ncol(object$data)
object$varnames <- gsub("_", "\\\\_", object$varnames)
if (pcol > 1) {
nr <- ceiling(nrow(object$data) / pcol)
object$data <- rbind(object$data, matrix(" ",
nrow = nr * pcol - nrow(object$data),
ncol = nc))
d <- NULL
for (i in 1:pcol)
d <- cbind(d, object$data[((i - 1) * nr + 1):(i * nr),])
object$data <- d
}
RET <- c()
if (index)
RET <- c(RET, paste("\\index{", object$xname, " data@\\Robject{",
object$xname, "} data}", sep = ""))
RET <- c(RET, "\\begin{center}")
if (rownames)
RET <- c(RET,
paste("\\begin{longtable}{l", paste(rep(paste(rep("r", nc),
collapse = ""), pcol),
collapse = "|"), "}",
collapse = ""))
else
RET <- c(RET,
paste("\\begin{longtable}{", paste(rep(paste(rep("r", nc),
collapse = ""), pcol),
collapse = "|"), "}",
collapse = ""))
if (topcaption)
RET <- c(RET, caption(object$xname, label, caption, object$pkg),
"\\\\")
RET <- c(RET, "\\hline")
vn <- rep(object$varnames, pcol)
vn <- paste(paste("\\Robject{", vn, sep = ""), "}", sep = "")
if (rownames) {
RET <- c(RET, paste(" & ", isep(vn), "\\\\ \\hline"))
RET <- c(RET, "\\endfirsthead")
RET <- c(RET, paste("\\caption[]{\\Robject{", object$xname,
"} data (continued).} \\\\",
sep = "", collapse = ""))
RET <- c(RET, "\\hline")
RET <- c(RET, paste(" & ", isep(vn), "\\\\ \\hline"))
RET <- c(RET, "\\endhead")
for (i in 1:nrow(object$data))
RET <- c(RET, paste(object$rownames[i], " & ",
isep(object$data[i,]), "\\\\"))
} else {
RET <- c(RET, paste(isep(vn), "\\\\ \\hline"))
RET <- c(RET, "\\endfirsthead")
RET <- c(RET, paste("\\caption[]{\\Robject{", object$xname,
"} data (continued).} \\\\", sep = "", collapse = ""))
RET <- c(RET, "\\hline")
RET <- c(RET, paste(isep(vn), "\\\\ \\hline"))
RET <- c(RET, "\\endhead")
RET <- c(RET,
apply(object$data, 1, function(x) paste(isep(x), "\\\\")))
}
RET <- c(RET, "\\hline")
if (!topcaption)
RET <- c(RET, caption(object$xname, label, caption, object$pkg))
RET <- c(RET, "\\end{longtable}")
RET <- c(RET, "\\end{center}")
class(RET) <- "Latex"
return(RET)
}
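### A minimal usage sketch (not part of the original sources; the caption and
### label are illustrative), mirroring the Sweave chunks in vignettes/*.Rnw:
### data("roomwidth", package = "HSAUR3")
### toLatex(HSAURtable(roomwidth, pkg = "HSAUR3"), pcol = 2,
###         caption = "Guesses of room width.", label = "roomwidth-tab")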
HSAUR3/R/isi2bibtex.R
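### convert records exported from ISI Web of Knowledge (field-tagged text)
### into BibTeX entries; returns a character vector, one element per output line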
isi2bibtex <- function(file) {
journals <- rbind(c("J. Am. Stat. Assoc.", "Journal of the American Statistical Association", "JASA"),
c("J. Stat. Plan. Infer.", "Journal of Statistical Planning and Inference", "JSPI"),
c("Biom. J.", "Biometrical Journal", "BJ"),
c("Stat. Med.", "Statistics in Medicine", "SiM"))
colnames(journals) <- c("Abbr", "Title", "ID")
tfile <- tempfile()
isitxt <- readLines(file)
isitxt <- gsub("(^[A-Z][A-Z,0-9])", "\\1:", isitxt, perl = TRUE)
writeLines(isitxt, con = tfile)
isidcf <- read.dcf(tfile, fields = c("PT", "AU", "TI", "SO", "LA", "DT", "DE", "ID",
"AB", "C1", "RP", "EM", "NR", "TC", "PU", "PI", "PA", "SN",
"J9", "JI", "PD", "PY", "VL", "IS", "BP", "EP", "PG", "SC",
"GA", "UT"))
### journals only
isidcf <- isidcf[isidcf[,"PT"] == "J",]
### missings
isidcf <- isidcf[!apply(isidcf, 1, function(x) all(is.na(x))),]
### rename interesting fields
cn <- colnames(isidcf)
colnames(isidcf)[cn == "AU"] <- "author"
colnames(isidcf)[cn == "TI"] <- "title"
colnames(isidcf)[cn == "JI"] <- "journal"
colnames(isidcf)[cn == "PD"] <- "month"
colnames(isidcf)[cn == "PY"] <- "year"
colnames(isidcf)[cn == "VL"] <- "volumne"
colnames(isidcf)[cn == "IS"] <- "number"
colnames(isidcf)[cn == "UT"] <- "isitag"
colnames(isidcf)[cn == "DE"] <- "keywords"
colnames(isidcf)[cn == "TC"] <- "timescited"
colnames(isidcf)[cn == "AB"] <- "abstract"
rownames(isidcf) <- 1:nrow(isidcf)
isidcf[,"title"] <- gsub("\n", " ", isidcf[,"title"])
### author names
for (i in 1:nrow(isidcf)) {
au <- strsplit(isidcf[i,"author"], "\n")
names <- strsplit(au[[1]], ", ")
for (j in 1:length(names))
names[[j]][2] <- paste(strsplit(names[[j]][2], "")[[1]], ". ",
sep = "", collapse = "")
lastnames <- sapply(names, function(x) gsub(" ", "", x[1]))
if (length(lastnames) > 3) lastnames <- lastnames[1:3]
jour <- isidcf[i,"journal"]
indx <- journals[, "Abbr"] == jour
if (sum(indx) == 1) {
isidcf[i, "journal"] <- journals[indx, "Title"]
jkey <- journals[indx, "ID"]
} else {
jkey <- gsub("\\.* ", "", jour)
}
label <- paste(paste(lastnames, collapse = "+"), ":",
jkey, ":", isidcf[i,"year"], sep = "")
rownames(isidcf)[i] <- label
isidcf[i,"author"] <- paste(sapply(names, function(x) paste(x[2], x[1], sep = "")),
collapse = " and ")
title <- isidcf[i, "title"]
if (!identical(toupper(title), title)) {
ttmp <- strsplit(title, " ")[[1]]
lower <- tolower(ttmp) != ttmp
lower[1] <- FALSE
ttmp[lower] <- paste("{", ttmp[lower], "}", sep = "")
isidcf[i, "title"] <- paste(ttmp, collapse = " ")
}
}
    tags <- c("author", "title", "journal", "month", "year", "volume", "number",
"isitag", "abstract", "keywords", "timescited")
isidcf[is.na(isidcf[,"month"]), "month"] <- ""
for (tag in tags)
isidcf[,tag] <- paste(tag, " = {", isidcf[,tag], "},", sep = "")
pages <- paste("pages = {", isidcf[, "BP"], "--", isidcf[, "EP"], "},", sep = "")
headerkey <- paste("@article{", rownames(isidcf), ",", sep = "")
ret <- vector(mode = "list", length = nrow(isidcf))
for (i in 1:nrow(isidcf))
ret[[i]] <- c(headerkey[i], paste(" ", isidcf[i, tags]),
paste(" ", pages[i]), "}", " ")
unlist(ret)
}
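### Usage sketch (file names are hypothetical):
### bib <- isi2bibtex("records.isi")
### writeLines(bib, con = "records.bib")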
HSAUR3/R/citations.R
HSAURcite <- function(pkg) {
ct <- citation(pkg)
attr(ct, "label") <- paste("PKG:", pkg, sep = "", collapse = "")
for (n in c("note"))
ct[[n]] <- gsub("R", "\\R{}", ct[[n]])
class(ct) <- "HSAURcitation"
return(ct)
}
toBibtex.HSAURcitation <- function (object, ...)
{
z <- paste("@", attr(object, "entry"), "{", attr(object, "label"),
",", sep = "")
if ("author" %in% names(object)) {
object$author <- toBibtex(object$author)
}
for (n in names(object)) z <- c(z, paste(" ", n, " = {",
object[[n]], "},", sep = ""))
z <- c(z, "}")
class(z) <- "Bibtex"
z
}
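### Usage sketch: toBibtex(HSAURcite("coin")) prints a BibTeX entry for one
### of the packages used in the book, with label `PKG:coin'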
HSAUR3/R/tools.R
### some tools that make life easier
### copy *Rout to *Rout.save
cpRoutsave <- function(Routdir = NULL, Routsavedir = NULL) {
Routfiles <- list.files(path = Routdir, pattern = "\\.Rout$",
full.names = FALSE)
srcfiles <- file.path(Routdir, Routfiles)
destfiles <- file.path(Routsavedir,
paste(Routfiles, ".save", sep = ""))
file.copy(srcfiles, destfiles, overwrite = TRUE)
}
### attach all data frames in the global environment
gattach <- function() {
env <- globalenv()
var <- eval(parse(text = "ls()"), envir = env)
df <- sapply(var, function(x)
eval(parse(text =
paste("is.data.frame(", x, ")", sep = "", collapse = "")),
envir = env))
if (any(df)) {
var <- var[df]
out <- sapply(var, function(x)
eval(parse(text =
paste("attach(", x, ")", sep = "", collapse = "")),
envir = env))
}
}
### extract and check Robject or Rcmd LaTeX markup
extRact <- function(file, what = "Robject") {
x <- readLines(file)
indx <- grep(what, x)
out <- sapply(indx, function(i) {
obj <- NULL
while (TRUE) {
where <- regexpr(what, x[i])
if (where != -1) {
x[i] <- substring(x[i], where)
                dm <- tools::delimMatch(x[i])  ### delimMatch() is from the tools package
obj <- c(obj, (substring(x[i], dm + 1,
dm + attr(dm, "match.length") - 2)))
x[i] <- substring(x[i], dm + attr(dm, "match.length"))
} else {
break
}
}
return(obj)
})
cmds <- unique(gsub("\\\\", "", out))
gattach()
for (cmd in cmds) {
a <- try(eval(parse(text = cmd)))
if (inherits(a, "try-error")) print(a)
}
cmds
}
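### Usage sketch (chapter name illustrative): extRact("Ch_gam.tex") evaluates
### every \Robject{} snippet found in the chapter and prints the ones that
### fail; the corresponding data frames must be attachable via gattach() first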
### try to polish S{in,out}put environments, this needs
### manual refinements in some places
prettyS <- function(file, texenvironment = c("Sinput", "Soutput"),
width = 63, split = " ", write = TRUE) {
### handle Sinput or Soutput environments
texenvironment <- match.arg(texenvironment)
if (texenvironment == "Sinput" && split == " ")
split <- c(", ", "/", " ")
### dirty hack: in `Makefile's I want to call `prettyS'
### right after weaving and thus have only `file.Rnw' available
if (length(grep("Rnw$", file)) > 0) file <- gsub("Rnw$", "tex", file)
### read file
x <- readLines(file)
### remove all end-line spaces
x <- gsub("\\s+$", "", x)
### determine begin and end lines of environment
start <- grep(paste("^\\\\begin\\{", texenvironment, "\\}$",
sep = "", collapse = ""), x)
end <- grep(paste("^\\\\end\\{", texenvironment, "\\}$",
sep = "", collapse = ""), x)
if (length(start) == 0) return(NULL)
if (length(start) != length(end))
stop("unbalanced begin and end statements")
n <- length(start)
for (i in 1:n) {
### iterate over all lines longer than width
lines <- (start[i]):(end[i])
lines <- lines[sapply(x[lines], nchar) > width]
for (line in lines) {
cat("prettyS: line ", line, " too long: \n", x[line], "\n")
y <- x[line]
add <- sapply(split, function(s)
ifelse(length(grep(s, y)) > 0, nchar(s), 0))
if (all(add == 0)) next()
s <- split[min(which(add > 0))]
y <- unlist(strsplit(y, split = s))
nc <- sapply(y, nchar) + add[min(which(add > 0))]
pos <- cumsum(nc) <= width
if (!any(pos)) next()
newline <- cumsum(nc)[max(which(pos))]
plus <- length(grep("^\\+", x[line])) > 0 &&
substr(x[line], newline - 1, newline) != ", "
x[line] <- paste(substr(x[line], 1, newline), "\n",
ifelse(texenvironment == "Sinput", options("continue"), ""),
ifelse(plus, " ", ""),
" ",
substr(x[line], newline + 1, nchar(x[line])), sep = "",
collapse = "")
# if (length(grep("^\\+", x[line + 1])) > 0 &&
# (nchar(x[line + 1]) + (nchar(x[line]) - newline) < width)) {
# y <- x[line + 1]
# y <- gsub("^\\+ ", "", y)
# x[line] <- paste(x[line], y, sep = "", collapse = "")
# x[line + 1] <- ""
# }
cat("prettyS: ", x[line], "\n")
}
}
if (write)
writeLines(x, con = file)
}
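### Usage sketch (chapter name illustrative): called from a Makefile right
### after weaving, prettyS("Ch_gam.Rnw") breaks overlong lines inside the
### Sinput environments of the already weaved `Ch_gam.tex' in place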
### extract all Sinput environments from tex files
chkS <- function(file, texenvironment = "Sinput") {
### read file
x <- readLines(file)
### determine begin and end lines of environment
start <- grep(paste("^\\\\begin\\{", texenvironment, "\\}$",
sep = "", collapse = ""), x)
end <- grep(paste("^\\\\end\\{", texenvironment, "\\}$",
sep = "", collapse = ""), x)
if (length(start) == 0) return(NULL)
if (length(start) != length(end))
stop("unbalanced begin and end statements")
n <- length(start)
y <- NULL
for (i in 1:n) {
### iterate over all lines longer than width
lines <- (start[i] + 1):(end[i] - 1)
x[lines] <- gsub("^R>", "", x[lines])
x[lines] <- gsub("^\\+", "", x[lines])
y <- c(y, x[lines])
}
y
}
### read in a BibTeX file and return as list
readBibtex <- function(file = NULL) {
bib <- readLines(file)
entries <- grep("^@", bib)
labels <- gsub(",$", "", gsub("^@[A-Za-z].*\\{", "", bib[entries]))
if (any(duplicated(labels))) {
print(labels[duplicated(labels)])
stop("non-unique BibTeX labels in ", file)
}
biblist <- vector(mode = "list", length = length(entries))
for (i in 1:length(entries)) {
        nexte <- ifelse(i == length(entries), length(bib),  ### last entry extends to the end of the file
entries[i + 1] - 1)
biblist[[i]] <- bib[entries[i]:nexte]
empty <- grep("^$", biblist[[i]])
if (length(empty) > 0)
biblist[[i]] <- biblist[[i]][-empty]
}
names(biblist) <- labels
class(biblist) <- "txtBibtex"
return(biblist)
}
### the subset of a BibTeX database actually used in `file'
extractBibtex <- function(file, bibtex) {
if (!inherits(bibtex, "txtBibtex"))
bibtex <- readBibtex(bibtex)
tex <- readLines(file)
    tex <- tex[grep("\\\\cite", tex)]  ### lines containing \cite commands
enames <- gsub("\\+", "\\\\+", names(bibtex))
cited <- sapply(enames, function(name) length(grep(name, tex)) > 0)
biblist <- bibtex[cited]
class(biblist) <- "txtBibtex"
return(biblist)
}
### output to a file
toBibtex.txtBibtex <- function(object, ...) {
tmp <- lapply(object, function(bib) {
cat(paste(bib, "\n"))
cat("\n\n")
})
}
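### Usage sketch (the chapter file name is illustrative): subset the master
### bibliography to the entries cited in one chapter; toBibtex.txtBibtex
### cat()s its output, so it is redirected via sink():
### bib <- readBibtex("LaTeXBibTeX/HSAUR.bib")
### used <- extractBibtex("Ch_bayesian_inference.tex", bib)
### sink("chapter.bib"); toBibtex(used); sink()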
### set package version in BibTeX (quick'n'dirty hack)
pkgversions <- function(file) {
x <- readLines(file)
indx <- grep("VERSION", x)
for (i in indx) {
xx <- strsplit(x[i], " ")[[1]]
xx <- xx[grep("VERSION", xx)]
pkg <- gsub("[},]", "", gsub("VERSION", "", xx))
version <- packageDescription(pkg)$Version
x[i] <- gsub(paste(pkg, "VERSION", sep = "", collapse = ""), version,
x[i])
}
class(x) <- "Latex"
x
}
### set package date in BibTeX (quick'n'dirty hack)
pkgyears <- function(file) {
x <- readLines(file)
indx <- grep("PKGYEAR", x)
for (i in indx) {
xx <- strsplit(x[i], " ")[[1]]
xx <- xx[grep("PKGYEAR", xx)]
pkg <- gsub("[{},]", "", gsub("PKGYEAR", "", xx))
year <- format(as.Date(strsplit(packageDescription(pkg)$Built, ";")[[1]][3]),"%Y")
x[i] <- gsub(paste(pkg, "PKGYEAR", sep = "", collapse = ""), year,
x[i])
}
class(x) <- "Latex"
x
}
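### Usage sketch (path illustrative): both functions expect placeholders of
### the form `coinVERSION' or `coinPKGYEAR' in the BibTeX source and replace
### them by the installed package version or build year, e.g.
### writeLines(pkgversions("LaTeXBibTeX/HSAUR.bib"))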
pkgs <- function()
c("scatterplot3d", "alr3", "ape", "coin", "flexmix", "gee", "ipred", "lme4",
"mclust", "party", "randomForest", "rmeta", "vcd", "gamair", "multcomp",
"sandwich", "mboost")
HSAUR3/R/Rwelcome.R
Rwelcome <- function() {
tversion <- paste(version$major, version$minor, sep = ".")
tdate <- paste(version$year, version$month, version$day, sep = "-")
x <- c(paste("R : Copyright", version$year, "The R Foundation for Statistical Computing"),
paste("Version", tversion, paste("(", tdate, "),", sep = ""),
"ISBN 3-900051-07-0"),
" ",
"R is free software and comes with ABSOLUTELY NO WARRANTY.",
"You are welcome to redistribute it under certain conditions.",
"Type 'license()' or 'licence()' for distribution details.",
" ",
"R is a collaborative project with many contributors.",
"Type 'contributors()' for more information and",
"'citation()' on how to cite R or R packages in publications.",
" ",
"Type 'demo()' for some demos, 'help()' for on-line help, or",
"'help.start()' for an HTML browser interface to help.",
"Type 'q()' to quit R.",
">")
cat(paste(x, collapse = "\n"))
}
exename <- function() {
tversion <- paste(version$major, "0", substr(version$minor, 1, 1),
substr(version$minor,3,3), sep = "")
return(paste("rw", tversion, ".exe", sep = ""))
}
HSAUR3/cleanup
#!/bin/sh
for f in ./R/*~; do
rm -f $f
done
for f in ./man/*~; do
rm -f $f
done
for f in *~; do
rm -f $f
done
for f in .*~; do
rm -f $f
done
for f in ./tests/*~; do
rm -f $f
done
for f in ./inst/*~; do
rm -f $f
done
for f in ./tests/*.ps; do
rm -f $f
done
for f in ./inst/doc/*~; do
rm -f $f
done
for f in ./inst/doc/*.aux; do
rm -f $f
done
for f in ./inst/doc/*.bbl; do
rm -f $f
done
for f in ./inst/doc/*.blg; do
rm -f $f
done
for f in ./inst/doc/*.log; do
rm -f $f
done
for f in ./inst/doc/*.brf; do
rm -f $f
done
for f in ./inst/doc/*.out; do
rm -f $f
done
for f in ./inst/doc/*.tex; do
rm -f $f
done
for f in ./book/.RData; do
rm -f $f
done
find . -name "DEADJOE" -exec rm -f {} \;
find . -name "svn-commit*" -exec rm -f {} \;
exit 0
HSAUR3/vignettes/Ch_bayesian_inference.Rnw
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Bayesian Inference}
%%\VignetteDepends{rmeta,coin}
\setcounter{chapter}{17}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
\chapter[Bayesian Inference]{Incorporating Prior Knowledge via Bayesian Inference:
Smoking and Lung Cancer \label{BI}}
\section{Introduction}
\index{Smoking and lung cancer|(}
At the beginning of the 20th century, the death toll due to lung cancer was
on the rise and the search for possible causes began. For lung cancer in pit
workers, animal experiments showed that the so-called `Schneeberg lung
disease' was induced by radiation. But this could not explain the
increasing incidence of lung cancer in the general population. The
identification of possible risk factors was a challenge for epidemiology and
statistics, both disciplines being still in their infancy in the 1920s and 1930s.
The first modern controlled epidemiological study on the effect of smoking
on lung cancer was performed by Franz Hermann M\"uller as part of his
dissertation at the University of Cologne in 1939. The results were
published a year later \citep{HSAUR:Mueller1940}. M\"uller sent out
questionnaires to the relatives of people who had recently died of lung
cancer, asking about the smoking behavior of the deceased relative and its
intensity. He also sent the questionnaire to healthy controls to obtain
information about the smoking behavior in a control group, although it is
not clear how this control group was defined. The numbers of lung cancer
patients and healthy controls in five different groups (nonsmokers to
extreme smokers) are given in Table~\ref{BI-Smoking_Mueller1940-tab}.
<>=
data("Smoking_Mueller1940", package = "HSAUR3")
toLatex(HSAURtable(Smoking_Mueller1940),
caption = paste("Smoking and lung cancer case-control study by M\\\"uller (1940).",
"The smoking intensities were defined by the number of",
"cigarettes smoked daily:",
"1-15 (moderate), 16-25 (heavy), 26-35 (very heavy),",
"and more than 35 (extreme)."),
label = "BI-Smoking_Mueller1940-tab")
@
Four years later, Erich Sch\"oniger also wrote his dissertation on the
association between smoking and lung cancer and, together with his
supervisor Eberhard Schairer at the University of Jena, published the
results of a case-control study
\citep{HSAUR:SchairerSchoeniger1944} in which he assessed the smoking behavior
of lung cancer patients, patients diagnosed with other forms of cancer, and
also a healthy control group. The data are given in
Table~\ref{BI-Smoking_SchairerSchoeniger1944-tab}.
<>=
x <- as.table(Smoking_SchairerSchoeniger1944[,
c("Lung cancer", "Healthy control")])
toLatex(HSAURtable(x, xname = "Smoking_SchairerSchoeniger1944"),
caption = paste("Smoking and lung cancer case-control study by Schairer and Sch\\\"oniger (1944). Cancer other than lung cancer omitted.",
"The smoking intensities were defined by the number of",
"cigarettes smoked daily:",
"1-5 (moderate), 6-10 (medium), 11-20 (heavy),",
"and more than 20 (very heavy)."),
label = "BI-Smoking_SchairerSchoeniger1944-tab")
@
Shortly after the war, a Dutch epidemiologist reported on a case-control
study performed in Amsterdam \citep{HSAUR:Wassink1945} and found similar
results as the two German studies; see
Table~\ref{BI-Smoking_Wassink1945-tab}.
<>=
data("Smoking_Wassink1945", package = "HSAUR3")
toLatex(HSAURtable(Smoking_Wassink1945),
caption = paste("Smoking and lung cancer case-control study by Wassink (1945).",
"Smoking categories correspond to the categories used by M\\\"uller (1940)."),
label = "BI-Smoking_Wassink1945-tab")
@
In 1950, perhaps the most important, but not the first, case-control study
showing an increasing risk of developing lung cancer with the amount of
tobacco smoked was published in Great Britain by Richard Doll and Austin
Bradford Hill \citep{HSAUR:DollHill1950}. We restrict the discussion here to
data obtained for males; the data shown in
Table~\ref{BI-Smoking_DollHill1950-tab} correspond to the most recent amount
of tobacco consumed regularly by smokers before
disease onset \citep[Table~V in][]{HSAUR:DollHill1950}.
<>=
data("Smoking_DollHill1950", package = "HSAUR3")
x <- as.table(Smoking_DollHill1950[,,"Male", drop = FALSE])
toLatex(HSAURtable(x, xname = "Smoking_DollHill1950"),
caption = paste("Smoking and lung cancer case-control study (only males) by Doll and Hill (1950).",
"The labels for the smoking categories give the number of cigarettes smoked every day."),
label = "BI-Smoking_DollHill1950-tab")
@
Although the design of the studies by \cite{HSAUR:Mueller1940} and
\cite{HSAUR:SchairerSchoeniger1944}, especially the selection of their
control groups, can be criticized \citep[see][for a detailed
discussion]{HSAUR:Morabia2013}, and although the study by
\cite{HSAUR:DollHill1950} was larger than the older studies and obtained
more detailed information on smoking behavior by direct patient interviews,
the information provided by the earlier studies was not taken into account
by \cite{HSAUR:DollHill1950}. They cite \cite{HSAUR:Mueller1940} in their
introduction, but did not compare their findings with his results. It is
remarkable to see that both \cite{HSAUR:SchairerSchoeniger1944} and
\cite{HSAUR:Wassink1945} made extensive use of the report by
\cite{HSAUR:Mueller1940} and went as far as analyzing the merged data
\citep[Grafiek I, E, and F, in][]{HSAUR:Wassink1945}. In an informal way,
these authors wanted to use the already available information, in today's
terms called `prior knowledge', to make a stronger case with the new data.
Formal statistical methods to incorporate prior knowledge into data analysis
as part of the `Bayesian' way of doing statistical analyses were developed
in the second half of the last century, and we will focus on them in the
present chapter. \index{Smoking and lung cancer|)}
\section{Bayesian Inference}
\section{Analysis Using \R{}}
\subsection{One-by-one Analysis}
For the analysis of the four different case-control studies on smoking and
lung cancer, we will (retrospectively, of course) update our knowledge with
every new study. We begin with a re-analysis of the data described by
\cite{HSAUR:Mueller1940}. Using an approximate permutation test introduced in
Chapter~\ref{CI} for the hypothesis of independence of the amount of
tobacco smoked and group membership (lung cancer or healthy control), we get
<>=
library("coin")
set.seed(29)
independence_test(Smoking_Mueller1940, teststat = "quad",
distribution = approximate(100000))
@
and there is clearly a strong association between the number of cigarettes
smoked and incidence of lung cancer. Because the amount of tobacco smoked
is an ordered categorical variable, it is more appropriate to take this
information into account, for example by means of a linear association test
(see Chapter~\ref{CI}). Nonsmokers receive a score of zero, and for the
remaining groups we choose the mid-point of the intervals of daily
cigarettes smoked that were used by \cite{HSAUR:Mueller1940} to define his
groups:
<>=
ssc <- c(0, 1 + 14 / 2, 16 + 9 / 2, 26 + 9 / 2, 40)
independence_test(Smoking_Mueller1940, teststat = "quad",
scores = list(Smoking = ssc),
distribution = approximate(100000))
@
The result shows that the data are in favor of an ordered alternative. The
$p$-values obtained from approximate permutation tests are attractive
because no distributional assumptions are required, but it is hard to derive
estimates and confidence intervals for interpretable parameters from such
tests. We will therefore now switch to logistic regression models as
described in Chapter~\ref{GLM} to model the odds of lung cancer in the
different smoking groups. Before we start, let us define a small function
for computing odds (for intercept parameters) and odds ratios (for
difference parameters) and corresponding confidence intervals from a
logistic regression model:
<>=
eci <- function(model)
cbind("Odds (Ratio)" = exp(coef(model)),
exp(confint(model)))
@
We model the probability of developing lung cancer given the smoking
behavior. Because our data were obtained from case-control studies where the
groups (lung cancer patients and healthy controls) were defined first and
the smoking behavior was observed only afterwards (a so-called
\stress{choice-based sampling}), this may seem the wrong model to start
with. However, the marginal distribution of the two groups only changes the
intercept in such a logistic model and the effects of smoking can still be
interpreted in the way we require \citep[see][for example]{HSAUR:Tutz2012}.
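To sketch why (with $\pi_1$ and $\pi_0$, two symbols not used elsewhere in
this chapter, denoting the probabilities with which cases and controls
entered the study): if the population model is
$\text{logit} \, \P(\text{case} \mid \x) = \alpha + \bbeta^\top \x$, then
under case-control sampling
\begin{displaymath}
\text{logit} \, \P(\text{case} \mid \x, \text{sampled}) = \alpha +
    \log(\pi_1 / \pi_0) + \bbeta^\top \x,
\end{displaymath}
i.e., the study design shifts only the intercept while the log odds ratios
$\bbeta$ remain interpretable.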
The formula for specifying a logistic regression model can be set up such
that the response is a matrix with two columns, containing for each smoking
group the number of lung cancer deaths and the number of healthy
controls. Although smoking is an ordered factor, we first fit the model
with treatment contrasts, i.e., we can interpret the $\exp$ of the
regression coefficients as odds ratios between each smoking group and
nonsmokers:
<>=
smoking <- ordered(rownames(Smoking_Mueller1940),
levels = rownames(Smoking_Mueller1940))
contrasts(smoking) <- "contr.treatment"
eci(glm(Smoking_Mueller1940 ~ smoking, family = binomial()))
@
We see that all but one of the odds ratios increase with the amount of
tobacco smoked with a maximum of almost $30$ for extreme smokers (more than
$35$ cigarettes per day). The likelihood confidence intervals are rather
wide due to the limited sample size, but the lower limits also increase
with smoking.
An alternative model formulation can help to compare each smoking group with
the preceding group, the so-called split-coding \citep[for this and other
codings see][]{HSAUR:Tutz2012}:
<>=
K <- diag(nlevels(smoking) - 1)
K[lower.tri(K)] <- 1
contrasts(smoking) <- rbind(0, K)
eci(glm(Smoking_Mueller1940 ~ smoking, family = binomial()))
@
The two largest differences are between moderate smokers and nonsmokers
(\Robject{smoking1}) and between very heavy and heavy smokers
(\Robject{smoking3}). The latter group difference seems, at least judged by
the confidence interval, to be larger than expected under a model with no
effect of smoking.
For the analysis of the three remaining studies, we first perform
permutation tests for the independence of smoking and the two groups
(lung cancer and healthy controls) in males:
<>=
xSS44 <- as.table(Smoking_SchairerSchoeniger1944[,
c("Lung cancer", "Healthy control")])
ap <- approximate(100000)
pvalue(independence_test(xSS44,
teststat = "quad", distribution = ap))
pvalue(independence_test(Smoking_Wassink1945,
teststat = "quad", distribution = ap))
xDH50 <- as.table(Smoking_DollHill1950[,, "Male"])
pvalue(independence_test(xDH50,
teststat = "quad", distribution = ap))
@
All $p$-values indicate that the data are not well-described by the
independence model.
\subsection{Joint Bayesian Analysis}
For a Bayesian analysis, we first merge the data from all four studies into
one data frame. In doing so, we also merge the smoking groups so that only
three groups are left: nonsmokers, moderate smokers, and heavy
smokers. These groups are chosen such that the numbers of daily
cigarettes are comparable. We first merge the heavy, very heavy, and extreme
smokers from \cite{HSAUR:Mueller1940}
<>=
(M <- rbind(Smoking_Mueller1940[1:2,],
colSums(Smoking_Mueller1940[3:5,])))
@
and proceed with the lung cancer patients and healthy controls from
\cite{HSAUR:SchairerSchoeninger1944} in the same way
<>=
SS <- Smoking_SchairerSchoeniger1944[,
c("Lung cancer", "Healthy control")]
(SS <- rbind(SS[1,], colSums(SS[2:3,]), colSums(SS[4:5,])))
@
and finally perform the same exercise for the \cite{HSAUR:Wassink1945}
and \cite{HSAUR:DollHill1950} data
<>=
(W <- rbind(Smoking_Wassink1945[1:2,],
colSums(Smoking_Wassink1945[3:4,])))
DH <- Smoking_DollHill1950[,, "Male"]
(DH <- rbind(DH[1,], colSums(DH[2:3,]), colSums(DH[4:6,])))
@
The three new groups are now called nonsmokers, moderate smokers, and
heavy smokers, and we set up a data frame that contains
the number of people in each of the possible groups for all studies:
<>=
smk <- c("Nonsmoker", "Moderate smoker", "Heavy smoker")
x <- expand.grid(Smoking = ordered(smk, levels = smk),
Diagnosis = factor(c("Lung cancer", "Control")),
Study = c("Mueller1940", "SchairerSchoeniger1944",
"Wassink1945", "DollHill1950"))
x$weights <- c(as.vector(M), as.vector(SS),
as.vector(W), as.vector(DH))
@
Before we fit logistic regression models using the data organized in such a
way, we define the contrasts for the smoking ordered factor and
expand the data in a way that each row corresponds to one person. This is necessary
because the \Rcmd{weights} argument to the \Rcmd{glm} function must not be used
to define case weights:
<>=
contrasts(x$Smoking) <- "contr.treatment"
x <- x[rep(1:nrow(x), x$weights),]
@
We now compute one logistic regression model for each study for later
comparisons:
<>=
models <- lapply(levels(x$Study), function(s)
glm(Diagnosis ~ Smoking, data = x, family = binomial(),
subset = Study == s))
names(models) <- levels(x$Study)
@
In 1939, M\"uller was hardly in a position to come up with a reasonable
prior for the odds ratios between moderate or heavy smokers and nonsmokers. So
we also use a noninformative prior and just perform the maximum likelihood analysis:
<>=
eci(models[["Mueller1940"]])
@
Four years later, the maximum likelihood results obtained for the
\cite{HSAUR:SchairerSchoeninger1944} data
<>=
eci(models[["SchairerSchoeniger1944"]])
@
could have been improved by using a normal prior for the differences in log
odds, namely the distribution of the maximum likelihood estimator obtained
for M\"uller's data. At least approximately, we can compute posterior
$95\%$ credibility intervals and the posterior mode from the
Schairer and Sch\"oniger data by analyzing both data sets simultaneously.
We should, however, keep in mind that the odds of developing lung cancer
for nonsmokers are not really interesting for our analysis and that
the four studies may very well differ with respect to this intercept
parameter. Consequently, we don't want to specify a prior for the
intercept. One way to implement such a strategy is to exclude the intercept
term from the joint model while allowing a separate intercept for
each of the studies:
<>=
mM40_SS44 <- glm(Diagnosis ~ 0 + Study + Smoking, data = x,
family = binomial(),
subset = Study %in% c("Mueller1940",
"SchairerSchoeniger1944"))
eci(mM40_SS44)
@
We observe two important differences between the maximum likelihood and
Bayesian results for the Schairer and Sch\"oniger data: In the Bayesian
analysis, the estimated odds ratio for moderate smokers is closer to the
smaller value obtained from M\"uller's data and, more important, the
credibility intervals are much narrower and, one has to say, more realistic now.
An odds ratio as large as $40$ is hardly something one would expect to see in
practice.
If Wassink had been aware of Bayesian statistics, he could have used
the posterior distribution of the parameters from
our model \Robject{mM40\_SS44} as a prior distribution for analyzing his
data. The maximum likelihood results for his data
<>=
eci(models[["Wassink1945"]])
@
would have changed to
<>=
mM40_SS44_W45 <- glm(Diagnosis ~ 0 + Study + Smoking,
data = x, family = binomial(),
subset = Study %in% c("Mueller1940",
"SchairerSchoeniger1944",
"Wassink1945"))
eci(mM40_SS44_W45)
@
The rather small odds ratios obtained from the model fitted to the
Wassink data alone are now closer to the estimates obtained from the
two previous studies and the variability, as given by the credibility intervals,
is much smaller.
Now, finally, the model for the Doll and Hill data reports rather large
odds ratios with wide confidence intervals:
<>=
eci(models[["DollHill1950"]])
@
With a (now rather strong) prior defined by the three earlier studies, we get
from the joint model for all four studies
<>=
m_all <- glm(Diagnosis ~ 0 + Study + Smoking, data = x,
family = binomial())
eci(m_all)
@
<>=
r <- eci(m_all)
xM <- round(r["SmokingModerate smoker", 2:3], 1)
xH <- round(r["SmokingHeavy smoker", 2:3], 1)
@
In 1950, the joint evidence based on such an analysis with an odds ratio
between $\Sexpr{xM[1]}$ and $\Sexpr{xM[2]}$ for moderate smokers and between
$\Sexpr{xH[1]}$ and $\Sexpr{xH[2]}$ for heavy smokers compared to
nonsmokers, would have made a much stronger case than any of the single
studies alone. It is interesting to see that with this strong prior for the
Doll and Hill study, we also get relatively large odds ratios when comparing
heavy to moderate smokers (see row labeled \Robject{Smoking2}):
<>=
K <- diag(nlevels(x$Smoking) - 1)
K[lower.tri(K)] <- 1
contrasts(x$Smoking) <- rbind(0, K)
eci(glm(Diagnosis ~ 0 + Study + Smoking, data = x,
family = binomial()))
@
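The reason lies in the successive-differences coding just assigned to the
\Robject{Smoking} factor. As a quick check (not part of the original
analysis), one can print the contrast matrix; its rows correspond to the
levels Nonsmoker, Moderate smoker, and Heavy smoker:
\begin{Schunk}
\begin{Sinput}
R> ## rows: Nonsmoker (0, 0), Moderate smoker (1, 0),
R> ## Heavy smoker (1, 1); thus the first coefficient compares
R> ## moderate smokers to nonsmokers and the second one measures
R> ## the additional step from moderate to heavy smoking
R> contrasts(x$Smoking)
\end{Sinput}
\end{Schunk}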
\subsection{A Comparison with Meta Analysis}
One may ask how the Bayesian approach of progressively updating the estimates
considered here differs from a classical meta analysis described in
Chapter~\ref{MA}. We
first reshape the data into a form suitable for such an analysis
<>=
y <- xtabs(~ Study + Smoking + Diagnosis, data = x)
ntrtM <- margin.table(y, 1:2)[,"Moderate smoker"]
nctrl <- margin.table(y, 1:2)[,"Nonsmoker"]
ptrtM <- y[,"Moderate smoker","Lung cancer"]
pctrl <- y[,"Nonsmoker","Lung cancer"]
ntrtH <- margin.table(y, 1:2)[,"Heavy smoker"]
ptrtH <- y[,"Heavy smoker","Lung cancer"]
@
and then compute joint odds ratios and confidence intervals for moderate and
heavy smokers compared to nonsmokers:
<>=
library("rmeta")
meta.MH(ntrt = ntrtM, nctrl = nctrl,
ptrt = ptrtM, pctrl = pctrl)
meta.MH(ntrt = ntrtH, nctrl = nctrl,
ptrt = ptrtH, pctrl = pctrl)
@
For moderate smokers, the effect is a little weaker than in the
results reported earlier, whereas for heavy smokers the meta analysis
identifies a stronger effect. Nevertheless, the
differences between the two rather different approaches are negligible and
the conclusions would have been the same.
\section{Summary of Findings}
We have seen that, using a Bayesian approach to incorporate prior knowledge
into a model, the odds of developing lung cancer increase with increased
amounts of smoking. Of course, our analysis here is very simplistic: we
ignored that pipe and cigar smokers were also present in the data, we
merged the data based on a very rough assessment of the number of cigarettes
smoked per day, and we ignored whether or not the smokers inhaled the smoke
into their lungs or whether nonsmokers were subject to passive smoking, as we
would call it today. Most importantly, we must not misinterpret findings from
case-control studies as causal and, in fact, none of the authors cited here
did so. The debate on whether smoking, and which kind of smoking, actually
\stress{causes} lung cancer was initiated by the publications cited in this
chapter, and many famous statisticians took part in it; Sir Ronald Fisher
\citep{HSAUR:Fisher1959}, for example, took the view that the inference
of causation was premature. In retrospect, this was one issue (perhaps the
only one) where Fisher was mistaken.
\section{Final Comments}
There remain a few hard-line opponents of Bayesian inference (just a few)
who reject the method because of the use of subjective prior distributions
which, these opponents feel, have no place in scientific investigations.
And there are Bayesians who think that the only defense of using
non-Bayesian methods is incompetence.
But for an increasing number of statisticians Bayesian inference is very
attractive, because we can use the posterior distribution of the parameters
to draw conclusions from the data. Although this requires the specification
of a prior distribution, we have seen in this chapter that, using data from
previous experiments, priors can be defined in a reasonable way. It is not
absolutely necessary to rely on rather complex numerical procedures to
`estimate' a posterior distribution. When we are willing to cut some
corners, we can implement simple Bayesian approaches using standard
software. We should also keep in mind that the prior can be interpreted as
a penalty on the parameters, and many penalization approaches therefore
have an (often implicit) connection to the Bayesian way of doing statistics.
Of course, just picking the prior that `works best' is dangerous and
almost surely inappropriate.
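To illustrate this connection with a minimal sketch (not the approach used
in this chapter): a normal prior with mean $m$ and standard deviation $s$
for a log odds ratio $\beta$ corresponds, up to an additive constant, to the
penalty $(\beta - m)^2 / (2 s^2)$ added to the negative log-likelihood, and
the posterior mode is the minimizer of this penalized criterion. For a
logistic regression with a hypothetical design matrix \Robject{X} and binary
response \Robject{y}, this could be written as
\begin{Schunk}
\begin{Sinput}
R> nlpost <- function(beta, X, y, m, s) {
+      eta <- drop(X %*% beta)
+      ## Bernoulli negative log-likelihood
+      nll <- -sum(y * eta - log1p(exp(eta)))
+      ## normal prior = quadratic penalty; the intercept,
+      ## beta[1], is left unpenalized as in our models above
+      nll + sum((beta[-1] - m)^2 / (2 * s^2))
+  }
R> ## posterior mode, for example via
R> ## optim(rep(0, ncol(X)), nlpost, X = X, y = y, m = 0, s = 2)
\end{Sinput}
\end{Schunk}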
\section*{Exercises}
\begin{description}
\exercise
Produce a forest plot as introduced in Chapter~\ref{MA} for the four
smoking studies analyzed here.
\exercise
Produce a modified forest plot where one can see how the evidence
for smoking being related to lung cancer evolved between 1940 and 1950.
\exercise
Use the \Rpackage{INLA} add-on package to perform a similar analysis by
using the coefficients and their standard errors estimated from
our initial logistic regression model \texttt{models[["Mueller1940"]]} as
parameters of a normal prior
for a logistic regression applied to the Schairer and Sch\"oniger data.
Compare the resulting credibility intervals for the two odds ratios
with the approximate results obtained in this chapter.
\end{description}
%%\bibliographystyle{LaTeXBibTeX/refstyle}
%%\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_quantile_regression.Rnw 0000644 0001762 0000144 00000064325 14416236370 020433 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Quantile Regression}
%%\VignetteDepends{lattice,quantreg}
\setcounter{chapter}{11}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
%% lower png resolution for vignettes
\SweaveOpts{resolution = 80}
<>=
library("lattice")
trellis.par.set(list(plot.symbol = list(col=1,pch=20, cex=0.7),
box.rectangle = list(col=1),
plot.line = list(col = 1, lwd = 1),
box.umbrella = list(lty=1, col=1),
strip.background = list(col = "white")))
ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme
ltheme$strip.background$col <- "transparent" ## change strip bg
lattice.options(default.theme = ltheme)
data("db", package = "gamlss.data")
nboys <- with(db, sum(age > 2))
@
\chapter[Quantile Regression]{Quantile Regression:
Head Circumference for Age\label{QR}}
\section{Introduction}
\section{Quantile Regression}
\section{Analysis Using \R{}}
We begin with a graphical inspection of the influence of age on head
circumference by means of a scatterplot. Plotting all pairs of age and
head circumference in one panel gives more weight to the teens and 20s,
so we produce one plot for younger boys between two and nine years old and
one additional plot for boys older than nine years (or $>108$ months,
to be precise). The \Rcmd{cut} function is very convenient for constructing
a factor representing these two groups
<>=
summary(db)
db$cut <- cut(db$age, breaks = c(2, 9, 23),
labels = c("2-9 yrs", "9-23 yrs"))
@
which can then be used as a
conditioning variable for conditional scatterplots produced with the
\Rcmd{xyplot} function \citep[package \Rpackage{lattice}]{PKG:lattice}.
Because we draw $\Sexpr{nboys}$ points in total, we use transparent shading
(via \Rcmd{rgb(.1, .1, .1, .1)}) in order to obtain a clearer picture
for the more populated areas in the plot.
\begin{figure}
\begin{center}
<>=
db$cut <- cut(db$age, breaks = c(2, 9, 23),
labels = c("2-9 yrs", "9-23 yrs"))
xyplot(head ~ age | cut, data = db, xlab = "Age (years)",
ylab = "Head circumference (cm)",
scales = list(x = list(relation = "free")),
layout = c(2, 1), pch = 19,
col = rgb(.1, .1, .1, .1))
@
\caption{Scatterplot of age and head circumference for $\Sexpr{nboys}$
Dutch boys. \label{QR-db-plot}}
\end{center}
\end{figure}
Figure~\ref{QR-db-plot}, as expected, shows that head circumference
increases with age. It also shows that there is considerable variation and
also quite a number of extremely large or small head circumferences in the
respective age cohorts. It should be noted that, owing to the
cross-sectional study design, each point corresponds to one boy
participating in the study.
No longitudinal measurements (cf.~Chapter~\ref{ALDI}) were taken and we can
safely assume independence between observations.
We start with a simple linear model, computed separately for the younger and
older boys, for regressing the mean head circumference on age
<>=
(lm2.9 <- lm(head ~ age, data = db, subset = age < 9))
(lm9.23 <- lm(head ~ age, data = db, subset = age > 9))
@
This approach is equivalent to fitting two intercepts and two slopes in
the joint model
<>=
(lm_mod <- lm(head ~ age:I(age < 9) + I(age < 9) - 1,
data = db))
@
while omitting the global intercept. Because the median of the
normal distribution is equal to its mean, the two models can be interpreted
as conditional median models under the normal assumption. The model
states that within one year, the head circumference increases by
$\Sexpr{round(coef(lm_mod)["age:I(age < 9)TRUE"], 3)}$ cm for boys less than nine years old
and by $\Sexpr{round(coef(lm_mod)["age:I(age < 9)FALSE"], 3)}$ cm for older boys.
We now relax this distributional assumption and compute a median
regression model using the \Rcmd{rq} function from package \Rpackage{quantreg}
\citep{PKG:quantreg}:
<>=
library("quantreg")
(rq_med2.9 <- rq(head ~ age, data = db, tau = 0.5,
subset = age < 9))
(rq_med9.23 <- rq(head ~ age, data = db, tau = 0.5,
subset = age > 9))
@
When we construct confidence intervals for the
intercept and slope parameters from both models for the younger boys
<>=
cbind(coef(lm2.9)[1], confint(lm2.9, parm = "(Intercept)"))
cbind(coef(lm2.9)[2], confint(lm2.9, parm = "age"))
summary(rq_med2.9, se = "rank")
@
we see that the two intercepts are almost identical but there seems to be
a larger slope parameter for age in the median regression model.
For the older boys, we get the confidence intervals via
<>=
cbind(coef(lm9.23)[1], confint(lm9.23, parm = "(Intercept)"))
cbind(coef(lm9.23)[2], confint(lm9.23, parm = "age"))
summary(rq_med9.23, se = "rank")
@
with again almost identical intercepts and only a slightly increased
slope for age in the median regression model.
Since one of our aims was the construction of growth curves, we first use the
linear models regressing head circumference on age to plot such curves.
Based on the two normal linear models, we can compute the quantiles of head
circumference for age. For the following values of $\tau$
<>=
tau <- c(.01, .1, .25, .5, .75, .9, .99)
@
and a grid of age values
<>=
gage <- c(2:9, 9:23)
i <- 1:8
@
(the index \Rcmd{i} denoting younger boys), we compute the
standard prediction intervals
\index{Prediction interval}
taking the randomness of the estimated
intercept, slope, and variance parameters into account. We first set up
a data frame with our grid of age values and then use the \Rcmd{predict}
function for a linear model to compute prediction intervals, here
with a coverage of $50\%$. The lower limit of such a $50\%$ prediction
interval is equivalent to the conditional $25\%$ quantile for the given age
and the upper limit corresponds to the $75\%$ quantile. The conditional
mean is also reported and is equivalent to the conditional median:
<>=
idf <- data.frame(age = gage[i])
p <- predict(lm2.9, newdata = idf, level = 0.5,
interval = "prediction")
colnames(p) <- c("0.5", "0.25", "0.75")
p
@
We now proceed with $80\%$ prediction intervals for constructing the
$10\%$ and $90\%$ quantiles, and with $98\%$ prediction intervals
corresponding to the $1\%$ and $99\%$ quantiles and repeat the exercise
also for the older boys:
<>=
p <- cbind(p, predict(lm2.9, newdata = idf, level = 0.8,
interval = "prediction")[,-1])
colnames(p)[4:5] <- c("0.1", "0.9")
p <- cbind(p, predict(lm2.9, newdata = idf, level = 0.98,
interval = "prediction")[,-1])
colnames(p)[6:7] <- c("0.01", "0.99")
p2.9 <- p[, c("0.01", "0.1", "0.25", "0.5",
"0.75", "0.9", "0.99")]
idf <- data.frame(age = gage[-i])
p <- predict(lm9.23, newdata = idf, level = 0.5,
interval = "prediction")
colnames(p) <- c("0.5", "0.25", "0.75")
p <- cbind(p, predict(lm9.23, newdata = idf, level = 0.8,
interval = "prediction")[,-1])
colnames(p)[4:5] <- c("0.1", "0.9")
p <- cbind(p, predict(lm9.23, newdata = idf, level = 0.98,
interval = "prediction")[,-1])
colnames(p)[6:7] <- c("0.01", "0.99")
@
We now reorder the columns of this table and get the following
conditional quantiles, estimated under the normal assumption of
head circumference:
<>=
p9.23 <- p[, c("0.01", "0.1", "0.25", "0.5",
"0.75", "0.9", "0.99")]
round((q2.23 <- rbind(p2.9, p9.23)), 3)
@
We can now superimpose these conditional quantiles on our scatterplot.
To do this, we need to write our own little panel function that
produces the scatterplot using the \Rcmd{panel.xyplot} function and then
adds the just computed conditional quantiles by means of the
\Rcmd{panel.lines} function called for every column of \Robject{q2.23}.
Figure~\ref{QR-db-lm-plot} shows parallel lines because the
linear model assumes an error variance independent of age; this is the
so-called variance homogeneity. Compared to a plot with only a single
(mean) regression line, we plotted a whole bunch of conditional
distributions here, one for each value of age. Of course, we did so
under extremely simplifying assumptions like linearity and variance
homogeneity that we're going to drop now.
\begin{figure}
\begin{center}
<>=
pfun <- function(x, y, ...) {
panel.xyplot(x = x, y = y, ...)
if (max(x) <= 9) {
apply(q2.23, 2, function(x)
panel.lines(gage[i], x[i]))
} else {
apply(q2.23, 2, function(x)
panel.lines(gage[-i], x[-i]))
}
panel.text(rep(max(db$age), length(tau)),
q2.23[nrow(q2.23),], label = tau, cex = 0.9)
panel.text(rep(min(db$age), length(tau)),
q2.23[1,], label = tau, cex = 0.9)
}
xyplot(head ~ age | cut, data = db, xlab = "Age (years)",
ylab = "Head circumference (cm)", pch = 19,
scales = list(x = list(relation = "free")),
layout = c(2, 1), col = rgb(.1, .1, .1, .1),
panel = pfun)
@
\caption{Scatterplot of age and head circumference for $\Sexpr{nboys}$
Dutch boys with superimposed normal quantiles.
\label{QR-db-lm-plot}}
\end{center}
\end{figure}
For the production of a nonparametric version of our growth curves,
we start with fitting not only one but multiple quantile regression models,
one for each value of $\tau$. We start with the younger boys
<>=
(rq2.9 <- rq(head ~ age, data = db, tau = tau,
subset = age < 9))
@
and continue with the older boys
<>=
(rq9.23 <- rq(head ~ age, data = db, tau = tau,
subset = age > 9))
@
Naturally, the intercept parameters vary but there is also a considerable
variation in the slopes, with the largest value
for the $1\%$ quantile regression model for younger boys.
The parameters $\beta_\tau$ have to be interpreted with care.
In general, they cannot be interpreted on an individual-specific level. A
boy who happens to be at the $\tau \times 100\%$ quantile of head circumference
conditional on his age would not be at the same
quantile anymore when he gets older. When knowing
$\beta_\tau$, the only conclusion that can be drawn is how the $\tau
\times 100\%$ quantile of a population with a specific age
differs from the $\tau \times 100\%$ quantile of a population with a
different age.
Because the linear functions estimated by linear quantile regression,
here in model \Robject{rq9.23}, directly correspond to the conditional
quantiles of interest, we can use the \Rcmd{predict} function to compute
the estimated conditional quantiles:
<>=
p2.23 <- rbind(predict(rq2.9,
newdata = data.frame(age = gage[i])),
predict(rq9.23,
newdata = data.frame(age = gage[-i])))
@
It is important to note that these numbers were obtained without assuming
anything about the continuous distribution of head circumference given any
age. Again, we produce a scatterplot with superimposed quantiles, this time
each line corresponds to a specific model. For the sake of comparison with
the linear model, we add the linear model quantiles as dashed lines to
Figure~\ref{QR-db-rq-plot}. For the older boys, there seems to be almost no
difference but the more extreme $1\%$ and $99\%$ quantiles for the younger
boys differ considerably. So, at least for the younger boys, we might want
to allow for age-specific variability in the distribution of head
circumference.
\begin{figure}
\begin{center}
<>=
pfun <- function(x, y, ...) {
panel.xyplot(x = x, y = y, ...)
if (max(x) <= 9) {
apply(q2.23, 2, function(x)
panel.lines(gage[i], x[i], lty = 2))
apply(p2.23, 2, function(x)
panel.lines(gage[i], x[i]))
} else {
apply(q2.23, 2, function(x)
panel.lines(gage[-i], x[-i], lty = 2))
apply(p2.23, 2, function(x)
panel.lines(gage[-i], x[-i]))
}
panel.text(rep(max(db$age), length(tau)),
p2.23[nrow(p2.23),], label = tau, cex = 0.9)
panel.text(rep(min(db$age), length(tau)),
p2.23[1,], label = tau, cex = 0.9)
}
xyplot(head ~ age | cut, data = db, xlab = "Age (years)",
ylab = "Head circumference (cm)", pch = 19,
scales = list(x = list(relation = "free")),
layout = c(2, 1), col = rgb(.1, .1, .1, .1),
panel = pfun)
@
\caption{Scatterplot of age and head circumference for $\Sexpr{nboys}$
Dutch boys with superimposed regression quantiles (solid lines) and
normal quantiles (dashed lines). \label{QR-db-rq-plot}}
\end{center}
\end{figure}
Still, with the quantile regression models shown in
Figure~\ref{QR-db-rq-plot} we assume that the quantiles of head
circumference depend on age in a linear way. Additive quantile regression
is one way to approach the estimation of non-linear quantile functions. By
considering two different models for younger and older boys, we have already
allowed for a certain type of non-linearity in the results shown so far.
Additive quantile regression can capture such non-linearity directly,
and we therefore fit these models to all boys simultaneously.
For our different choices of $\tau$, we fit one additive quantile regression model
using the \Rcmd{rqss} function from the \Rpackage{quantreg} package and allow
smooth quantile functions of age via the \Rcmd{qss} function in the
right-hand side of the model formula.
Note that we transformed age by taking its cube root prior to model fitting.
Because this is a monotone transformation, it does not affect the
interpretation of the fitted quantile curves on the original age scale;
however, it helps to avoid fitting a function with large derivatives for
very young boys, which would result in a low penalty parameter $\lambda$:
<>=
rqssmod <- vector(mode = "list", length = length(tau))
db$lage <- with(db, age^(1/3))
for (i in 1:length(tau))
rqssmod[[i]] <- rqss(head ~ qss(lage, lambda = 1),
data = db, tau = tau[i])
@
For the analysis of the head circumference, we choose a penalty parameter
$\lambda = 1$, which is the default for the \Rcmd{qss} function. Simply
using the default without careful hyperparameter tuning, for example by
cross-validation or similar procedures, is almost always a mistake. Here,
by visual inspection (Figure~\ref{QR-db-rqss-plot}) we find this choice
appropriate, but we ask the readers to make a second guess (Exercise 3).
For a finer grid of age values, we compute the conditional quantiles
from the \Rcmd{predict} function:
<>=
gage <- seq(from = min(db$age), to = max(db$age),
length = 50)
p <- sapply(1:length(tau), function(i) {
predict(rqssmod[[i]],
newdata = data.frame(lage = gage^(1/3)))
})
@
Using code very similar to that used for plotting the linear quantiles, we again produce a
scatterplot of age and head circumference but this time overlaid with
non-linear regression quantiles. Given that the results from the linear
models presented in Figure~\ref{QR-db-rq-plot} looked pretty convincing, the
quantile curves in Figure~\ref{QR-db-rqss-plot} shed a surprising new light
on the data. For the younger boys, we expected to see a larger variability
than for the older boys, but in fact the distribution seems
to be more complex. The distribution seems to be positively skewed with a
heavy lower tail and the degree of skewness varies with age (note that
the median is almost linear for boys older than four years).
Also in the right part of Figure~\ref{QR-db-rqss-plot}, we see an age-varying
skewness, although less pronounced than for the younger boys. The median
increases up to 16 years but then the growth rate is much smaller. This
does not seem to be the case for the $1\%, 10\%, 90\%$, and $99\%$
quantiles. Note that the discontinuity in the quantiles between the two age
groups is only due to the overlapping abscissae.
However, the deviations between the growth curves obtained from a linear
model under normality assumption on the one hand and quantile regression on the
other hand as shown in Figures~\ref{QR-db-rq-plot} and \ref{QR-db-rqss-plot}
are hardly dramatic for the head circumference data.
\begin{figure}
\begin{center}
<>=
pfun <- function(x, y, ...) {
panel.xyplot(x = x, y = y, ...)
apply(p, 2, function(x) panel.lines(gage, x))
panel.text(rep(max(db$age), length(tau)),
p[nrow(p),], label = tau, cex = 0.9)
panel.text(rep(min(db$age), length(tau)),
p[1,], label = tau, cex = 0.9)
}
xyplot(head ~ age | cut, data = db, xlab = "Age (years)",
ylab = "Head circumference (cm)", pch = 19,
scales = list(x = list(relation = "free")),
layout = c(2, 1), col = rgb(.1, .1, .1, .1),
panel = pfun)
@
\caption{Scatterplot of age and head circumference for $\Sexpr{nboys}$
Dutch boys with superimposed non-linear regression quantiles.
\label{QR-db-rqss-plot}}
\end{center}
\end{figure}
\section{Summary of Findings}
We can conclude that the whole distribution of head circumference changes
with age and that assumptions like symmetry and variance homogeneity might be
questionable for this type of analysis.
One alternative to the estimation of conditional quantiles is the estimation
of conditional distributions. One very interesting parametric approach is given by
generalized additive models for location, scale, and shape
\citep[GAMLSS,][]{HSAUR:RigbyStasinopoulos2005}. In
\cite{HSAUR:StasinopoulosRigby2007}, an analysis of the age and head
circumference by means of the \Rpackage{gamlss} package can be found.
One practical problem associated with contemporary methods in quantile
regression is quantile crossing. Because we fitted one quantile regression model
for each of the quantiles of interest, we cannot guarantee that the conditional
quantile functions are monotone, so the $90\%$ quantile may well be larger than the
$95\%$ quantile in some cases. Postprocessing of the estimated quantile curves
may help in this situation \citep{HSAUR:DetteVolgushev2008}.
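For the curves computed in this chapter, a quick check and one very simple
form of such postprocessing, namely sorting the estimated quantiles at each
age, can be sketched as follows (assuming the prediction matrix \Robject{p}
from above, whose columns are ordered by $\tau$):
\begin{Schunk}
\begin{Sinput}
R> ## TRUE if the estimated quantiles cross for some age
R> any(apply(p, 1, function(q) any(diff(q) < 0)))
R> ## enforce non-crossing curves by sorting within each age
R> p_mono <- t(apply(p, 1, sort))
\end{Sinput}
\end{Schunk}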
\section{Final Comments}
When estimating regression models, we have to be aware of the implications
of model assumptions when interpreting the results. Symmetry, linearity,
and variance homogeneity are among the strongest but common assumptions.
Quantile regression, both in its linear and additive formulation, is an
intellectually stimulating and practically very useful framework where such
assumptions can be relaxed. At a more basic level, one should always ask
\stress{Am I really interested in the mean?} before using the regression models
discussed in other chapters of this book.
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_density_estimation.Rnw 0000644 0001762 0000144 00000047163 14416236367 020273 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Density Estimation}
%%\VignetteDepends{flexmix,KernSmooth,boot}
\setcounter{chapter}{7}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
%% lower png resolution for vignettes
\SweaveOpts{resolution = 100}
<>=
x <- library("KernSmooth")
x <- library("flexmix")
x <- library("boot")
@
\chapter[Density Estimation]{Density Estimation: Erupting Geysers and Star
Clusters \label{DE}}
\section{Introduction}
\section{Density Estimation}
The three kernel functions are implemented in \R{} as shown in lines 1--3 of
Figure~\ref{DE-kernel-fig}. For some grid \Robject{x}, the kernel functions
are plotted using the \R{} statements in lines 5--11 (Figure~\ref{DE-kernel-fig}).
\numberSinput
\begin{figure}
\begin{center}
<>=
rec <- function(x) (abs(x) < 1) * 0.5
tri <- function(x) (abs(x) < 1) * (1 - abs(x))
gauss <- function(x) 1/sqrt(2*pi) * exp(-(x^2)/2)
x <- seq(from = -3, to = 3, by = 0.001)
plot(x, rec(x), type = "l", ylim = c(0,1), lty = 1,
ylab = expression(K(x)))
lines(x, tri(x), lty = 2)
lines(x, gauss(x), lty = 3)
legend(-3, 0.8, legend = c("Rectangular", "Triangular",
"Gaussian"), lty = 1:3, title = "kernel functions",
bty = "n")
@
\caption{Three commonly used kernel functions. \label{DE-kernel-fig}}
\end{center}
\end{figure}
\rawSinput
<>=
w <- options("width")$w
options(width = 66)
@
The kernel estimator $\hat{f}$ is a sum of `bumps' placed at the observations. %'
The kernel function determines the shape of the bumps while the
window width $h$ determines their width.
\index{Windows, in kernel density estimation}
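Restating the standard definition for reference, the kernel density
estimate based on observations $x_1, \ldots, x_n$ is
\begin{displaymath}
\hat{f}(x) = \frac{1}{nh} \sum_{i = 1}^n K\left(\frac{x - x_i}{h}\right),
\end{displaymath}
where $K$ denotes the kernel function and $h$ the window width.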
Figure~\ref{DE-bumps} \citep[redrawn from a similar plot in][]{HSAUR:Silverman1986}
shows the individual bumps $n^{-1}h^{-1} K((x - x_i) / h)$, as well as the estimate $\hat{f}$
obtained by adding them up for an artificial set of data points
<>=
x <- c(0, 1, 1.1, 1.5, 1.9, 2.8, 2.9, 3.5)
n <- length(x)
@
For a grid
<>=
xgrid <- seq(from = min(x) - 1, to = max(x) + 1, by = 0.01)
@
on the real line, we can compute the contribution of each measurement in
\Robject{x}, with $h = 0.4$, by the Gaussian kernel (defined in
Figure~\ref{DE-kernel-fig}, line 3) as follows:
<>=
h <- 0.4
bumps <- sapply(x, function(a) gauss((xgrid - a)/h)/(n * h))
@
A plot of the individual bumps and their sum, the kernel density estimate
$\hat{f}$, is shown in Figure~\ref{DE-bumps}.
<>=
options(width = w)
@
\numberSinput
\begin{figure}
\begin{center}
<>=
plot(xgrid, rowSums(bumps), ylab = expression(hat(f)(x)),
type = "l", xlab = "x", lwd = 2)
rug(x, lwd = 2)
out <- apply(bumps, 2, function(b) lines(xgrid, b))
@
\caption{Kernel estimate showing the contributions of Gaussian
kernels evaluated for the individual observations with bandwidth $h =
0.4$. \label{DE-bumps}}
\end{center}
\end{figure}
\rawSinput
\begin{figure}
\begin{center}
<>=
epa <- function(x, y)
((x^2 + y^2) < 1) * 2/pi * (1 - x^2 - y^2)
x <- seq(from = -1.1, to = 1.1, by = 0.05)
epavals <- sapply(x, function(a) epa(a, x))
persp(x = x, y = x, z = epavals, xlab = "x", ylab = "y",
zlab = expression(K(x, y)), theta = -35, axes = TRUE,
box = TRUE)
@
\caption{Epanechnikov kernel for a grid between $(-1.1, -1.1)$ and $(1.1, 1.1)$.
\label{DE-epakernel-fig}}
\end{center}
\end{figure}
\section{Analysis Using \R{}}
\numberSinput
\begin{figure}
\begin{center}
<>=
data("faithful", package = "datasets")
x <- faithful$waiting
layout(matrix(1:3, ncol = 3))
hist(x, xlab = "Waiting times (in min.)", ylab = "Density",
probability = TRUE, main = "Gaussian kernel",
border = "gray")
lines(density(x, width = 12), lwd = 2)
rug(x)
hist(x, xlab = "Waiting times (in min.)", ylab = "Density",
probability = TRUE, main = "Rectangular kernel",
border = "gray")
lines(density(x, width = 12, window = "rectangular"), lwd = 2)
rug(x)
hist(x, xlab = "Waiting times (in min.)", ylab = "Density",
probability = TRUE, main = "Triangular kernel",
border = "gray")
lines(density(x, width = 12, window = "triangular"), lwd = 2)
rug(x)
@
\caption{Density estimates of the geyser eruption data imposed on a histogram
of the data. \label{DE:faithfuldens}}
\end{center}
\end{figure}
\rawSinput
\begin{figure}
\begin{center}
<>=
library("KernSmooth")
data("CYGOB1", package = "HSAUR3")
CYGOB1d <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1, dpik))
contour(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat,
xlab = "log surface temperature",
ylab = "log light intensity")
@
\caption{A contour plot of the bivariate density estimate of the \Robject{CYGOB1} data,
i.e., a two-dimensional graphical display for a three-dimensional problem.
\label{DE:CYGOB12Dcontour}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
persp(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat,
xlab = "log surface temperature",
ylab = "log light intensity",
zlab = "estimated density",
theta = -35, axes = TRUE, box = TRUE)
@
\caption{The bivariate density estimate of the \Robject{CYGOB1} data, here shown in a
three-dimensional fashion using the \Rcmd{persp} function.
\label{DE:CYGOB12Dpersp}}
\end{center}
\end{figure}
\subsection{A Parametric Density Estimate for the Old Faithful Data
\label{DE-waiting}}
<>=
logL <- function(param, x) {
d1 <- dnorm(x, mean = param[2], sd = param[3])
d2 <- dnorm(x, mean = param[4], sd = param[5])
-sum(log(param[1] * d1 + (1 - param[1]) * d2))
}
startparam <- c(p = 0.5, mu1 = 50, sd1 = 3, mu2 = 80, sd2 = 3)
opp <- optim(startparam, logL, x = faithful$waiting,
method = "L-BFGS-B",
lower = c(0.01, rep(1, 4)),
upper = c(0.99, rep(200, 4)))
@
\newpage
<>=
opp
@
<>=
print(opp[names(opp) != "message"])
@
Of course, optimizing the appropriate likelihood `by hand' %'
is not very convenient. In fact, (at least) two packages offer high-level
functionality for estimating mixture models. The first one is package
\Rpackage{mclust} \citep{PKG:mclust} implementing the methodology described
in \cite{HSAUR:FraleyRaftery2002}. Here, a Bayesian information criterion
(BIC) is applied to choose the form of the mixture model:
\index{Bayesian Information Criterion (BIC)}
<>=
library("mclust")
@
<>=
library("mclust")
mc <- Mclust(faithful$waiting)
mc
@
and the estimated means are
<>=
mc$parameters$mean
@
with estimated standard deviation (found to be equal in both groups)
<>=
sqrt(mc$parameters$variance$sigmasq)
@
The proportion is $\hat{p} = \Sexpr{round(mc$parameters$pro[1], 2)}$. The second package is called
\Rpackage{flexmix} whose functionality is described by
\cite{HSAUR:Leisch2004}.
A mixture of two normals can be fitted using
<>=
library("flexmix")
fl <- flexmix(waiting ~ 1, data = faithful, k = 2)
@
with $\hat{p} = \Sexpr{round(fl@prior, 2)}$ and estimated parameters
<>=
parameters(fl, component = 1)
parameters(fl, component = 2)
@
\begin{figure}
\begin{center}
<>=
opar <- as.list(opp$par)
rx <- seq(from = 40, to = 110, by = 0.1)
d1 <- dnorm(rx, mean = opar$mu1, sd = opar$sd1)
d2 <- dnorm(rx, mean = opar$mu2, sd = opar$sd2)
f <- opar$p * d1 + (1 - opar$p) * d2
hist(x, probability = TRUE, xlab = "Waiting times (in min.)",
border = "gray", xlim = range(rx), ylim = c(0, 0.06),
main = "")
lines(rx, f, lwd = 2)
lines(rx, dnorm(rx, mean = mean(x), sd = sd(x)), lty = 2,
lwd = 2)
legend(50, 0.06, lty = 1:2, bty = "n",
legend = c("Fitted two-component mixture density",
"Fitted single normal density"))
@
\caption{Fitted normal density and two-component normal mixture for geyser
eruption data. \label{DE:2Dplot}}
\end{center}
\end{figure}
\index{Bootstrap approach|(}
We can get standard errors for the five parameter estimates
by using a bootstrap approach \citep[see][]{HSAUR:EfronTibshirani1993}.
The original data are slightly perturbed by drawing $n$ out of $n$
observations \stress{with replacement} and those artificial replications of
the original data are called \stress{bootstrap samples}. Now, we can fit the
mixture for each bootstrap sample and assess the variability of the
estimates, for example using confidence intervals.
\index{Confidence interval!derived from bootstrap samples}
Some suitable \R{} code based on the \Rcmd{Mclust}
function follows. First, we define a function that,
for a bootstrap sample \Robject{indx}, fits a two-component mixture
model and returns $\hat{p}$ and the estimated means (note that we need to make
sure that we always get an estimate of $p$, not $1 - p$):
<>=
library("boot")
fit <- function(x, indx) {
a <- Mclust(x[indx], minG = 2, maxG = 2,
modelNames="E")$parameters
if (a$pro[1] < 0.5)
return(c(p = a$pro[1], mu1 = a$mean[1],
mu2 = a$mean[2]))
return(c(p = 1 - a$pro[1], mu1 = a$mean[2],
mu2 = a$mean[1]))
}
@
The function \Rcmd{fit} can now be fed into the \Rcmd{boot} function \citep{PKG:boot}
for bootstrapping (here $1000$ bootstrap samples are drawn)
\begin{Schunk}
\begin{Sinput}
R> bootpara <- boot(faithful$waiting, fit, R = 1000)
\end{Sinput}
\end{Schunk}
<>=
bootparafile <- system.file("cache", "DE-bootpara.rda", package = "HSAUR3")
if (file.exists(bootparafile)) {
load(bootparafile)
} else {
bootpara <- boot(faithful$waiting, fit, R = 1000)
}
@
We assess the variability of our estimates $\hat{p}$ by
means of adjusted bootstrap percentile (BCa) confidence intervals, which for
$\hat{p}$ can be obtained from
<>=
boot.ci(bootpara, type = "bca", index = 1)
@
We see that there is a reasonable variability in the mixture model; however,
the means in the two components are rather stable, as can be seen from
<>=
boot.ci(bootpara, type = "bca", index = 2)
@
for $\hat{\mu}_1$ and for $\hat{\mu}_2$ from
<>=
boot.ci(bootpara, type = "bca", index = 3)
@
Finally, we show a graphical representation of both the bootstrap
distribution of the mean estimates \stress{and} the corresponding confidence
intervals. For convenience, we define a function for plotting, namely
<>=
bootplot <- function(b, index, main = "") {
dens <- density(b$t[,index])
ci <- boot.ci(b, type = "bca", index = index)$bca[4:5]
est <- b$t0[index]
plot(dens, main = main)
y <- max(dens$y) / 10
segments(ci[1], y, ci[2], y, lty = 2)
points(ci[1], y, pch = "(")
points(ci[2], y, pch = ")")
points(est, y, pch = 19)
}
@
The element \Robject{t} of an object created by \Rcmd{boot} contains the
bootstrap replications of our estimates, i.e., the values computed by
\Rcmd{fit} for each of the $1000$ bootstrap samples of the geyser data.
First, we plot a simple density estimate and then construct a line
representing the confidence interval.
We apply this function to the bootstrap distributions of our estimates
$\hat{\mu}_1$ and $\hat{\mu}_2$ in Figure~\ref{DE-bootplot}.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, ncol = 2))
bootplot(bootpara, 2, main = expression(mu[1]))
bootplot(bootpara, 3, main = expression(mu[2]))
@
\caption{Bootstrap distribution and confidence intervals for the mean estimates
of a two-component mixture for the geyser data. \label{DE-bootplot}}
\end{center}
\end{figure}
\index{Bootstrap approach|)}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_gam.Rnw 0000644 0001762 0000144 00000062345 14416236367 015123 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Generalized Additive Models}
%%\VignetteDepends{mgcv,rpart,wordcloud,mboost}
\setcounter{chapter}{9}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
library("mgcv")
library("mboost")
library("rpart")
library("wordcloud")
@
\chapter[Scatterplot Smoothers and Additive Models]{Scatterplot
Smoothers and Generalized Additive Models: The Men's Olympic 1500m,
Air Pollution in the US, Risk Factors for Kyphosis, and Women's Role in %'
Society \label{GAM}}
\section{Introduction}
\section{Scatterplot Smoothers and Generalized Additive Models}
\section{Analysis Using \R{}}
\subsection{Olympic 1500m Times}
To begin we will construct a scatterplot of winning time
against the year the games were held. The \R{} code and the resulting plot
are shown in Figure~\ref{GAM-men1500m-plot}. There is a very clear downward trend in
the times over the years, and, in addition, there is a very clear
outlier, namely the winning time for 1896. We shall remove this
time from the data set and now concentrate on the remaining times.
First we will fit a simple linear regression to the data and
plot the fit onto the scatterplot. The code and the resulting
plot are shown in Figure~\ref{GAM-men1500m-lm}. Clearly the linear regression
model captures in general terms the downward trend in the times.
Now we can add the fits given by the lowess smoother and by a
cubic spline smoother; the resulting graph and the extra \R{} code
needed are shown in Figure~\ref{GAM-men1500m-smooth}.
Both non-parametric fits suggest some distinct departure from
linearity, and clearly point to a quadratic model being more
sensible than a linear model here. And fitting a parametric model
that includes both a linear and a quadratic effect for the year gives a
prediction curve very similar to the non-parametric curves; see Figure~\ref{GAM-men1500m-quad}.
Here use of the non-parametric smoothers has effectively diagnosed
our linear model and pointed the way to using a more suitable
parametric model; this is often how such non-parametric models can be used most
effectively. For these data, of course, it is clear that the simple linear
model cannot be suitable if the investigator is interested in predicting
future times since even the most basic knowledge of human physiology
will tell us that times cannot continue to go down. There must be some
lower limit to the time man can run 1500m. But in other situations
use of the non-parametric smoothers may point to a parametric model
that could not have been identified \emph{a priori}.
\begin{figure}
\begin{center}
<>=
plot(time ~ year, data = men1500m, xlab = "Year",
ylab = "Winning time (sec)")
@
\caption{Scatterplot of year and winning time. \label{GAM-men1500m-plot}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
men1500m1900 <- subset(men1500m, year >= 1900)
men1500m_lm <- lm(time ~ year, data = men1500m1900)
plot(time ~ year, data = men1500m1900, xlab = "Year",
ylab = "Winning time (sec)")
abline(men1500m_lm)
@
\caption{Scatterplot of year and winning time with fitted values from
a simple linear model. \label{GAM-men1500m-lm}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
x <- men1500m1900$year
y <- men1500m1900$time
men1500m_lowess <- lowess(x, y)
plot(time ~ year, data = men1500m1900, xlab = "Year",
ylab = "Winning time (sec)")
lines(men1500m_lowess, lty = 2)
men1500m_cubic <- gam(y ~ s(x, bs = "cr"))
lines(x, predict(men1500m_cubic), lty = 3)
@
\caption{Scatterplot of year and winning time with fitted values from a smooth
non-parametric model. \label{GAM-men1500m-smooth}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
men1500m_lm2 <- lm(time ~ year + I(year^2),
data = men1500m1900)
plot(time ~ year, data = men1500m1900, xlab = "Year",
ylab = "Winning time (sec)")
lines(men1500m1900$year, predict(men1500m_lm2))
@
\caption{Scatterplot of year and winning time with fitted values from a
quadratic model. \label{GAM-men1500m-quad}}
\end{center}
\end{figure}
It is of some interest to look at the predictions of winning
times in future Olympics from both the linear and the quadratic model.
For example, for 2008 and 2012 the predicted times and their $95\%$
confidence intervals can be found using the following code:
\newpage
<<GAM-men1500m-predict, echo = TRUE>>=
predict(men1500m_lm,
newdata = data.frame(year = c(2008, 2012)),
interval = "confidence")
predict(men1500m_lm2,
newdata = data.frame(year = c(2008, 2012)),
interval = "confidence")
@
\newpage
For predictions far into the future both
the quadratic and the linear model fail; we leave readers to obtain
some more predictions to see what happens. We can compare
the first prediction with the time actually recorded by the winner
of the men's 1500m in Beijing 2008,
Rashid Ramzi from Bahrain, who won the event in $212.94$ seconds. The confidence
interval obtained from the simple linear model does not include this value,
but the confidence interval for the prediction
derived from the quadratic model does.
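Readers following the invitation above may find a sketch like the following
useful (not evaluated here; the years $2050$ and $2100$ are arbitrary
choices). The linear model continues its unbounded decline, whereas the
quadratic model, having positive curvature, will eventually predict times
that increase again:
<<GAM-men1500m-future, eval = FALSE>>=
### extrapolate both models far beyond the observed years
predict(men1500m_lm,
    newdata = data.frame(year = c(2050, 2100)),
    interval = "confidence")
predict(men1500m_lm2,
    newdata = data.frame(year = c(2050, 2100)),
    interval = "confidence")
@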
\subsection{Air Pollution in US Cities}
Unfortunately, we cannot fit an additive model describing
the $\text{SO}_2$ concentration based on all six
covariates, because this would require more parameters than there
are cities, i.e., more parameters than observations, when using the
default parameterization of \Rpackage{mgcv}. Thus, before we can apply
the \Rcmd{gam} function from package \Rpackage{mgcv}, we have
to decide which covariates should enter the model and which
subset of these covariates should be allowed to deviate from
a linear regression relationship.
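For illustration only, such a hand-picked specification might look as
follows (a sketch, not evaluated here; the choice of \Robject{temp},
\Robject{manu}, and \Robject{popul} is arbitrary and not a recommendation):
<<GAM-USairpollution-gamsketch, eval = FALSE>>=
### a smaller additive model with two smooth terms and one linear term
library("mgcv")
USair_gam_small <- gam(SO2 ~ s(temp) + manu + s(popul),
    data = USairpollution)
summary(USair_gam_small)
@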
As briefly discussed in Section~\ref{GAM:VS}, we can
fit an additive model using the iterative boosting algorithm
as described by \cite{HSAUR:BuehlmannHothorn2007}. The complexity
of the model is determined by an AIC criterion, which can also be used
to choose an appropriate number of boosting iterations.
The methodology is available
from package \Rpackage{mboost} \citep{PKG:mboost}. We start
with a small number of boosting iterations ($100$ by default) and
compute the AIC of the corresponding $100$ models:
<<GAM-USairpollution-boost, echo = TRUE>>=
library("mboost")
USair_boost <- gamboost(SO2 ~ ., data = USairpollution)
USair_aic <- AIC(USair_boost)
USair_aic
@
The AIC suggests that the boosting algorithm should be stopped
after $\Sexpr{mstop(USair_aic)}$ iterations. The partial contributions of each
covariate to the predicted $\text{SO}_2$ concentration
are given in Figure~\ref{GAM-USairpollution-boostplot}.
The plot indicates that all six covariates enter the model and
that selecting a subset of covariates for modeling
isn't appropriate in this case. However, the number of manufacturing
enterprises seems to add linearly to the $\text{SO}_2$ concentration, which
simplifies the model. Moreover, the contribution of average annual
precipitation seems to deviate from zero only for some extreme observations,
and one might refrain from using this covariate at all.
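Because the AIC search is restricted to the boosting iterations actually
computed, it can be worthwhile to refit with a larger iteration budget and
check that the suggested stopping point is stable; a sketch (not evaluated
here; the value $500$ is an arbitrary choice):
<<GAM-USairpollution-mstop, eval = FALSE>>=
### refit with more iterations and recompute the AIC-optimal stopping point
USair_boost500 <- gamboost(SO2 ~ ., data = USairpollution,
    control = boost_control(mstop = 500))
mstop(AIC(USair_boost500))
@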
\begin{figure}
\begin{center}
<<GAM-USairpollution-boostplot, echo = TRUE, fig = TRUE>>=
USair_gam <- USair_boost[mstop(USair_aic)]
layout(matrix(1:6, ncol = 3))
plot(USair_gam, ask = FALSE)
@
\caption{Partial contributions of the six explanatory covariates
to the predicted $\text{SO}_2$ concentration. \label{GAM-USairpollution-boostplot}}
\end{center}
\end{figure}
As always, an inspection of the model fit via a residual plot
is worth the effort. Here, we plot the fitted values against the residuals
and label the points with the name of the corresponding city using the \Rcmd{textplot} function from package \Rpackage{wordcloud}.
Figure~\ref{GAM-USairpollution-residplot} shows at least two
extreme observations. Chicago has a very large observed and fitted
$\text{SO}_2$ concentration, which is due to its huge number of inhabitants and
manufacturing plants (see also Figure~\ref{GAM-USairpollution-boostplot}).
One smaller city, Providence, is associated with a rather large
positive residual, indicating that its actual, rather high, $\text{SO}_2$
concentration is underestimated by the model.
Overall, the model doesn't fit the data very well,
so we should avoid overinterpreting its structure.
In addition, since each of the six covariates
contributes to the model, we aren't able to select a smaller subset
of the covariates for modeling, and thus fitting a model using
\Rcmd{gam} remains complicated (and would not add much knowledge anyway).
\begin{figure}
\begin{center}
<<GAM-USairpollution-residplot, echo = TRUE, fig = TRUE>>=
SO2hat <- predict(USair_gam)
SO2 <- USairpollution$SO2
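### enlarge the plotting limits so the textplot() city labels fit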
plot(SO2hat, SO2 - SO2hat, type = "n",
xlim = c(-20, max(SO2hat) * 1.1),
ylim = range(SO2 - SO2hat) * c(2, 1))
textplot(SO2hat, SO2 - SO2hat, rownames(USairpollution),
show.lines = FALSE, new = FALSE)
abline(h = 0, lty = 2, col = "grey")
@
\caption{Residual plot of $\text{SO}_2$ concentration.
\label{GAM-USairpollution-residplot}}
\end{center}
\end{figure}
\subsection{Risk Factors for Kyphosis}
\index{Spinogram}
Before modeling the relationship between kyphosis and
the three explanatory variables age, starting vertebral level of
the surgery, and number of vertebrae involved, we investigate the
partial associations by so-called \stress{spinograms}, as
introduced in \Sexpr{ch("DAGD")}. The
numeric explanatory covariates are discretized and their
empirical relative frequencies are plotted against the
conditional frequency of kyphosis in the corresponding
group. Figure~\ref{GAM-kyphosis-plot} shows that kyphosis
is absent in very young or very old children, children
with a high starting vertebral level, and children with a small
number of vertebrae involved.
\begin{figure}
\begin{center}
<<GAM-kyphosis-plot, echo = TRUE, fig = TRUE>>=
layout(matrix(1:3, nrow = 1))
spineplot(Kyphosis ~ Age, data = kyphosis,
ylevels = c("present", "absent"))
spineplot(Kyphosis ~ Number, data = kyphosis,
ylevels = c("present", "absent"))
spineplot(Kyphosis ~ Start, data = kyphosis,
ylevels = c("present", "absent"))
@
\caption{Spinograms of the three explanatory variables and the response variable
\Robject{kyphosis}. \label{GAM-kyphosis-plot}}
\end{center}
\end{figure}
The logistic additive model needed to describe the conditional
probability of kyphosis given the explanatory variables
can be fitted using function \Rcmd{gam}. Here, the dimension
of the basis ($k$) has to be modified for \Robject{Number} and
\Robject{Start} since these variables are heavily tied. As for
generalized linear models, the \Robject{family} argument determines
the type of model to be fitted, a logistic model in our case:
<<GAM-kyphosis-gam, echo = TRUE>>=
(kyphosis_gam <- gam(Kyphosis ~ s(Age, bs = "cr") +
s(Number, bs = "cr", k = 3) + s(Start, bs = "cr", k = 3),
family = binomial, data = kyphosis))
@
The partial contributions of each covariate to the conditional
probability of kyphosis with confidence bands are shown in
Figure~\ref{GAM-kyphosis-gamplot}. In essence, the same conclusions
as drawn from Figure~\ref{GAM-kyphosis-plot} can be stated here.
The risk of kyphosis being present decreases with higher
starting vertebral level and lower number of vertebrae involved.
Children about $100$ months old are under higher risk compared
to younger or older children.
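The fitted model can also be used to compute the estimated probability of
kyphosis for hypothetical children; for example (a sketch, not evaluated
here, with arbitrarily chosen covariate values):
<<GAM-kyphosis-predict, eval = FALSE>>=
### estimated probability of kyphosis for a hypothetical child
predict(kyphosis_gam,
    newdata = data.frame(Age = 100, Number = 4, Start = 5),
    type = "response")
@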
\begin{figure}
\begin{center}
<<GAM-kyphosis-gamplot, echo = TRUE, fig = TRUE>>=
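### inverse of the logit link: maps smooth effects to the probability scale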
trans <- function(x)
binomial()$linkinv(x)
layout(matrix(1:3, nrow = 1))
plot(kyphosis_gam, select = 1, shade = TRUE, trans = trans)
plot(kyphosis_gam, select = 2, shade = TRUE, trans = trans)
plot(kyphosis_gam, select = 3, shade = TRUE, trans = trans)
@
\caption{Partial contributions of the three explanatory variables
with confidence bands. \label{GAM-kyphosis-gamplot}}
\end{center}
\end{figure}
\subsection{Women's Role in Society} %'
In Chapter~\ref{GLM}, we saw that a logistic regression with an interaction between
gender and level of education described the data better than a main-effects only
model. Using an additive logistic regression model, we can fit separate, possibly
nonlinear, functions of levels of education to both genders:
<<GAM-womensrole-gam, echo = TRUE>>=
data("womensrole", package = "HSAUR3")
fm1 <- cbind(agree, disagree) ~ s(education, by = gender)
womensrole_gam <- gam(fm1, data = womensrole,
family = binomial())
@
The resulting model is best inspected by a plot
(Figure~\ref{GAM-womensrole-gamplot}). For males, the log-odds of agreement
decreases linearly with each additional year of education. For females,
the log-odds is more or less constant up to five years of education and only
then begins to decrease. After 15 years, there seems to be no further
impact on the log-odds. When we plot the resulting fitted probabilities in
a way similar to Figure~\ref{GLM-role2plot}, we see that the interaction is
even more pronounced in the additive model than in the linear one. The flat
curve for women with fewer than five years of education can be explained by
the rather large variability of the answers in this range, while the plateau
to the right is due to two groups of highly educated women with a rather
large proportion of agreement.
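A rough, informal way to compare the additive model with the interaction
model of Chapter~\ref{GLM} is via AIC; the sketch below (not evaluated here)
assumes the interaction model can be re-created by the \Rcmd{glm} call shown:
<<GAM-womensrole-AIC, eval = FALSE>>=
### compare the additive model with a parametric interaction model
womensrole_glm <- glm(cbind(agree, disagree) ~
    gender * education, data = womensrole,
    family = binomial())
AIC(womensrole_glm, womensrole_gam)
@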
\begin{figure}
\begin{center}
<<GAM-womensrole-gamplot, echo = TRUE, fig = TRUE>>=
layout(matrix(1:2, nrow = 1))
plot(womensrole_gam, select = 1, shade = TRUE)
plot(womensrole_gam, select = 2, shade = TRUE)
@
\caption{Effects of level of education for males (left) and females (right)
on the log-odds scale derived from an additive logistic regression model.
The shaded area denotes confidence bands. \label{GAM-womensrole-gamplot}}
\end{center}
\end{figure}
<<GAM-womensrole-myplot, echo = TRUE>>=
myplot <- function(role.fitted) {
f <- womensrole$gender == "Female"
plot(womensrole$education, role.fitted, type = "n",
ylab = "Probability of agreeing",
xlab = "Education", ylim = c(0,1))
lines(womensrole$education[!f], role.fitted[!f], lty = 1)
lines(womensrole$education[f], role.fitted[f], lty = 2)
lgtxt <- c("Fitted (Males)", "Fitted (Females)")
legend("topright", lgtxt, lty = 1:2, bty = "n")
y <- womensrole$agree / (womensrole$agree +
womensrole$disagree)
size <- womensrole$agree + womensrole$disagree
size <- size - min(size)
size <- (size / max(size)) * 3 + 1
text(womensrole$education, y, ifelse(f, "\\VE", "\\MA"),
family = "HersheySerif", cex = size)
}
@
\begin{figure}
\begin{center}
<<GAM-womensrole-probplot, echo = TRUE, fig = TRUE>>=
myplot(predict(womensrole_gam, type = "response"))
@
\caption{Probability of agreeing with the statement, as estimated by the
additive logistic regression model, plotted against level of education
for males and females. \label{GAM-womensrole-probplot}}
\end{center}
\end{figure}
\section{Summary of Findings}
\begin{description}
\item[Olympic 1500m times]
Here the use of a generalized additive model suggested that a quadratic
model might best describe the data. When such a model was fitted, it made
reasonable predictions of the winner's times in the Olympic Games of 2008
and 2012.
\item[Air pollution data]
Finding a suitable model for these data was problematic because of the
outliers in the data and the high correlations between some pairs of
explanatory variables. Except for wind, the fitted partial contributions
are well approximated by a linear function for most of the observations, and
it might be questioned whether the more complex additive model is really needed.
\item[Kyphosis]
The risk of kyphosis being present decreases with higher starting vertebral
level and lower number of vertebrae involved. Children about 100 months old
are under higher risk compared to younger or older children.
\item[Women's role in society]
For males, the log-odds of agreement decreases linearly with each
additional year of education. For females, the log-odds is more or less
constant up to five years of education and only then begins to decrease.
After $15$ years, there seems to be no further impact on the log-odds.
\end{description}
\section{Final Comments}
Additive models offer flexible modeling tools for regression
problems. They stand between generalized linear models, where
the regression relationship is assumed to be linear, and
more complex models like random forests (see \Sexpr{ch("RP")})
where the regression relationship remains unspecified. Smooth
functions describing the influence of covariates on the response
can be easily interpreted. Variable
selection is a technically difficult problem in this class
of models; boosting methods are one way of dealing with it.
\section*{Exercises}
\begin{description}
\exercise
Consider the body fat data introduced in \Sexpr{ch("RP")},
Table~\ref{RP-bodyfat-tab}. First fit a generalized additive
model assuming normal errors using function \Rcmd{gam}. Are
all potential covariates informative? Check the results
against a generalized additive model that underwent
AIC-based variable selection (fitted using function \Rcmd{gamboost}).
\exercise
Again fit an additive model to the body fat data, but this time for
a log-transformed response. Compare the two models; which one is more appropriate?
\exercise
Try to fit a logistic additive model to the glaucoma data
discussed in \Sexpr{ch("RP")}. Which covariates should enter the
model, and what is their influence on the probability of
suffering from glaucoma?
\exercise
Investigate the use of different types of scatterplot smoothers on the
Hubble data in Table~\ref{MLR-hubble-tab} in Chapter~\ref{MLR}.
\end{description}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_multidimensional_scaling.Rnw
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Multidimensional Scaling}
%%\VignetteDepends{ape,wordcloud,MASS}
\setcounter{chapter}{19}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<<setup, echo = FALSE, results = hide>>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<<singlebook, echo = FALSE>>=
book <- FALSE
@
<<MDS-library, echo = FALSE, results = hide>>=
x <- library("ape")
library("wordcloud")
@
\chapter[Multidimensional Scaling]{Multidimensional Scaling: British Water
Voles and Voting in US Congress \label{MDS}}
\section{Introduction}
\section{Multidimensional Scaling}
\section{Analysis Using \R{}}
We can apply classical scaling to the distance matrix for
populations of water voles using the \R{} function \Rcmd{cmdscale}.
The following code finds the classical scaling solution and computes
the two criteria for assessing the required number of dimensions
as described above.
<<MDS-watervoles-cmdscale, echo = TRUE>>=
data("watervoles", package = "HSAUR3")
voles_mds <- cmdscale(watervoles, k = 13, eig = TRUE)
voles_mds$eig
@
Note that some of the eigenvalues are negative.
The criterion $P_2$ can be computed by
<<MDS-watervoles-P2, echo = TRUE>>=
sum(abs(voles_mds$eig[1:2]))/sum(abs(voles_mds$eig))
@
and the criterion suggested by \cite{HSAUR:Mardiaetal1979} is
<<MDS-watervoles-Mardia, echo = TRUE>>=
sum((voles_mds$eig[1:2])^2)/sum((voles_mds$eig)^2)
@
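The first criterion can also be evaluated for all possible numbers of
dimensions in a single step; a sketch (not evaluated here):
<<MDS-watervoles-cumcrit, eval = FALSE>>=
### cumulative version of the first criterion for 1, 2, ... dimensions
cumsum(abs(voles_mds$eig)) / sum(abs(voles_mds$eig))
@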
The two criteria for judging the number of dimensions differ considerably, but
both values are reasonably large, suggesting that the original distances
between the water vole populations can be represented adequately in two
dimensions. The two-dimensional solution can be plotted by extracting the
coordinates from the \Robject{points} element of the \Robject{voles\_mds}
object; the plot is shown in Figure~\ref{MDS-watervoles-plot}. The
\Rcmd{textplot} function from package \Rpackage{wordcloud} can be used to
annotate the plot with non-overlapping text.
\begin{figure}
\begin{center}
<<MDS-watervoles-plot, echo = TRUE, fig = TRUE>>=
x <- voles_mds$points[,1]
y <- voles_mds$points[,2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2",
xlim = range(x)*1.2, type = "n")
textplot(x, y, words = colnames(watervoles), new = FALSE)
@
\caption{Two-dimensional solution from classical multidimensional scaling of
distance matrix for water vole populations. \label{MDS-watervoles-plot}}
\end{center}
\end{figure}
The minimum spanning tree computed from the distance matrix (using the
\Rcmd{mst} function from package \Rpackage{ape}) can be superimposed on the
scaling solution as an informal check of its adequacy; edges joining
populations that lie far apart in the plot would indicate that the
two-dimensional representation distorts the original distances
(Figure~\ref{MDS-watervoles-mst}).
\begin{figure}
\begin{center}
<<MDS-watervoles-mst, echo = TRUE, fig = TRUE>>=
library("ape")
st <- mst(watervoles)
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2",
xlim = range(x)*1.2, type = "n")
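### draw a line segment for each pair of populations joined in the tree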
for (i in 1:nrow(watervoles)) {
w1 <- which(st[i, ] == 1)
segments(x[i], y[i], x[w1], y[w1])
}
textplot(x, y, words = colnames(watervoles), new = FALSE)
@
\caption{Minimum spanning tree for the \Robject{watervoles} data.
\label{MDS-watervoles-mst}}
\end{center}
\end{figure}
We shall now apply non-metric scaling to the voting behavior shown in
Table~\ref{MDS-voting-tab}. Non-metric scaling is available with
function \Rcmd{isoMDS} from package \Rpackage{MASS}
\citep{HSAUR:VenablesRipley2002}:
<<MDS-voting-isoMDS, echo = TRUE>>=
library("MASS")
data("voting", package = "HSAUR3")
voting_mds <- isoMDS(voting)
@
and we again depict the two-dimensional solution
(Figure~\ref{MDS-voting-plot}). The figure suggests that voting behavior is
essentially along party lines, although there is more variation among
Republicans. The voting behavior of one of the Republicans (Rinaldo)
seems to be closer to that of his Democratic colleagues than to the voting
behavior of the other Republicans.
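The adequacy of the two-dimensional solution can also be judged by the
stress value stored in the result of \Rcmd{isoMDS}, where lower values
indicate a better fit; a sketch (not evaluated here):
<<MDS-voting-stress, eval = FALSE>>=
### final stress (in percent) of the non-metric scaling solution
voting_mds$stress
@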
\begin{figure}
\begin{center}
<<MDS-voting-plot, echo = TRUE, fig = TRUE>>=
x <- voting_mds$points[,1]
y <- voting_mds$points[,2]
plot(x, y, xlab = "Coordinate 1", ylab = "Coordinate 2",
xlim = range(voting_mds$points[,1])*1.2, type = "n")
textplot(x, y, words = colnames(voting), new = FALSE)
voting_sh <- Shepard(voting[lower.tri(voting)],
voting_mds$points)
@
\caption{Two-dimensional solution from non-metric multidimensional scaling of
distance matrix for voting matrix. \label{MDS-voting-plot}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<<MDS-voting-shepard, echo = TRUE, fig = TRUE>>=
plot(voting_sh, pch = ".", xlab = "Dissimilarity",
ylab = "Distance", xlim = range(voting_sh$x),
ylim = range(voting_sh$x))
lines(voting_sh$x, voting_sh$yf, type = "S")
@
\caption{The Shepard diagram for the \Robject{voting} data shows some
discrepancies between the original dissimilarities and the
multidimensional scaling solution. \label{MDS-voting-shepard}}
\end{center}
\end{figure}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/chapman.cls
% CHAPMAN.STY
% v0.17 --- released 6th April 1993
% v0.16 --- released 11th November 1991
% v0.15 --- released 8th November 1991
% v0.14 --- first release 3rd November 1991
%
% A LaTeX style file for Chapman and Hall books
% Copyright 1993 Cambridge University Press
%Modified Sept 1995 to work under Latex 2e
%
% based on the BOOK DOCUMENT STYLE -- Released 26 April 88
% for LaTeX version 2.09
% Copyright (C) 1988 by Leslie Lamport
%
\typeout{Document Style `chapman' v0.17 <6th April 1993>}
%
% Books use two-sided printing.
%
%\usepackage{times,mathtime}%for latex 2e user to use mathtimes font
\@twosidetrue
\@mparswitchtrue
%
% draft option
%
\def\ds@draft{\overfullrule 5pt}
\@options
% ****************************************
% * FONTS *
% ****************************************
%
\lineskip 1pt
\normallineskip 1pt
\def\baselinestretch{1}
\def\normalsize{\@setsize\normalsize{12pt}\xpt\@xpt
\abovedisplayskip 6pt plus 1pt minus 1pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus3pt%
\belowdisplayshortskip 3.25pt plus 1pt minus 1pt%
\let\@listi\@listI}
\def\small{\@setsize\small{11pt}\ixpt\@ixpt
\abovedisplayskip 5.5pt plus 2pt minus 2pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus3pt%
\belowdisplayshortskip 3.25pt plus 1pt minus 1pt%
\def\@listi{\leftmargin\leftmargini
\topsep 3pt plus 2pt minus 2pt\parsep 2pt plus 1pt minus 1pt
\itemsep \z@ plus 2pt}}
\def\footnotesize{\@setsize\footnotesize{9pt}\viiipt\@viiipt
\abovedisplayskip 5pt plus 2pt minus 2pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip \z@ plus 1pt%
\belowdisplayshortskip 3pt plus 1pt minus 2pt
\def\@listi{\leftmargin\leftmargini
\topsep 3pt plus 1pt minus 1pt\parsep 2pt plus 1pt minus 1pt
\itemsep \z@ plus 2pt}}
\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt}
\def\tiny{\@setsize\tiny{6pt}\vpt\@vpt}
\def\large{\@setsize\large{14pt}\xiipt\@xiipt}
\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt}
\def\LARGE{\@setsize\LARGE{19pt}\xviipt\@xviipt}
\def\huge{\@setsize\huge{22pt}\xxpt\@xxpt}
\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt}
\normalsize
% ****************************************
% * PAGE LAYOUT *
% ****************************************
%
% All margin dimensions measured from a point one inch from top and side
% of page.
%
% SIDE MARGINS:
%
\oddsidemargin 6pc %5pc
\evensidemargin 5.7pc %5pc
\marginparwidth 4pc
\marginparsep 1pc
\topmargin 12pt %0pt
\headheight 12pt
\headsep 8pt
\footskip 2pc
%
% DIMENSION OF TEXT:
%
\textheight = 45\baselineskip
%\advance\textheight by \topskip
\addtolength\textheight{3pt}
\textwidth 28pc
\addtolength\textwidth{.5pt}
% \textheight = 43\baselineskip
% %\advance\textheight by \topskip
%\addtolength\textheight{3pt}
% \textwidth 26pc
%\addtolength\textwidth{.5pt}
\columnsep 1pc
\columnseprule 0pt
%
% FOOTNOTES
%
\footnotesep 6.65pt
\skip\footins 12pt plus 3pt minus 1.5pt
%
% FLOATS
%
% FOR FLOATS ON A TEXT PAGE:
% ONE-COLUMN MODE OR SINGLE-COLUMN FLOATS IN TWO-COLUMN MODE:
\floatsep 12pt plus 2pt minus 2pt
\textfloatsep 18pt plus 2pt minus 4pt
\intextsep 12pt plus 2pt minus 2pt
% TWO-COLUMN FLOATS IN TWO-COLUMN MODE:
\dblfloatsep 12pt plus 2pt minus 2pt
\dbltextfloatsep 18pt plus 2pt minus 4pt
%
% FOR FLOATS ON A SEPARATE FLOAT PAGE OR COLUMN:
% ONE-COLUMN MODE OR SINGLE-COLUMN FLOATS IN TWO-COLUMN MODE:
\@fptop 0pt plus 0fil
\@fpsep 12pt plus 0fil
\@fpbot 0pt plus 3fil
%
% DOUBLE-COLUMN FLOATS IN TWO-COLUMN MODE.
\@dblfptop 0pt plus 0fil
\@dblfpsep 12pt plus 0fil
\@dblfpbot 0pt plus 3fil
%
% MARGINAL NOTES:
%
\marginparpush 5pt
% ****************************************
% * PARAGRAPHING *
% ****************************************
%
\parskip 0pt plus .25pt
\parindent 1em
\partopsep 2pt plus 1pt minus 1pt
%
% The following page-breaking penalties are defined
%
\@lowpenalty 51
\@medpenalty 151
\@highpenalty 301
\@beginparpenalty -\@lowpenalty
\@endparpenalty -\@lowpenalty
\@itempenalty -\@lowpenalty
%
\clubpenalty=0 % 'Club line' at bottom of page.
\widowpenalty=10000 % 'Widow line' at top of page.
% \displaywidowpenalty % Math display widow line.
% \predisplaypenalty % Breaking before a math display.
% \postdisplaypenalty % Breaking after a math display.
% \interlinepenalty % Breaking at a line within a paragraph.
% \brokenpenalty % Breaking after a hyphenated line.
%
\def\thin@rule{{\parindent0pt\par\rule{\textwidth}{0.5pt}\par}}
\def\thick@rule{{\parindent0pt\par\rule{\textwidth}{1pt}\par}}
% ****************************************
% * CHAPTERS AND SECTIONS *
% ****************************************
%
% DEFINE COUNTERS:
%
\newcounter{part}
\newcounter{chapter}
\newcounter{section}[chapter]
\newcounter{subsection}[section]
\newcounter{subsubsection}[subsection]
\newcounter{paragraph}[subsubsection]
\newcounter{subparagraph}[paragraph]
\def\thepart {\Roman{part}}
\def\thechapter {\arabic{chapter}}
\def\thesection {\thechapter.\arabic{section}}
\def\thesubsection {\thesection.\arabic{subsection}}
\def\thesubsubsection{\thesubsection .\arabic{subsubsection}}
\def\theparagraph {\thesubsubsection.\arabic{paragraph}}
\def\thesubparagraph {\theparagraph.\arabic{subparagraph}}
\def\@chapapp{CHAPTER}
% ****************************************
% * PARTS *
% ****************************************
%
\def\part{%
\cleardoublepage
\thispagestyle{empty}%
\if@twocolumn
\onecolumn \@tempswatrue
\else
\@tempswafalse
\fi
\secdef\@part\@spart
}
%
% Heading for the \part command.
%
\def\@part[#1]#2{%
\ifnum \c@secnumdepth >-2\relax
\refstepcounter{part}%
\addcontentsline{toc}{part}{\thepart \hspace{1em}#1}%
\typeout{PART \number\c@part.}%
\else
\addcontentsline{toc}{part}{#1}%
\fi
\markboth{}{}%
\vspace*{-17pt}%
\vbox{\thin@rule\par
\parindent 0pt \centering
\vskip 17pt%
\ifnum \c@secnumdepth >\m@ne
\normalfont PART \thepart\par
\else
\normalfont \phantom{PART \thepart}\par
\fi
\vskip 17pt%
\LARGE \bfseries #1\par
\nobreak
\addvspace{-4pt}%
\thick@rule
\vskip 25pt}%
\@endpart
}
%
% Heading for the \part* command.
%
\def\@spart#1{%
\vspace*{-17pt}%
\vbox{\thin@rule\par
\parindent 0pt \centering
\vskip 17pt%
\normalfont\phantom{PART \thepart}\par
\vskip 17pt%
\LARGE \bfseries #1\par
\nobreak
\addvspace{-4pt}%
\thick@rule
\vskip 25pt}%
\@endpart
}
%
% \@endpart finishes the part page.
%
\def\@endpart{%
\vfil\newpage
\if@twoside
\hbox{}%
\thispagestyle{empty}%
\newpage
\fi
\if@tempswa \twocolumn\fi
}
% ****************************************
% * CHAPTERS *
% ****************************************
%
% Chapter text macros
%
\newif\if@chptxt
\newbox\@chptxtbox
%
\def\chaptertext{\global\@chptxttrue\global\setbox\@chptxtbox=\vbox\bgroup%
\hsize=\textwidth\normalfont\small\noindent\ignorespaces}
\def\endchaptertext{\egroup}
%
% Heading for the \chapter command.
%
\def\@makechapterhead#1{%
\vspace*{-17pt}%
\vbox{\thin@rule\par
\parindent 0pt \centering
\vskip 17pt%
\ifnum \c@secnumdepth >\m@ne
\normalfont \@chapapp{} \thechapter\par
\else
\normalfont \phantom{\@chapapp{} \thechapter}\par
\fi
\vskip 17pt%
\LARGE \bfseries #1\par
\nobreak
\addvspace{-4pt}%
\thick@rule
\if@chptxt
\addvspace{24pt plus 2pt}\unvbox\@chptxtbox
\addvspace{6pt}\global\@chptxtfalse
\else
\vskip 23.5pt%
\fi}%
}
%
% Heading for the \chapter* command.
%
\def\@makeschapterhead#1{%
\vspace*{-17pt}%
\vbox{\thin@rule\par
\parindent 0pt \centering
\vskip 17pt%
\normalfont\phantom{\@chapapp{} \thechapter}\par
\vskip 17pt%
\LARGE \bf #1\par
\nobreak
\addvspace{-4pt}%
\thick@rule
\if@chptxt
\addvspace{24pt plus 2pt}\unvbox\@chptxtbox
\addvspace{6pt}\global\@chptxtfalse
\else
\vskip 23.5pt%
\fi}%
}
%
% \secdef{UNSTARCMDS}{STARCMDS} :
%
\def\chapter{%
\cleardoublepage
\thispagestyle{plain}%
\global\@topnum\z@
\@afterindentfalse
\secdef\@chapter\@schapter
}
%
\def\@chapter[#1]#2{%
\ifnum \c@secnumdepth >\m@ne
\refstepcounter{chapter}%
\typeout{\@chapapp\space\thechapter.}%
\addcontentsline{toc}{chapter}{\protect\numberline{\thechapter}#1}%
\else
\addcontentsline{toc}{chapter}{#1}%
\fi
\chaptermark{#1}%
\addtocontents{lof}{\protect\addvspace{10pt}}%
\addtocontents{lot}{\protect\addvspace{10pt}}%
\if@twocolumn
\@topnewpage[\@makechapterhead{#2}]%
\else
\@makechapterhead{#2}\@afterheading
\fi
}
%
\def\@schapter#1{%
\chaptermark{#1}%
\addtocontents{lof}{\protect\addvspace{10pt}}%
\addtocontents{lot}{\protect\addvspace{10pt}}%
\thispagestyle{empty}%
%% \if@nocntentry
%% \else
%% \addcontentsline{toc}{chapter}{#1}%
%% \fi
\if@twocolumn
\@topnewpage[\@makeschapterhead{#1}]%
\else
\@makeschapterhead{#1}\@afterheading
\fi
}
% ****************************************
% * SECTIONS *
% ****************************************
%
% \@startsection {NAME}{LEVEL}{INDENT}{BEFORESKIP}{AFTERSKIP}{STYLE}
% optional * [ALTHEADING]{HEADING}
%
\def\section{\@startsection{section}{1}{\z@}
{-1.5pc plus -1pt minus -2pt}
{6pt plus 1pt}
{\normalsize\bf\raggedright}}
\def\subsection{\@startsection{subsection}{2}{\z@}
{-1.5pc plus -1pt minus -2pt}
{6pt plus 1pt}
{\normalsize\it\raggedright}}
\def\subsubsection{\@startsection{subsubsection}{3}{\z@}
{-1.0pc plus -1pt minus -2pt}
{6pt plus 1pt}
{\normalsize\it\raggedright}}
\def\paragraph{\@startsection{paragraph}{4}{\z@}
{3.25pt plus 1pt minus .2pt}
{-1em}
{\normalsize\it}}
\def\subparagraph{\@startsection{subparagraph}{4}{\parindent}
{3.25pt plus 1pt minus.2pt}
{-1em}
{\normalsize\normalfont}}
%
% Default initializations of \...mark commands
%
\def\chaptermark#1{}
\setcounter{secnumdepth}{2}
%
% APPENDIX
%
\def\appendix{\par
\setcounter{chapter}{0}
\setcounter{section}{0}
\def\@chapapp{APPENDIX}
\def\thechapter{\Alph{chapter}}}
% ****************************************
% * LISTS *
% ****************************************
%
\leftmargini 1pc
\leftmarginii 1pc
\leftmarginiii 1pc
\leftmarginiv 1pc
\leftmarginv 1pc
\leftmarginvi 1pc
\leftmargin\leftmargini
\labelsep 0.5em
\labelwidth\leftmargini\advance\labelwidth-\labelsep
\def\@listI{\leftmargin\leftmargini \parsep 3pt plus 1pt minus 1pt%
\topsep 3pt plus 1pt minus 2pt%
\itemsep \z@ plus 2pt}
\let\@listi\@listI
\@listi
\def\@listii{\leftmargin\leftmarginii
\labelwidth\leftmarginii\advance\labelwidth-\labelsep
\topsep 3pt plus 2pt minus 1pt
\parsep 2pt plus 1pt minus 1pt
\itemsep \z@ plus 2pt}
\def\@listiii{\leftmargin\leftmarginiii
\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
\topsep 3pt plus 1pt minus 1pt
\parsep \z@ \partopsep 1pt plus 0pt minus 1pt
\itemsep \z@ plus 2pt}
\def\@listiv{\leftmargin\leftmarginiv
\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
\def\@listv{\leftmargin\leftmarginv
\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
\def\@listvi{\leftmargin\leftmarginvi
\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
%
% ENUMERATE -- with optional argument to set left margin
%
% label macros for Range-Left and Range-Right labels
\def\makeRLlabel#1{\rlap{#1}\hss}
\def\makeRRlabel#1{\hss\llap{#1}}
%
\def\enumerate{\ifnum \@enumdepth >3 \@toodeep \else
\advance\@enumdepth \@ne
\edef\@enumctr{enum\romannumeral\the\@enumdepth}%
\fi
\@ifnextchar [{\@enumeratetwo}{\@enumerateone}%
}
\def\@enumeratetwo[#1]{%
\list{\csname label\@enumctr\endcsname}%
{\settowidth\labelwidth{[#1]}
\leftmargin\labelwidth \advance\leftmargin\labelsep
\usecounter{\@enumctr}
\let\makelabel\makeRRlabel}
}
\def\@enumerateone{%
\list{\csname label\@enumctr\endcsname}%
{\usecounter{\@enumctr}
\let\makelabel\makeRRlabel}}
%
\def\labelenumi{\theenumi}
\def\theenumi{\arabic{enumi}.}
\def\labelenumii{\theenumii}
\def\theenumii{(\alph{enumii})}
\def\p@enumii{\theenumi}
\def\labelenumiii{\theenumiii}
\def\theenumiii{\roman{enumiii}.}
\def\p@enumiii{\theenumi(\theenumii)}
\def\labelenumiv{\theenumiv}
\def\theenumiv{\Alph{enumiv}.}
\def\p@enumiv{\p@enumiii\theenumiii}
%
% ITEMIZE
%
\def\labelitemi{$\bullet$}
\def\labelitemii{\bf --}
\def\labelitemiii{$\ast$}
\def\labelitemiv{$\cdot$}
%
% VERSE
%
\def\verse{\let\\=\@centercr
\list{}{\itemsep\z@ \itemindent -1em\listparindent \itemindent
\rightmargin\leftmargin\advance\leftmargin 1em}\item[]}
\let\endverse\endlist
%
% QUOTATION
%
\def\quotation{\list{}{\listparindent 1em
\itemindent\listparindent
\rightmargin\z@
\parsep 0pt plus 1pt}\item[]\small}
\let\endquotation=\endlist
%
% QUOTE
%
\def\quote{\list{}{\rightmargin\z@}\item[]\small}
\let\endquote=\endlist
%
% DESCRIPTION
%
\def\descriptionlabel#1{\hspace\labelsep \bf #1}
\def\description{\list{}{\labelwidth\z@ \itemindent-\leftmargin
\let\makelabel\descriptionlabel}}
\let\enddescription\endlist
\newdimen\descriptionmargin
\descriptionmargin=3em
% ****************************************
% * OTHER ENVIRONMENTS *
% ****************************************
%
% PROOF
\def\proof{\normalfont \trivlist \item[\hskip \labelsep{\itshape Proof.}]}
\def\endproof{\hspace*{1em}{\begin{picture}(6.5,6.5)%
\put(0,0){\framebox(6.5,6.5){}}\end{picture}}\endtrivlist}
\@namedef{proof*}{\normalfont\trivlist \item[\hskip \labelsep{\itshape
Proof.}]}
\@namedef{endproof*}{\endtrivlist}
\def\proofbox{\begin{picture}(6.5,6.5)%
\put(0,0){\framebox(6.5,6.5){}}\end{picture}}
%
% ARRAY AND TABULAR
%
\arraycolsep 5pt
\tabcolsep 6pt
\arrayrulewidth .5pt
\doublerulesep 0pt
%
% TABBING
%
\tabbingsep \labelsep
%
% MINIPAGE
%
% \skip\@mpfootins : plays same role for footnotes in a minipage as
% \skip\footins does for ordinary footnotes
\skip\@mpfootins = \skip\footins
\def\thempfootnote{\mbox{{$\fnsymbol{mpfootnote}$}}}
%
% FRAMEBOX
%
\fboxsep = 3pt
\fboxrule = .5pt
% ****************************************
% * TABLE OF CONTENTS, ETC. *
% ****************************************
%
\def\@pnumwidth{2.5em}
\def\@tocrmarg {2.55em}
\def\@dotsep{4.5}
\setcounter{tocdepth}{1}
%
% \@dottedtocline{LEVEL}{INDENT}{NUMWIDTH}{TITLE}{PAGE} :
%
\def\@dottedtocline#1#2#3#4#5{%
\ifnum #1>\c@tocdepth
\else
\vskip \z@ plus .2pt
{\leftskip #2\relax \rightskip \@tocrmarg plus2em% v.0.16
\parfillskip -\rightskip
\parindent #2\relax
\@afterindenttrue
\interlinepenalty\@M
\leavevmode
\@tempdima #3\relax \advance\leftskip \@tempdima \hbox{}\hskip -\leftskip
#4\nobreak
% \leaders\hbox{$\m@th \mkern \@dotsep mu.\mkern \@dotsep mu$}
\hfill \nobreak
\hbox to\@pnumwidth{\hfil\normalfont #5}\par}%
\fi
}
% TABLEOFCONTENTS
%
\newif\if@nocntentry
%
\def\tableofcontents{\@restonecolfalse
\if@twocolumn
\@restonecoltrue\onecolumn
\fi
\@nocntentrytrue
\chapter*{Contents}%
\@nocntentryfalse
% \@mkboth{Contents}{Contents}%
\@starttoc{toc}%
\if@restonecol\twocolumn\fi
}
\def\l@chapter#1#2{\pagebreak[3]
\vskip 12pt plus 1pt
\@tempdima 1.5em
\begingroup
\parindent \z@
\rightskip \@pnumwidth
\parfillskip -\@pnumwidth
\bf \leavevmode
\advance\leftskip\@tempdima
\hskip -\leftskip
{\raggedright #1}\nobreak
\hfil \nobreak\hbox to\@pnumwidth{\hss #2}\par
\endgroup}
%
\let\l@part=\l@chapter
%
\def\l@section {\@dottedtocline{1}{15.0pt}{22.5pt}}
\def\l@subsection {\@dottedtocline{2}{37.5pt}{30.0pt}}
\def\l@subsubsection{\@dottedtocline{3}{67.5pt}{20.0pt}}
\def\l@paragraph {\@dottedtocline{4}{87.5pt}{20.0pt}}
\def\l@subparagraph {\@dottedtocline{5}{107.5pt}{20.0pt}}
%
% The default width of TOC entries for sections in CHAPMAN.STY
% will only cater for sections with numbers up to 10.9. Numbers larger
% than this result in the section number leaving no space between the
% number and the title.
%
% This can be fixed by placing the \widetocentries command before
% the \tableofcontents command (but after the \documentstyle line).
%
\def\widetocentries{%
\def\l@section {\@dottedtocline{1}{15.0pt}{27.5pt}}%
\def\l@subsection {\@dottedtocline{2}{42.5pt}{40.0pt}}%
\def\l@subsubsection{\@dottedtocline{3}{82.5pt}{20.0pt}}%
\def\l@paragraph {\@dottedtocline{4}{102.5pt}{20.0pt}}%
\def\l@subparagraph {\@dottedtocline{5}{120.5pt}{20.0pt}}%
}
%
% LIST OF FIGURES
%
\def\listoffigures{\@restonecolfalse
\if@twocolumn \@restonecoltrue\onecolumn \fi
\chapter*{List of Figures}
% \@mkboth{List of Figures}{List of Figures}
\@starttoc{lof}
\if@restonecol \twocolumn \fi
}
\def\l@figure{\@dottedtocline{1}{1.5em}{2.3em}}
%
% LIST OF TABLES
%
\def\listoftables{\@restonecolfalse
\if@twocolumn \@restonecoltrue \onecolumn \fi
\chapter*{List of Tables}
% \@mkboth{List of Tables}{List of Tables}
\@starttoc{lot}
\if@restonecol \twocolumn\fi
}
\let\l@table\l@figure
% ****************************************
% * BIBLIOGRAPHY *
% ****************************************
%
\newcounter{dummy}
%
\def\thebibliography#1{%
\chapter*{References}
% \@mkboth{References}{References}
\typeout{References.}
\list{}{\labelwidth\z@
\leftmargin 1em
\itemsep \z@ plus .1pt
\itemindent-\leftmargin
\usecounter{dummy}}
\small
\parindent\z@
\parskip\z@ plus .1pt\relax
\def\newblock{\hskip .11em plus .33em minus .07em}
\sloppy\clubpenalty4000\widowpenalty4000
\sfcode`\.=1000\relax
}
\let\endthebibliography=\endlist
% \def\@biblabel#1{[#1]\hfill}
% \def\@cite#1{[#1]}
% ****************************************
% * THE INDEX *
% ****************************************
%
% The theindex, theauthorindex and thesubjectindex environment's
%
\newif\if@restonecol
\newif\if@royalflag
\def\theindex{\the@index{Index}}
\def\endtheindex{\par\endthe@index}
\def\theauthorindex{\the@index{Author index}}
\def\endtheauthorindex{\par\endthe@index}
\def\thesubjectindex{\the@index{Subject index}}
\def\endthesubjectindex{\par\endthe@index}
\def\the@index#1{\@restonecoltrue\if@twocolumn\@restonecolfalse\fi
\columnseprule \z@
\columnsep 1pc%
%%\twocolumn[\vspace*{11pt}\@makeschapterhead{#1}]%
\twocolumn[\vspace*{11pt}]
\if@royalflag % If royal 1 or 2 is in use
\chapter*{Index}%
%%TH \@mkboth{#1}{#1}%
\else
\chapter*{Index}%
%%TH \@mkboth{\uppercase{#1}}{\uppercase{#1}}%
\fi
\typeout{#1.}%
%%TH \addcontentsline{toc}{chapter}{#1}%
%%TH \thispagestyle{empty}%
\parindent\z@
\parskip\z@ plus .3pt%
\small\raggedright
\relax
\let\item\@idxitem
}
\def\endthe@index{\if@restonecol\onecolumn\else\clearpage\fi}
\def\@idxitem{\par\hangindent 10pt}
\def\subitem{\par\hangindent 20pt \hspace*{10pt}}
\def\subsubitem{\par\hangindent 30pt \hspace*{20pt}}
\def\indexspace{\par\vskip 16pt plus 2pt minus 2pt\relax}
% ****************************************
% * FOOTNOTES *
% ****************************************
%
\newskip\@footindent
\@footindent=1em
\def\footnoterule{\kern-3\p@ \hrule width 0\columnwidth \kern 2.6\p@}
\@addtoreset{footnote}{chapter}
\long\def\@makefntext#1{\@setpar{\@@par\@tempdima \hsize
\advance\@tempdima-\@footindent
\parshape \@ne \@footindent \@tempdima}\par
\noindent \hbox to \z@{\hss$^{\@thefnmark}$\ }#1}
\renewcommand{\thefootnote}{\mbox{{$\fnsymbol{footnote}$}}}
\def\@fnsymbol#1{\ifcase#1\or * \or \dagger\or \ddagger\or
\S \or \P \or \|\or **\or \dagger\dagger
\or \ddagger\ddagger \or \S\S \or \P\P \else\@ctrerr\fi\relax}
% ****************************************
% * FIGURES AND TABLES *
% ****************************************
%
% Float placement parameters.
%
\setcounter{topnumber}{2}
\def\topfraction{.9}
\setcounter{bottomnumber}{2}
\def\bottomfraction{.5}
\setcounter{totalnumber}{4}
\def\textfraction{.1}
\def\floatpagefraction{.8}
\setcounter{dbltopnumber}{2}
\def\dbltopfraction{.9}
\def\dblfloatpagefraction{.8}
%
% \@makecaption{NUMBER}{TEXT} : Macro to make a figure or table caption.
%
%\long\def\@makecaption#1#2{%
% \vskip 10pt%
% \setbox\@tempboxa\hbox{\small \normalfont #1\enskip \itshape #2}%
% \ifdim \wd\@tempboxa >\hsize
% \small \normalfont #1\enskip \itshape #2\par
% \else
% \hbox to\hsize{\hfil\box\@tempboxa\hfil}%
% \fi%
%}
\long\def\@makecaption#1#2{%
\vskip 10pt%
\setbox\@tempboxa\hbox{\small \normalfont #1\unskip\hskip10pt #2}%
\ifdim \wd\@tempboxa >\hsize
\small \normalfont
\@hangfrom{#1\unskip\hskip10pt\ignorespaces}#2\par
\else
\hbox to\hsize{\hfil\box\@tempboxa\hfil}%
\fi%
}
%
% FIGURE
%
\newcounter{figure}[chapter]
\def\thefigure{\thechapter.\@arabic\c@figure}
\def\fps@figure{tbp}
\def\ftype@figure{1}
\def\ext@figure{lof}
\def\fnum@figure{{\bf Figure \thefigure}}
\def\figure{\@float{figure}}
\let\endfigure\end@float
\@namedef{figure*}{\@dblfloat{figure}}
\@namedef{endfigure*}{\end@dblfloat}
%
% TABLE
%
\newcounter{table}[chapter]
\def\thetable{\thechapter.\@arabic\c@table}
\def\fps@table{tbp}
\def\ftype@table{2}
\def\ext@table{lot}
\def\fnum@table{{\bf Table \thetable}}
\def\table{\@float{table}}
\let\endtable\end@float
\@namedef{table*}{\@dblfloat{table}}
\@namedef{endtable*}{\end@dblfloat}
% ****************************************
% * TITLE *
% ****************************************
%
% TITLEPAGE
%
\def\titlepage{\@restonecolfalse
\if@twocolumn \@restonecoltrue\onecolumn
\else \newpage
\fi
\thispagestyle{empty}
}
\def\endtitlepage{\if@restonecol\twocolumn \else \newpage \fi}
%
\def\maketitle{\make@cornermarks\begin{titlepage}
\let\footnotesize\small
\let\footnoterule\relax
\setcounter{page}{1}
\null
\vspace*{-17pt}%
{\parindent 0pt \centering
\par
\LARGE \bfseries \@title
\par
\nobreak
\vskip 0pt
\thick@rule
\vskip 25pt
\par
\large \normalfont
\begin{tabular}[t]{c}
\@author
\end{tabular}\par
}
\vfill
\@thanks
\null
\end{titlepage}
\setcounter{footnote}{0}
\let\thanks\relax
\gdef\@thanks{}
\gdef\@author{}
\gdef\@title{}
\let\maketitle\relax
}
% ****************************************
% * PAGE STYLES *
% ****************************************
\def\cleardoublepage{%
\clearpage
\if@twoside
\ifodd\c@page
\else
\hbox{}%
\pagestyle{empty}%
\newpage
\if@twocolumn
\hbox{}%
\newpage\fi\fi\fi}
\newdimen\htrim
\newdimen\vtrimtop
\newdimen\vtrimbot
% \htrim.75in
% \vtrimtop.8607in
% \vtrimbot1.027in
% \hoffset-.49in
% \voffset-.63in%.04in
\htrim4.42pc
\vtrimtop6.26pc
\vtrimbot6.37pc
% \hoffset-5pt
\voffset39pt
%\fi
\newsavebox\ul@box
\newsavebox\ur@box
\newsavebox\ll@box
\newsavebox\lr@box
\def\top@cornermarks{%
\hskip-\htrim
\vbox to 0\p@{\vskip-\vtrimtop\llap{\copy\ul@box}\vss}%
\vbox to 0\p@{\vskip-\vtrimtop\rlap{\hskip\textwidth\hskip2\htrim\copy\ur@box}\vss}%
\vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\llap{\copy\ll@box}\vss}%
\vbox to 0\p@{\vskip\textheight\vskip\vtrimbot\rlap{\hskip\textwidth\hskip2\htrim\copy\lr@box}\vss}%
\hskip\htrim}
\def\make@cornermarks{%
\sbox\ul@box{\rule{18\p@}{.25\p@}\hskip8\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}%
\sbox\ur@box{\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}%
\sbox\ll@box{\rule{18\p@}{.25\p@}\hskip8\p@\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}}%
\sbox\lr@box{\lower34\p@\hbox to.25\p@{\vbox to26\p@{\noindent\rule{.25\p@}{18\p@}}}\hskip8\p@\rule{18\p@}{.25\p@}}}
\def\even@head{%
\top@cornermarks
\@the@page
{%\RunningHeadFont
\hfil
{%\MakeUppercase
\leftmark
}
}}%\hfil
\def\odd@head{%
\top@cornermarks
{%\RunningHeadFont
{%\MakeUppercase
\rightmark
}
}
\hfil
\@the@page
}
\def\@the@page{{\thepage}}
%\def\@the@page{{\PageNumFont\thepage}}
\def\ps@empty{%
\let\@mkboth\@gobbletwo
\let\@oddhead\top@cornermarks
\let\@evenhead\top@cornermarks
\let\@oddfoot\@empty
\let\@evenfoot\@empty
}
\def\ps@folio{%
\let\@mkboth\@gobbletwo
\let\@oddhead\top@cornermarks
\def\@oddfoot{%
\parindent\z@
\baselineskip7\p@
\hbox{%
\textwidth\@ciprulewidth
\vbox{%
\if@cip\rule{\@ciprulewidth}{.25pt}\par
\hbox{\vbox{\noindent\copy\@cipboxa\par\noindent\copy\@cipboxb}}\fi}}
\hfill\@the@page}
\let\@evenhead\odd@head
\let\@evenfoot\@oddfoot
}
\def\ps@headings{%
\let\@mkboth\@gobbletwo
\let\@oddfoot\@empty
\let\@evenfoot\@empty
\let\@evenhead\even@head
\let\@oddhead\odd@head
\def\chaptermark##1{\markboth {\uppercase{##1}}{\uppercase{##1}}}
\def\sectionmark##1{\markright{\uppercase{##1}}}
}
\def\ps@opening{%
\let\@mkboth\@gobbletwo
\make@cornermarks
\let\@oddhead\top@cornermarks
\let\@evenhead\top@cornermarks
\def\@oddfoot{%
\parindent\z@
\baselineskip7\p@
\hbox{%
\textwidth\@ciprulewidth
\vbox{%
\if@cip\rule{\@ciprulewidth}{.25pt}\par
\hbox{\vbox{\noindent\copy\@cipboxa\par\noindent\copy\@cipboxb}}\fi}}
\hfill\@the@page}
\let\@evenfoot\@oddfoot
}
%
% Initializes TeX's marks
%
\mark{{}{}}
%
% \ps@empty and \ps@plain defined in LATEX.TEX
%
\def\ps@plain{%
\let\@mkboth\@gobbletwo
\let\@oddhead\top@cornermarks
\let\@evenhead\top@cornermarks
\def\@oddfoot{\hfil{\footnotesize\rm \thepage}\hfil}%
\def\@evenfoot{\hfil{\footnotesize\rm \thepage}\hfil}%
}
%
%
% Definition of 'headings' page style
%
%\def\ps@headings{\let\@mkboth\markboth
% \def\@oddhead{\footnotesize\normalfont \rightmark \hfill \thepage}
% \def\@oddfoot{}
% \def\@evenhead{\footnotesize\normalfont \thepage \hfill \leftmark}
% \def\@evenfoot{}
% \def\chaptermark##1{\markboth {\uppercase{##1}}{\uppercase{##1}}}
% \def\sectionmark##1{\markright{\uppercase{##1}}}
%}
%
% Definition of 'myheadings' page style.
%
\def\ps@myheadings{\let\@mkboth\@gobbletwo
\def\@oddhead{\footnotesize\normalfont \rightmark \hfill \thepage}
\def\@oddfoot{}
\def\@evenhead{\footnotesize\normalfont\thepage \hfill \leftmark}
\def\@evenfoot{}
\def\chaptermark##1{}
% \def\sectionmark##1{}
% \def\subsectionmark##1{}
}
% ****************************************
% * MISCELLANEOUS *
% ****************************************
%
% DATE
%
\def\today{\ifcase\month\or
January\or February\or March\or April\or May\or June\or
July\or August\or September\or October\or November\or December\fi
\space\number\day, \number\year}
%
% EQUATION and EQNARRAY -- put here because it must follow \chapter definition
%
\@addtoreset{equation}{chapter}
\def\theequation{\thechapter.\arabic{equation}}
% \jot = 3pt % Extra space added between lines of an eqnarray environment
% ****************************************
% * CUP SPECIALS *
% ****************************************
%
% cleardoublepage with empty page
%
\def\cleardoublepage{\clearpage
\if@twoside \ifodd \c@page
\else \thispagestyle{empty}
\hbox{}\newpage
\if@twocolumn \thispagestyle{plain}\hbox{}\newpage
\fi
\fi
\fi}
%
% redefinition of sections to get en space after chapter number
%
\def\@sect#1#2#3#4#5#6[#7]#8{%
\ifnum #2>\c@secnumdepth
\def\@svsec{}%
\else
\refstepcounter{#1}
% \ifnum #2>1
% \edef\@svsec{{\normalfont \csname the#1\endcsname\hskip 0.5em}}
% \else
\edef\@svsec{\csname the#1\endcsname\hskip 0.5em}
% \fi
\fi
\@tempskipa #5\relax
\ifdim \@tempskipa>\z@
\begingroup #6\relax
\@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
\endgroup
\csname #1mark\endcsname{#7}%
\addcontentsline{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi #7}
\else
\def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
{#7}\addcontentsline{toc}{#1}{\ifnum #2>\c@secnumdepth \else
\protect\numberline{\csname the#1\endcsname}\fi#7}}%
\fi
\@xsect{#5}}
%
% redefinition of \hline to get extra space
%
\def\hline{\noalign{\ifnum0=`}\fi \vskip 6pt
\hrule \@height \arrayrulewidth \vskip 6pt
\futurelet \@tempa\@xhline}
\def\@xhline{\ifx\@tempa\hline \vskip -12pt
\vskip \doublerulesep \fi \ifnum0=`{\fi}}
%
% redefinition of tabular to get rid of vertical lines in tables
%
\def\tabular{\def\@halignto{}
\def\hline{\noalign{\ifnum0=`}\fi
\vskip 3pt
\hrule \@height \arrayrulewidth
\vskip 3pt
\futurelet \@tempa\@xhline}
\def\@xhline{\ifx\@tempa\hline
\vskip -6pt
\vskip \doublerulesep
\fi
\ifnum0=`{\fi}}
\def\@arrayrule{\@addtopreamble{\hskip -.5\arrayrulewidth
% \vrule \@width \arrayrulewidth
\hskip .5\arrayrulewidth}}
\@tabular
}
\DeclareOldFontCommand{\rm}{\normalfont\rmfamily}{\mathrm}
\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf}
\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt}
\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf}
\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit}
\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl}
\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc}
\DeclareRobustCommand{\cal}{\@fontswitch{\relax}{\mathcal}}
\DeclareRobustCommand{\mit}{\@fontswitch{\relax}{\mathnormal}}
\RequirePackage{latexsym}
% ****************************************
% * INITIALIZATION *
% ****************************************
%
% Default initializations
\ps@headings
\pagenumbering{arabic}
\onecolumn
\frenchspacing
\flushbottom
% end of chapman.sty
HSAUR3/vignettes/Ch_principal_components_analysis.Rnw
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Principal Component Analysis}
\setcounter{chapter}{18}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
\chapter[Principal Component Analysis]{Principal Component Analysis: The Olympic Heptathlon \label{PCA}}
\section{Introduction}
\section{Principal Component Analysis}
\section{Analysis Using \R{}}
To begin, it will help to score all seven events in the
same direction, so that `large' values are `good'. We will
recode the running events to achieve this:
<>=
data("heptathlon", package = "HSAUR3")
heptathlon$hurdles <- max(heptathlon$hurdles) -
heptathlon$hurdles
heptathlon$run200m <- max(heptathlon$run200m) -
heptathlon$run200m
heptathlon$run800m <- max(heptathlon$run800m) -
heptathlon$run800m
@
\begin{figure}
\begin{center}
<>=
score <- which(colnames(heptathlon) == "score")
plot(heptathlon[,-score])
@
\caption{Scatterplot matrix for the \Robject{heptathlon} data (all countries). \label{PCA-heptathlon-scatter}}
\end{center}
\end{figure}
Figure~\ref{PCA-heptathlon-scatter}
shows a scatterplot matrix of the results from all $25$ competitors for the seven events. Most of the scatterplots in
the diagram suggest that there is a positive relationship between the results for each pair of events. The exceptions are the
plots involving the javelin event, which give little evidence of any relationship between the result for this event and the
results from the other six events; we will suggest possible reasons for this below, but first we will examine the numerical
values of the correlations between pairs of events by applying the
\Rcmd{cor} function
<>=
w <- options("width")
options(width = 65)
@
<>=
round(cor(heptathlon[,-score]), 2)
@
<>=
options(width = w$width)
@
Examination of these numerical values confirms that most pairs of events are positively correlated, some moderately (for
example, high jump and shot) and others relatively highly (for example, high jump and hurdles). We also see that the
correlations involving the javelin event are all close to zero. One possible explanation for the latter finding is
that training for the other six events does not help much in the javelin because it is essentially a `technical' event. An
alternative explanation is found if we examine the scatterplot matrix in Figure~\ref{PCA-heptathlon-scatter}
a little more closely.
It is very clear in this diagram that for all events except the
javelin there is an outlier, the competitor from Papua New Guinea
(PNG), who is much poorer than the other athletes at these six
events and who finished last in the competition in terms of
points scored. But surprisingly, in the
scatterplots involving the javelin it is this competitor who again
stands out, this time because she has the third highest value for the event.
It might be sensible to look again at both the correlation matrix and the
scatterplot matrix after removing the competitor from PNG; the relevant \R{}
code is
<>=
heptathlon <- heptathlon[-grep("PNG", rownames(heptathlon)),]
@
Now we look again at the scatterplot and correlation matrices:
\begin{figure}
\begin{center}
<>=
score <- which(colnames(heptathlon) == "score")
plot(heptathlon[,-score])
@
\caption{Scatterplot matrix for the \Robject{heptathlon} data
after removing observations of
the PNG competitor. \label{PCA-heptathlon-scatter2}}
\end{center}
\end{figure}
<>=
w <- options("width")
options(width = 65)
@
<>=
round(cor(heptathlon[,-score]), 2)
@
<>=
options(width = w$width)
@
The correlations change quite substantially and the
new scatterplot matrix in Figure~\ref{PCA-heptathlon-scatter2} does
not point us to any further extreme observations. In the remainder
of this chapter we analyze the \Robject{heptathlon} data with
the observations of the competitor from Papua New Guinea removed.
<>=
w <- options("digits")
options(digits = 4)
@
Because the results for the seven heptathlon events are on different
scales we shall extract the principal components from the correlation matrix.
A principal component analysis of the data can be performed
using the \Rcmd{prcomp} function with the \Rcmd{scale} argument
set to \Robject{TRUE} to ensure the analysis is carried out on the correlation
matrix. The result is a list
containing the coefficients defining each component
(sometimes referred to as \stress{loadings}),
\index{Loadings}
the principal component scores, etc. The required
code is (omitting the \Robject{score} variable)
<>=
heptathlon_pca <- prcomp(heptathlon[, -score], scale = TRUE)
print(heptathlon_pca)
@
The \Rcmd{summary} method can be used for further inspection of the details:
<>=
summary(heptathlon_pca)
@
<>=
options(digits = w$digits)
@
The linear combination for the first principal component is
<>=
a1 <- heptathlon_pca$rotation[,1]
a1
@
We see that the hurdles and long jump events receive the highest weights
while the javelin result is less important.
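For illustration, this ordering can be made explicit by sorting the
absolute values of the loadings in \Robject{a1} computed above:
<<>>=
## order the first-component loadings by absolute size
round(sort(abs(a1), decreasing = TRUE), 2)
@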
For computing the first principal component, the data need to be rescaled
appropriately. The center and the scaling used by \Rcmd{prcomp} internally
can be extracted from the \Robject{heptathlon\_pca} object
via
<>=
center <- heptathlon_pca$center
scale <- heptathlon_pca$scale
@
Now, we can apply the \Rcmd{scale} function to the data and multiply by
the loadings matrix in order to compute the first principal component score
for each competitor
<>=
hm <- as.matrix(heptathlon[,-score])
drop(scale(hm, center = center, scale = scale) %*%
heptathlon_pca$rotation[,1])
@
or, more conveniently, by extracting the first of the precomputed
principal component scores
<>=
predict(heptathlon_pca)[,1]
@
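The scores returned by \Rcmd{predict} are also stored directly in the
\Robject{x} element of \Robject{heptathlon\_pca}, which we will make use
of below; as a quick consistency check (shown for illustration):
<<>>=
## predict() without newdata returns the stored scores
all.equal(predict(heptathlon_pca)[,1],
    heptathlon_pca$x[,1])
@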
\begin{figure}
\begin{center}
<>=
plot(heptathlon_pca)
@
\caption{Barplot of the variances explained by the principal components
(with observations for PNG removed). \label{PCA-heptathlon-pca-plot}}
\end{center}
\end{figure}
<>=
sdev <- heptathlon_pca$sdev
prop12 <- round(sum(sdev[1:2]^2)/sum(sdev^2)*100, 0)
@
The first two components account for $\Sexpr{prop12}\%$ of the variance.
A barplot of each component's variance (see %%'
Figure~\ref{PCA-heptathlon-pca-plot}) shows how the first two components dominate.
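The cumulative proportions of variance can also be computed directly from
the standard deviations stored in \Robject{heptathlon\_pca}; an equivalent
computation, shown for illustration using the vector \Robject{sdev}
extracted above:
<<>>=
## cumulative proportion of total variance per component
round(cumsum(sdev^2) / sum(sdev^2), 3)
@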
A plot of the data in the space of the first two principal components, with the points
labeled by the name of the corresponding competitor, can be produced as shown
with Figure~\ref{PCA-heptathlon-biplot}. In addition, the first two loadings for
the events are given in a second coordinate system, also illustrating the special
role of the javelin event. This graphical representation is known as
a \stress{biplot} \citep{HSAUR:Gabriel1971}. \index{Biplot}
A biplot is a graphical representation of the information in an
$n \times p$ data matrix. The `bi' reflects that the technique
produces a diagram that gives variance and covariance information about
the variables and information about generalized distances
between individuals. The coordinates used to produce the biplot can all
be obtained directly from the principal components analysis of the
covariance matrix of the data, and so the plots can be viewed as an
alternative representation of the results of such an analysis. Full
technical details of the biplot are given in
\cite{HSAUR:Gabriel1981} and in \cite{HSAUR:GowerHand1996}.
Here we simply construct the biplot for the heptathlon data (without PNG);
the result is shown in Figure~\ref{PCA-heptathlon-biplot}.
The plot clearly shows that the winner of the gold medal, Jackie Joyner-Kersee,
accumulates the majority of her points from the three events
long jump, hurdles, and 200m.
\begin{figure}
\begin{center}
<>=
biplot(heptathlon_pca, col = c("gray", "black"))
@
<>=
tmp <- heptathlon[, -score]
rownames(tmp) <- abbreviate(gsub(" \\(.*", "", rownames(tmp)))
biplot(prcomp(tmp, scale = TRUE), col = c("black", "lightgray"), xlim =
c(-0.5, 0.7))
@
\caption{Biplot of the (scaled) first two principal components
(with observations for PNG removed).
\label{PCA-heptathlon-biplot}}
\end{center}
\end{figure}
The correlation between the score
given to each athlete by the standard scoring system used for
the heptathlon and the first principal component score can be
found from
<>=
cor(heptathlon$score, heptathlon_pca$x[,1])
@
This implies that the first principal component is in good agreement with
the score assigned to the athletes by official Olympic rules; a scatterplot
of the official score and the first principal component is given in
Figure~\ref{PCA-heptathlonscore}.
\begin{figure}
\begin{center}
<>=
plot(heptathlon$score, heptathlon_pca$x[,1])
@
\caption{Scatterplot of the score assigned to each athlete in 1988 and the
first principal component. \label{PCA-heptathlonscore}}
\end{center}
\end{figure}
%%\bibliographystyle{LaTeXBibTeX/refstyle}
%%\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_recursive_partitioning.Rnw 0000644 0001762 0000144 00000055141 14416236370 021143 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Recursive Partitioning}
%%\VignetteDepends{vcd,lattice,randomForest,partykit}
\setcounter{chapter}{8}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
library("vcd")
library("lattice")
library("randomForest")
library("partykit")
ltheme <- canonical.theme(color = FALSE) ## in-built B&W theme
ltheme$strip.background$col <- "transparent" ## change strip bg
lattice.options(default.theme = ltheme)
mai <- par("mai")
options(SweaveHooks = list(nullmai = function() { par(mai = rep(0, 4)) },
twomai = function() { par(mai = c(0, mai[2], 0, 0)) },
threemai = function() { par(mai = c(0, mai[2], 0.1, 0)) }))
numbers <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
@
\chapter[Recursive Partitioning]{Recursive Partitioning:
Predicting Body Fat, Glaucoma Diagnosis, and Happiness in China \label{RP}}
\section{Introduction}
\section{Recursive Partitioning}
\section{Analysis Using \R{}}
\subsection{Predicting Body Fat Content}
The \Rcmd{rpart} function from \Rpackage{rpart} can be used to grow a
regression tree. The response variable and the covariates are defined by a
model formula in the same way as for \Rcmd{lm}, say. By default, a large
initial tree is grown; here we require at least ten observations in a node
before a potential binary split is considered:
<>=
library("rpart")
data("bodyfat", package = "TH.data")
bodyfat_rpart <- rpart(DEXfat ~ age + waistcirc + hipcirc +
elbowbreadth + kneebreadth, data = bodyfat,
control = rpart.control(minsplit = 10))
@
A \Rcmd{print} method for \Rclass{rpart} objects is available; however, a
graphical representation \citep[here utilizing functionality offered
by package \Rpackage{partykit},][]{PKG:partykit} shown in Figure~\ref{RP-bodyfat-plot} is more
convenient. Observations that satisfy the condition shown for a node
go to the left branch and
observations that don't go to the right branch. %'
As expected, higher values for waist and hip circumferences and wider knees
correspond to higher values of body fat content. The rightmost terminal node
consists of only three rather extreme observations.
\begin{figure}
\begin{center}
<>=
library("partykit")
plot(as.party(bodyfat_rpart), tp_args = list(id = FALSE))
@
\caption{Initial tree for the body fat data with the distribution
of body fat in terminal nodes visualized via boxplots. \label{RP-bodyfat-plot}}
\end{center}
\end{figure}
\index{Cross-validation}
To determine if the tree is appropriate or if some of the branches need to
be subjected to pruning we can use the \Robject{cptable}
element of the \Rclass{rpart} object:
<>=
print(bodyfat_rpart$cptable)
opt <- which.min(bodyfat_rpart$cptable[,"xerror"])
@
The \Robject{xerror} column contains estimates of cross-validated prediction error
for different numbers of splits (\Robject{nsplit}). The best tree has
\Sexpr{numbers[bodyfat_rpart$cptable[opt, "nsplit"] + 1]}
splits. Now we can prune back the large initial tree using
<>=
cp <- bodyfat_rpart$cptable[opt, "CP"]
bodyfat_prune <- prune(bodyfat_rpart, cp = cp)
@
The result is shown in Figure~\ref{RP-bodyfat-pruneplot}. Note that
the inner nodes three and six have been removed from the tree. Still, the
rightmost terminal node might give very unreliable extreme predictions.
\begin{figure}
\begin{center}
<>=
plot(as.party(bodyfat_prune), tp_args = list(id = FALSE))
@
\caption{Pruned regression tree for body fat data.
\label{RP-bodyfat-pruneplot}}
\end{center}
\end{figure}
Given this model, one can predict the (unknown, in real circumstances) body fat
content based on the covariate measurements. Here, using the known values of
the response variable, we compare the model predictions with the actually measured
body fat as shown in Figure~\ref{RP-bodyfat-predict}. The three observations
with large body fat measurements in the rightmost terminal node can be identified
easily.
\begin{figure}
\begin{center}
<>=
DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat)
xlim <- range(bodyfat$DEXfat)
plot(DEXfat_pred ~ DEXfat, data = bodyfat, xlab = "Observed",
ylab = "Predicted", ylim = xlim, xlim = xlim)
abline(a = 0, b = 1)
@
\caption{Observed and predicted DXA measurements.
\label{RP-bodyfat-predict}}
\end{center}
\end{figure}
\subsection{Glaucoma Diagnosis}
<>=
set.seed(290875)
@
<>=
data("GlaucomaM", package = "TH.data")
glaucoma_rpart <- rpart(Class ~ ., data = GlaucomaM,
control = rpart.control(xval = 100))
glaucoma_rpart$cptable
opt <- which.min(glaucoma_rpart$cptable[,"xerror"])
cp <- glaucoma_rpart$cptable[opt, "CP"]
glaucoma_prune <- prune(glaucoma_rpart, cp = cp)
@
\setkeys{Gin}{width = 0.65\textwidth}
\begin{figure}
\begin{center}
<>=
plot(as.party(glaucoma_prune), tp_args = list(id = FALSE))
@
\caption{Pruned classification tree of the glaucoma data with class
distribution in the leaves. \label{RP:gl}}
\end{center}
\end{figure}
\setkeys{Gin}{width=0.95\textwidth}
\index{Classification tree!choice of tree size}
\index{Tree size}
As we discussed earlier, the choice of the appropriately sized tree is not a
trivial problem. For the glaucoma data, the above choice of three leaves is
very unstable across multiple runs of cross-validation. As an illustration
of this problem we repeat the same analysis as above and record
the optimal number of splits suggested by the cross-validation runs.
<>=
nsplitopt <- vector(mode = "integer", length = 25)
for (i in 1:length(nsplitopt)) {
cp <- rpart(Class ~ ., data = GlaucomaM)$cptable
nsplitopt[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
@
\newpage
<>=
table(nsplitopt)
@
Although for \Sexpr{sum(nsplitopt == 1)} runs of cross-validation a simple
tree with one split only is suggested, larger trees would have been favored
in \Sexpr{sum(nsplitopt > 1)} of the cases. This short analysis shows that
we should not trust the tree in Figure~\ref{RP:gl} too much.
\index{Bagging}
\index{Bootstrap approach!glaucoma diagnosis data}
One way out of this dilemma is the aggregation of multiple trees via bagging.
In \R{}, the bagging idea can be implemented by three or four lines
of code. Case count or weight vectors representing the bootstrap samples can be drawn from the
multinomial distribution with parameters $n$ and $p_1 = 1/n, \dots, p_n =
1/n$ via the \Rcmd{rmultinom} function. For each weight vector, one large tree is
constructed without pruning and the \Rclass{rpart} objects are stored in a
list, here called \Robject{trees}:
<>=
trees <- vector(mode = "list", length = 25)
n <- nrow(GlaucomaM)
bootsamples <- rmultinom(length(trees), n, rep(1, n)/n)
mod <- rpart(Class ~ ., data = GlaucomaM,
control = rpart.control(xval = 0))
for (i in 1:length(trees))
trees[[i]] <- update(mod, weights = bootsamples[,i])
@
The \Rcmd{update} function re-evaluates the call that created \Robject{mod},
but with the weights altered, i.e., it fits a tree to the bootstrap sample
specified by the weights.
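Equivalently, a single tree of the ensemble could have been grown by a
direct call to \Rcmd{rpart} with the corresponding weight vector; an
illustrative sketch for the first bootstrap sample:
<<>>=
## equivalent to update(mod, weights = bootsamples[,1])
tree1 <- rpart(Class ~ ., data = GlaucomaM,
    weights = bootsamples[,1],
    control = rpart.control(xval = 0))
@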
It is interesting to have a look at the structures of the multiple trees.
For example, the variable selected for splitting in the root of the tree is
not unique as can be seen by
<>=
table(sapply(trees, function(x) as.character(x$frame$var[1])))
@
Although \Robject{varg} is selected most of the time, other variables such
as \Robject{vari} occur as well -- a further indication that the tree in
Figure~\ref{RP:gl} is
questionable and that hard decisions are not appropriate for the glaucoma
data.
In order to make use of the ensemble of trees in the list \Robject{trees}
we estimate the conditional probability of suffering from glaucoma given the
covariates for each observation in the original data set by
<>=
classprob <- matrix(0, nrow = n, ncol = length(trees))
for (i in 1:length(trees)) {
classprob[,i] <- predict(trees[[i]],
newdata = GlaucomaM)[,1]
classprob[bootsamples[,i] > 0,i] <- NA
}
@
Thus, for each observation we get \Sexpr{length(trees)} estimates. However,
each observation has been used for growing any particular tree with
probability $0.632$ and thus was not used with probability $0.368$.
Consequently, the estimate from a tree where an observation was not used for
growing is better for judging the quality of the predictions and we label
the other estimates with \Robject{NA}.
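The value $0.632$ arises as the probability that a given observation
appears at least once in a bootstrap sample of size $n$, i.e.,
$1 - (1 - 1/n)^n \approx 1 - \exp(-1) \approx 0.632$; a quick numerical
check:
<<>>=
## probability of appearing in a bootstrap sample of size n
round(1 - (1 - 1/n)^n, 3)
@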
Now, we can average the estimates and we vote for glaucoma when the average
of the estimates of the conditional glaucoma probability exceeds $0.5$. The
comparison between the observed and the predicted classes does not suffer from
overfitting since the predictions are computed from those trees for which
each single observation was \stress{not} used for growing.
<>=
avg <- rowMeans(classprob, na.rm = TRUE)
predictions <- factor(ifelse(avg > 0.5, "glaucoma",
"normal"))
predtab <- table(predictions, GlaucomaM$Class)
predtab
@
Thus, an honest estimate of the probability of
a glaucoma prediction when the patient is actually suffering from glaucoma is
<>=
round(predtab[1,1] / colSums(predtab)[1] * 100)
@
percent. For
<>=
round(predtab[2,2] / colSums(predtab)[2] * 100)
@
percent of normal eyes, the ensemble does not predict glaucomatous
damage.
\begin{figure}
\begin{center}
<>=
library("lattice")
gdata <- data.frame(avg = rep(avg, 2),
class = rep(as.numeric(GlaucomaM$Class), 2),
obs = c(GlaucomaM[["varg"]], GlaucomaM[["vari"]]),
var = factor(c(rep("varg", nrow(GlaucomaM)),
rep("vari", nrow(GlaucomaM)))))
panelf <- function(x, y) {
panel.xyplot(x, y, pch = gdata$class)
panel.abline(h = 0.5, lty = 2)
}
print(xyplot(avg ~ obs | var, data = gdata,
panel = panelf,
scales = "free", xlab = "",
ylab = "Estimated Class Probability Glaucoma"))
@
\caption{Estimated class probabilities depending on two important
variables. The $0.5$ cut-off for the estimated glaucoma probability
is depicted as a horizontal line. Glaucomatous eyes are plotted as
circles and normal eyes as triangles. \label{RP:glplot}}
\end{center}
\end{figure}
\index{Random forest}
The bagging procedure is a special case of a more general approach
called \stress{random forest} \citep{HSAUR:Breiman2001b}. The package
\Rpackage{randomForest} \citep{PKG:randomForest}
can be used to compute such ensembles via
<>=
library("randomForest")
rf <- randomForest(Class ~ ., data = GlaucomaM)
@
and we obtain out-of-bag estimates for the prediction error via
<>=
table(predict(rf), GlaucomaM$Class)
@
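Random forests also provide variable importance measures; for
illustration, the most important variables according to the default
Gini-based importance can be inspected via
<<>>=
## column 1 holds the mean decrease in Gini impurity
head(round(sort(importance(rf)[,1],
    decreasing = TRUE), 2))
@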
\subsection{Trees Revisited}
For the body fat data, such a \stress{conditional inference tree}
can be computed using
the \Rcmd{ctree} function
\index{Conditional tree}
<>=
bodyfat_ctree <- ctree(DEXfat ~ age + waistcirc + hipcirc +
elbowbreadth + kneebreadth, data = bodyfat)
@
This tree doesn't require a pruning procedure because
an internal stopping criterion based on formal statistical
tests prevents the procedure from overfitting the data.
The tree structure is shown in Figure~\ref{RP-bodyfat-ctree-plot}. Although
the structure of this tree and the tree depicted in Figure~\ref{RP-bodyfat-pruneplot}
are rather different, the corresponding predictions don't vary too much.
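This can be checked informally, for example by correlating the predictions
of the pruned \Rcmd{rpart} tree computed above with those of the
conditional inference tree (an illustrative check):
<<>>=
## predictions of the conditional inference tree
DEXfat_pred_ctree <- predict(bodyfat_ctree,
    newdata = bodyfat)
cor(DEXfat_pred, DEXfat_pred_ctree)
@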
\begin{figure}
\begin{center}
<>=
plot(bodyfat_ctree, tp_args = list(id = FALSE))
@
\caption{Conditional inference tree with the distribution
of body fat content shown for each terminal leaf.
\label{RP-bodyfat-ctree-plot}}
\end{center}
\end{figure}
Very much the same code is needed to grow a tree on the glaucoma data:
<>=
glaucoma_ctree <- ctree(Class ~ ., data = GlaucomaM)
@
and a graphical representation is depicted in Figure~\ref{RP-glaucoma-ctree-plot} showing
both the cutpoints and the $p$-values of the associated independence tests for each node.
The first split is performed using a cutpoint defined with respect to the
volume of the optic nerve above some reference plane, but in the inferior part of
the eye only (\Robject{vari}).
\begin{figure}
\begin{center}
<>=
plot(glaucoma_ctree, tp_args = list(id = FALSE))
@
\caption{Conditional inference tree with the distribution
of glaucomatous eyes shown for each terminal leaf.
\label{RP-glaucoma-ctree-plot}}
\end{center}
\end{figure}
\subsection{Happiness in China}
\index{Chinese Health and Family Life Survey}
A conditional inference tree is a simple alternative to the proportional
odds model for the regression analysis of the happiness variable from the
Chinese Health and Family Life Survey. In each node, a linear association
test, introduced in Section~\ref{CI:Lanza}, that takes the ordering of the
happiness levels into account is applied to select variables and
split-points. Before we fit the tree with the \Rcmd{ctree} function, we
recode the levels of the happiness variable so that the labels can be
plotted in the restricted page space:
\newpage
<>=
levels(CHFLS$R_happy)
levels(CHFLS$R_happy) <- LETTERS[1:4]
CHFLS_ctree <- ctree(R_happy ~ ., data = CHFLS)
@
The resulting tree is depicted in Figure~\ref{RP-CHFLS-ctree-plot} and very
nicely backs up the results obtained from the proportional odds model in
Chapter~\ref{GLM}. The distribution of self-reported happiness is shifted
from very unhappy to very happy with increasing values of self-reported
health, i.e., women who reported excellent health (mind the $>$ sign in the right
label of the root split!) were at least somewhat happy with only a few exceptions.
Women with poor or not good health reported being not too happy much more often. There
seems to be further differentiation with respect to geography and income, but the
differences in the distributions depicted in the terminal leaves are negligible.
\begin{figure}
\begin{center}
<>=
plot(CHFLS_ctree, ep_args = list(justmin = 10),
tp_args = list(id = FALSE))
@
\caption{Conditional inference tree with the distribution
of self-reported happiness shown for each terminal leaf. The levels
of happiness have been abbreviated (A: very unhappy, B: not too happy,
C: somewhat happy, D: very happy). The \Rcmd{justmin} argument ensures that
split descriptions longer than $10$ characters are displayed over two lines.
\label{RP-CHFLS-ctree-plot}}
\end{center}
\end{figure}
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_simple_inference.Rnw 0000644 0001762 0000144 00000052404 14416236370 017653 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Simple Inference}
%%\VignetteDepends{vcd}
\setcounter{chapter}{2}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
\chapter[Simple Inference]{Simple Inference: Guessing Lengths,
Wave Energy, Water Hardness, Piston Rings,
and Rearrests of Juveniles \label{SI}}
\section{Introduction}
<>=
library("vcd")
if (!interactive()) {
print.htest <- function (x, digits = 4, quote = TRUE, prefix = "", ...)
{
cat("\n")
cat(strwrap(x$method, prefix = "\t"), sep = "\n")
cat("\n")
cat("data: ", x$data.name, "\n")
out <- character()
if (!is.null(x$statistic))
out <- c(out, paste(names(x$statistic), "=", format(round(x$statistic,
4))))
if (!is.null(x$parameter))
out <- c(out, paste(names(x$parameter), "=", format(round(x$parameter,
3))))
if (!is.null(x$p.value)) {
fp <- format.pval(x$p.value, digits = digits)
out <- c(out, paste("p-value", if (substr(fp, 1, 1) ==
"<") fp else paste("=", fp)))
}
cat(strwrap(paste(out, collapse = ", ")), sep = "\n")
if (!is.null(x$conf.int)) {
cat(format(100 * attr(x$conf.int, "conf.level")), "percent confidence interval:\n",
format(c(x$conf.int[1], x$conf.int[2])), "\n")
}
if (!is.null(x$estimate)) {
cat("sample estimates:\n")
print(x$estimate, ...)
}
cat("\n")
invisible(x)
}
}
@
\section{Statistical Tests}
\section{Analysis Using \R{}}
\subsection{Estimating the Width of a Room}
The data shown in Table~\ref{SI-rw-tab} are available as the \Robject{roomwidth}
\Rclass{data.frame} from the \Rpackage{HSAUR3} package and can be loaded by
using
<>=
data("roomwidth", package = "HSAUR3")
@
If we convert the estimates of the room width in meters into feet by
multiplying each by $3.28$ then we would like to test the hypothesis that the mean
of the population of `metre' estimates is equal to the mean %'
of the population of `feet' estimates. We shall do this initially %'
by using an independent samples $t$-test, but first it is good
practice to check, informally at least, the normality and equal
variance assumptions. Here we can use a combination of numerical
and graphical approaches. The first step is to define a conversion
factor for turning the meter estimates into feet
<>=
convert <- ifelse(roomwidth$unit == "feet", 1, 3.28)
@
which equals one for all feet measurements and $3.28$ for the measurements
in meters. Now, we get the usual summary statistics and standard deviations
of each set of estimates using
<>=
tapply(roomwidth$width * convert, roomwidth$unit, summary)
tapply(roomwidth$width * convert, roomwidth$unit, sd)
@
where \Rcmd{tapply} applies \Rcmd{summary}, or \Rcmd{sd}, to the converted
widths for both groups of measurements given by \Robject{roomwidth\$unit}.
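The group-wise standard deviations could, for example, also be obtained by
splitting the converted widths manually (an equivalent formulation, shown
for illustration):
<<>>=
## split the converted widths by unit, then apply sd
sapply(split(roomwidth$width * convert, roomwidth$unit), sd)
@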
A boxplot of each set of estimates
might be useful and is depicted in Figure~\ref{SI-rw-bxp}. The
\Rcmd{layout} function (line 1 in Figure~\ref{SI-rw-bxp}) divides the
plotting area into three parts. The \Rcmd{boxplot} function produces a boxplot
in the upper part and the two \Rcmd{qqnorm} statements in lines 7 and 10
set up the normal probability plots that can be
used to assess the normality assumption of the $t$-test.
\index{Normal probability plot}
\numberSinput
\begin{figure}
\begin{center}
<>=
layout(matrix(c(1,2,1,3), nrow = 2, ncol = 2, byrow = FALSE))
boxplot(I(width * convert) ~ unit, data = roomwidth,
ylab = "Estimated width (feet)",
varwidth = TRUE, names = c("Estimates in feet",
"Estimates in meters (converted to feet)"))
feet <- roomwidth$unit == "feet"
qqnorm(roomwidth$width[feet],
ylab = "Estimated width (feet)")
qqline(roomwidth$width[feet])
qqnorm(roomwidth$width[!feet],
ylab = "Estimated width (meters)")
qqline(roomwidth$width[!feet])
@
\caption{Boxplots of estimates of room width in feet and meters (after
conversion to feet) and normal probability plots of estimates of room width
made in feet and in meters. \label{SI-rw-bxp}}
\end{center}
\end{figure}
\rawSinput
The boxplots indicate that both sets of estimates contain a number
of outliers and also that the estimates made in meters are skewed and more
variable than those made in feet, a point underlined by the numerical
summary statistics above. Both normal probability plots depart from
linearity, suggesting that the distributions of both sets of estimates
are not normal. The presence of outliers, the apparently different
variances and the evidence of non-normality all suggest caution
in applying the $t$-test, but for the moment we shall apply the
usual version of the test using the \Rcmd{t.test} function in \R{}.
The two-sample test problem is specified by a \Rclass{formula}, here by
<>=
I(width * convert) ~ unit
@
where the response, \Robject{width}, on
the left-hand side needs to be converted first and, because the star has a
special meaning in formulae, as will be explained in \Sexpr{ch("ANOVA")},
the conversion needs to be wrapped in \texttt{I()}. The factor \Robject{unit} on
the right-hand side specifies the two groups to be compared.
<>=
tt <- t.test(I(width * convert) ~ unit, data = roomwidth,
var.equal = TRUE)
@
\renewcommand{\nextcaption}{\R{} output of the independent samples $t$-test for the
\Robject{roomwidth} data. \label{SI-roomwidth-tt-fig}}
\SchunkLabel
<>=
t.test(I(width * convert) ~ unit, data = roomwidth,
var.equal = TRUE)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the independent samples Welch test for the
\Robject{roomwidth} data. \label{SI-roomwidth-welch-fig}}
\SchunkLabel
<>=
t.test(I(width * convert) ~ unit, data = roomwidth,
var.equal = FALSE)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the Wilcoxon rank sum test for the
\Robject{roomwidth} data. \label{SI-roomwidth-wilcox-fig}}
\SchunkLabel
<>=
wilcox.test(I(width * convert) ~ unit, data = roomwidth,
conf.int = TRUE)
@
\SchunkRaw
<>=
pwt <- round(wilcox.test(I(width * convert) ~ unit, data =
roomwidth)$p.value, 3)
@
\subsection{Wave Energy Device Mooring}
The data from Table~\ref{SI-m-tab} are available as \Rclass{data.frame} \Robject{waves}
<>=
data("waves", package = "HSAUR3")
@
and requires the use of a matched pairs
$t$-test to answer the question of interest.
This test assumes that the differences between the matched
observations have a normal distribution, so we begin by checking
this assumption with a boxplot and a normal probability
plot -- see Figure~\ref{SI-w-bxp}.
\begin{figure}
\begin{center}
<>=
mooringdiff <- waves$method1 - waves$method2
layout(matrix(1:2, ncol = 2))
boxplot(mooringdiff, ylab = "Differences (Newton meters)",
main = "Boxplot")
abline(h = 0, lty = 2)
qqnorm(mooringdiff, ylab = "Differences (Newton meters)")
qqline(mooringdiff)
@
\caption{Boxplot and normal probability plot for differences between the two
mooring methods. \label{SI-w-bxp}}
\end{center}
\end{figure}
\renewcommand{\nextcaption}{\R{} output of the paired $t$-test for the
\Robject{waves} data. \label{SI-waves-tt-fig}}
\SchunkLabel
<>=
t.test(mooringdiff)
@
\SchunkRaw
<>=
pwt <- round(wilcox.test(mooringdiff)$p.value, 3)
@
\renewcommand{\nextcaption}{\R{} output of the Wilcoxon signed rank test for the
\Robject{waves} data. \label{SI-waves-ws-fig}}
\SchunkLabel
<>=
wilcox.test(mooringdiff)
@
\SchunkRaw
\subsection{Mortality and Water Hardness}
There is a wide range of analyses we could apply to the
data in Table~\ref{SI-w-tab} available from
<>=
data("water", package = "HSAUR3")
@
But to begin we will construct a scatterplot
of the data enhanced somewhat by the addition of information
about the marginal distributions of water hardness (calcium concentration)
and mortality, and by adding the estimated linear regression
fit (see \Sexpr{ch("MLR")}) for mortality on hardness.
The plot and the required \R{} code are given along with
Figure~\ref{SI-water-sp}. In line 1 of Figure~\ref{SI-water-sp}, we divide
the plotting region into four areas of different size. The scatterplot (line
3) uses a plotting symbol depending on the location of the city (by
the \Rarg{pch} argument); a legend for the location is added in line 6.
We add a least squares fit (see \Sexpr{ch("MLR")})
to the scatterplot and,
finally, depict the marginal distributions by means of a boxplot and a
histogram. The scatterplot
shows that as hardness increases mortality decreases, and the
histogram for the water hardness shows it has a rather skewed distribution.
\numberSinput
\begin{figure}
\begin{center}
<>=
nf <- layout(matrix(c(2, 0, 1, 3), 2, 2, byrow = TRUE),
c(2, 1), c(1, 2), TRUE)
psymb <- as.numeric(water$location)
plot(mortality ~ hardness, data = water, pch = psymb)
abline(lm(mortality ~ hardness, data = water))
legend("topright", legend = levels(water$location),
pch = c(1,2), bty = "n")
hist(water$hardness)
boxplot(water$mortality)
@
\caption{Enhanced scatterplot of water hardness and mortality, showing both
the joint and the marginal distributions and, in addition,
the location of the city by different plotting symbols.
\label{SI-water-sp}}
\end{center}
\end{figure}
\rawSinput
\renewcommand{\nextcaption}{\R{} output of Pearson's correlation coefficient %'
for the \Robject{water} data. \label{SI-water-c-fig}}
\SchunkLabel
<>=
cor.test(~ mortality + hardness, data = water)
@
\SchunkRaw
<>=
cr <- round(cor.test(~ mortality + hardness, data = water)$estimate, 3)
@
\subsection{Piston-ring Failures}
<>=
chisqt <- chisq.test(pistonrings)
@
\renewcommand{\nextcaption}{\R{} output of the chi-squared test
for the \Robject{pistonrings} data. \label{SI-pr-x2-fig}}
\SchunkLabel
<>=
data("pistonrings", package = "HSAUR3")
chisq.test(pistonrings)
@
\SchunkRaw
Rather than looking
at the simple differences of observed and expected values for
each cell, which would be unsatisfactory since a difference of
fixed size is clearly more important for smaller samples, it
is preferable to consider a \stress{standardized residual}
\index{Standardized residual, for chi-squared tests}
given
by dividing the difference between the observed and expected values by the square
root of the appropriate expected value. The $X^2$ statistic
for assessing independence is simply the sum, over all the cells
in the table, of the squares of these terms.
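In symbols, writing $O_{ij}$ and $E_{ij}$ for the observed and expected
counts in cell $(i,j)$,
\[
r_{ij} = \frac{O_{ij} - E_{ij}}{\sqrt{E_{ij}}}, \qquad
X^2 = \sum_{i,j} r_{ij}^2.
\]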
We can find these values by extracting the \Robject{residuals} element of the object returned by
the \Rcmd{chisq.test} function
<>=
chisq.test(pistonrings)$residuals
@
A graphical representation of these residuals is called an \stress{association
plot}
\index{Association plot}
and is available via the \Rcmd{assoc} function from package
\Rpackage{vcd} \citep{PKG:vcd} applied to the contingency table of the two
categorical variables. Figure~\ref{SI-assoc-plot} depicts the residuals for
the piston ring data. The deviations from independence are largest for
C1 and C4 compressors in the center and south leg.
\begin{figure}
\begin{center}
<>=
library("vcd")
assoc(pistonrings)
@
\caption{Association plot of the residuals for the \Robject{pistonrings} data.
\label{SI-assoc-plot}}
\end{center}
\end{figure}
\subsection{Rearrests of Juveniles}
The data in Table~\ref{SI-r-tab} are available as a \Rclass{table} object
via
<>=
data("rearrests", package = "HSAUR3")
rearrests
@
<>=
mcs <- round(mcnemar.test(rearrests, correct = FALSE)$statistic, 2)
@
and in \Robject{rearrests} the counts in the four cells refer
to the matched pairs of subjects; for example, in $\Sexpr{rearrests[1,1]}$
pairs both
members of the pair were rearrested. Here we need to use McNemar's %'
test to assess whether rearrest is associated with the type of court
where the juvenile was tried. We can use the \R{} function \Rcmd{mcnemar.test}.
The test statistic shown in Figure~\ref{SI-ra-mc-fig}
is $\Sexpr{mcs}$ with a single degree of freedom -- the
associated $p$-value is extremely small and there is strong evidence
that type of court and the probability of rearrest are related.
It appears that trial at a juvenile court is less likely to result
in rearrest (see Exercise~3.4). %
An exact version of McNemar's test %%'
can be obtained by testing whether the off-diagonal counts $b$ and $c$ are equal using a binomial
test (see Figure~\ref{SI-ra-mcbin-fig}).
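For reference, the uncorrected McNemar statistic is
\[
X^2 = \frac{(b - c)^2}{b + c},
\]
which has an asymptotic $\chi^2$ distribution with one degree of freedom
under the null hypothesis of no association.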
\renewcommand{\nextcaption}{\R{} output of McNemar's test %'
for the \Robject{rearrests} data. \label{SI-ra-mc-fig}}
\SchunkLabel
<>=
mcnemar.test(rearrests, correct = FALSE)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of an exact version of McNemar's test %'
for the \Robject{rearrests} data computed via a
binomial test. \label{SI-ra-mcbin-fig}}
\SchunkLabel
<>=
binom.test(rearrests[2], n = sum(rearrests[c(2,3)]))
@
\SchunkRaw
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_analysing_longitudinal_dataII.Rnw 0000644 0001762 0000144 00000053363 14416236367 022330 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes need
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Analyzing Longitudinal Data II}
%%\VignetteDepends{gee,lme4}
\setcounter{chapter}{13}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
options(digits = 3)
if (!interactive()) {
print.summary.gee <- function (x, digits = NULL, quote = FALSE, prefix = "", ...)
{
if (is.null(digits))
digits <- options()$digits
else options(digits = digits)
cat("...")
cat("\nModel:\n")
cat(" Link: ", x$model$link, "\n")
cat(" Variance to Mean Relation:", x$model$varfun, "\n")
if (!is.null(x$model$M))
cat(" Correlation Structure: ", x$model$corstr, ", M =",
x$model$M, "\n")
else cat(" Correlation Structure: ", x$model$corstr, "\n")
cat("\n...")
nas <- x$nas
if (!is.null(nas) && any(nas))
cat("\n\nCoefficients: (", sum(nas), " not defined because of singularities)\n",
sep = "")
else cat("\n\nCoefficients:\n")
print(x$coefficients, digits = digits)
cat("\nEstimated Scale Parameter: ", format(round(x$scale,
digits)))
cat("\n...\n")
invisible(x)
}
}
@
\chapter[Analyzing Longitudinal Data II]{
Analyzing Longitudinal Data II -- Generalized Estimating Equations and Linear Mixed-Effects Models:
Treating Respiratory Illness and Epileptic Seizures
\label{ALDII}}
\section{Introduction}
\section{Methods for Non-normal Distributions}
\section{Analysis Using \R{}: GEE}
\subsection{Beat the Blues Revisited}
To use the \Rcmd{gee} function, package \Rpackage{gee} \citep{PKG:gee}
has to be installed and attached:
<>=
library("gee")
@
The \Rcmd{gee} function is used in a similar way to the \Rcmd{lme} function
met in \Sexpr{ch("ALDI")}, with the addition of the features of the
\Rcmd{glm} function that specify the appropriate error distribution
for the response and the implied link function, and an argument
to specify the structure of the working correlation matrix. Here
we will fit an independence structure and then an exchangeable
structure. The \R{} code for fitting generalized estimating equations to the
\Robject{BtheB\_long} data (as constructed in \Sexpr{ch("ALDI")}) with
an identity working correlation matrix is as follows (note that the \Rcmd{gee} function
assumes the rows of the \Rclass{data.frame} \Robject{BtheB\_long} to be
ordered with respect to subjects):
<>=
data("BtheB", package = "HSAUR3")
BtheB$subject <- factor(rownames(BtheB))
nobs <- nrow(BtheB)
BtheB_long <- reshape(BtheB, idvar = "subject",
varying = c("bdi.2m", "bdi.3m", "bdi.5m", "bdi.8m"),
direction = "long")
BtheB_long$time <- rep(c(2, 3, 5, 8), rep(nobs, 4))
names(BtheB_long)[names(BtheB_long) == "treatment"] <- "trt"
@
<>=
osub <- order(as.integer(BtheB_long$subject))
BtheB_long <- BtheB_long[osub,]
btb_gee <- gee(bdi ~ bdi.pre + trt + length + drug,
data = BtheB_long, id = subject, family = gaussian,
corstr = "independence")
@
and with an exchangeable correlation matrix:
<>=
btb_gee1 <- gee(bdi ~ bdi.pre + trt + length + drug,
data = BtheB_long, id = subject, family = gaussian,
corstr = "exchangeable")
@
The \Rcmd{summary} method can be used to inspect the fitted models; the
results are shown in Figures~\ref{ALDII-gee-summary} and
\ref{ALDII-gee1-summary}.
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{btb\_gee} model (slightly abbreviated).
\label{ALDII-gee-summary}}
\SchunkLabel
<>=
summary(btb_gee)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{btb\_gee1} model (slightly abbreviated).
\label{ALDII-gee1-summary}}
\SchunkLabel
<>=
summary(btb_gee1)
@
\SchunkRaw
\subsection{Respiratory Illness \label{ALDII:resp}}
The baseline status, i.e., the status for \Robject{month == 0}, will enter
the models as an explanatory variable, and thus we have to rearrange the
\Rclass{data.frame} \Robject{respiratory}
in order to create a new variable \Robject{baseline}:
<>=
data("respiratory", package = "HSAUR3")
resp <- subset(respiratory, month > "0")
resp$baseline <- rep(subset(respiratory, month == "0")$status,
rep(4, 111))
resp$nstat <- as.numeric(resp$status == "good")
resp$month <- resp$month[, drop = TRUE]
@
<>=
names(resp)[names(resp) == "treatment"] <- "trt"
levels(resp$trt)[2] <- "trt"
@
The new variable \Robject{nstat} is simply a dummy coding for a good
respiratory status. Now we can use the data \Robject{resp} to fit a logistic regression model
and GEE models with an independent and an exchangeable correlation structure
as follows.
<>=
resp_glm <- glm(status ~ centre + trt + gender + baseline
+ age, data = resp, family = "binomial")
resp_gee1 <- gee(nstat ~ centre + trt + gender + baseline
+ age, data = resp, family = "binomial", id = subject,
corstr = "independence", scale.fix = TRUE,
scale.value = 1)
resp_gee2 <- gee(nstat ~ centre + trt + gender + baseline
+ age, data = resp, family = "binomial", id = subject,
corstr = "exchangeable", scale.fix = TRUE,
scale.value = 1)
@
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{resp\_glm} model.
\label{ALDII-resp-glm-summary}}
\SchunkLabel
<>=
summary(resp_glm)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{resp\_gee1} model (slightly abbreviated).
\label{ALDII-resp-gee1-summary}}
\SchunkLabel
<>=
summary(resp_gee1)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{resp\_gee2} model (slightly abbreviated).
\label{ALDII-resp-gee2-summary}}
\SchunkLabel
<>=
summary(resp_gee2)
@
\SchunkRaw
The estimated treatment effect taken from the exchangeable
structure GEE model is \Sexpr{round(coef(resp_gee2)["trttrt"], 3)}
which, using the robust standard
errors, has an associated $95\%$ confidence interval
<>=
se <- summary(resp_gee2)$coefficients["trttrt",
"Robust S.E."]
coef(resp_gee2)["trttrt"] +
c(-1, 1) * se * qnorm(0.975)
@
These values reflect effects on the log-odds scale. Interpretation
becomes simpler if we exponentiate the values to get the effects
in terms of odds. This gives a treatment effect of
\Sexpr{round(exp(coef(resp_gee2)["trttrt"]), 3)}
and a $95\%$ confidence interval of
<>=
exp(coef(resp_gee2)["trttrt"] +
c(-1, 1) * se * qnorm(0.975))
@
The odds of achieving a `good' respiratory status with the active treatment are between %'
about twice and seven times the corresponding odds for the placebo.
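As an aside, the impact of the working correlation assumption can be
judged by comparing the naive and the robust (sandwich) standard
errors, both of which are part of the coefficient matrix returned by
the \Rcmd{summary} method. The following lines are a small sketch of
ours, reusing the \Robject{resp\_gee2} object fitted above:
\begin{Schunk}
\begin{Sinput}
R> ### naive vs. robust standard errors, exchangeable GEE
R> cf <- summary(resp_gee2)$coefficients
R> cf[, c("Naive S.E.", "Robust S.E.")]
\end{Sinput}
\end{Schunk}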
\subsection{Epilepsy}
Moving on to the count data in \Robject{epilepsy} from
Table~\ref{ALDII-epilepsy-tab}, we begin by calculating
the means and variances of the number of seizures for all
interactions between treatment and period:
<>=
data("epilepsy", package = "HSAUR3")
itp <- interaction(epilepsy$treatment, epilepsy$period)
tapply(epilepsy$seizure.rate, itp, mean)
tapply(epilepsy$seizure.rate, itp, var)
@
Some of the variances are considerably larger than the corresponding means, which, for
a Poisson variable, suggests that overdispersion may be a problem; see
\Sexpr{ch("GLM")}.
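An informal check is the ratio of variance to mean within each
treatment-period cell, which should be close to one for a Poisson
variable. The following sketch of ours reuses the factor
\Robject{itp} constructed above:
\begin{Schunk}
\begin{Sinput}
R> ### ratios well above one hint at overdispersion
R> round(tapply(epilepsy$seizure.rate, itp, var) /
+        tapply(epilepsy$seizure.rate, itp, mean), 1)
\end{Sinput}
\end{Schunk}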
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, nrow = 1))
ylim <- range(epilepsy$seizure.rate)
placebo <- subset(epilepsy, treatment == "placebo")
progabide <- subset(epilepsy, treatment == "Progabide")
boxplot(seizure.rate ~ period, data = placebo,
ylab = "Number of seizures",
xlab = "Period", ylim = ylim, main = "Placebo")
boxplot(seizure.rate ~ period, data = progabide,
main = "Progabide", ylab = "Number of seizures",
xlab = "Period", ylim = ylim)
@
\caption{Boxplots of numbers of seizures in each two-week
period post randomization for placebo and active treatments.
\label{ALDII-plot1}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, nrow = 1))
ylim <- range(log(epilepsy$seizure.rate + 1))
boxplot(log(seizure.rate + 1) ~ period, data = placebo,
main = "Placebo", ylab = "Log number of seizures",
xlab = "Period", ylim = ylim)
boxplot(log(seizure.rate + 1) ~ period, data = progabide,
main = "Progabide", ylab = "Log number of seizures",
xlab = "Period", ylim = ylim)
@
\caption{Boxplots of log of numbers of seizures in
each two-week period post randomization for placebo and active
treatments. \label{ALDII-plot2}}
\end{center}
\end{figure}
We can now fit a Poisson regression model to the data
assuming independence using the \Rcmd{glm} function.
We also use the GEE approach to fit an
independence structure, followed by an exchangeable structure
using the following \R{} code:
<>=
### offset: log of the length (two weeks) of each observation period
per <- rep(log(2), nrow(epilepsy))
epilepsy$period <- as.numeric(epilepsy$period)
names(epilepsy)[names(epilepsy) == "treatment"] <- "trt"
fm <- seizure.rate ~ base + age + trt + offset(per)
epilepsy_glm <- glm(fm, data = epilepsy, family = "poisson")
epilepsy_gee1 <- gee(fm, data = epilepsy, family = "poisson",
id = subject, corstr = "independence", scale.fix = TRUE,
scale.value = 1)
epilepsy_gee2 <- gee(fm, data = epilepsy, family = "poisson",
id = subject, corstr = "exchangeable", scale.fix = TRUE,
scale.value = 1)
epilepsy_gee3 <- gee(fm, data = epilepsy, family = "poisson",
id = subject, corstr = "exchangeable", scale.fix = FALSE,
scale.value = 1)
@
As usual we inspect the fitted models using the \Rcmd{summary} method; the
results are given in Figures~\ref{ALDII-epilepsy-glm-summary},
\ref{ALDII-epilepsy-gee1-summary}, \ref{ALDII-epilepsy-gee2-summary}, and
\ref{ALDII-epilepsy-gee3-summary}.
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{epilepsy\_glm} model.
\label{ALDII-epilepsy-glm-summary}}
\SchunkLabel
<>=
summary(epilepsy_glm)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{epilepsy\_gee1} model (slightly abbreviated).
\label{ALDII-epilepsy-gee1-summary}}
\SchunkLabel
<>=
summary(epilepsy_gee1)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{epilepsy\_gee2} model (slightly abbreviated).
\label{ALDII-epilepsy-gee2-summary}}
\SchunkLabel
<>=
summary(epilepsy_gee2)
@
\SchunkRaw
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{epilepsy\_gee3} model (slightly abbreviated).
\label{ALDII-epilepsy-gee3-summary}}
\SchunkLabel
<>=
summary(epilepsy_gee3)
@
\SchunkRaw
\section{Analysis Using \R{}: Random Effects}
As an example of using generalized linear mixed models for the analysis
of longitudinal data with a non-normal response, the following
logistic model will be fitted to the respiratory illness data
\begin{eqnarray*}
\text{logit}(\P(\text{status} = \text{good})) & = &
\beta_0 + \beta_1 \text{treatment} + \beta_2 \text{time} + \beta_3 \text{gender} \\%
& & + \beta_4 \text{age} + \beta_5 \text{centre} +
\beta_6 \text{baseline} + u
\end{eqnarray*}
where $u$ is a subject-specific random effect.
The necessary \R{} code for fitting the model using the \Rcmd{glmer}
function from package \Rpackage{lme4} \citep{PKG:lme4,HSAUR:Bates2005}
is:
<>=
library("lme4")
resp_lmer <- glmer(status ~ baseline + month +
trt + gender + age + centre + (1 | subject),
family = binomial(), data = resp)
exp(fixef(resp_lmer))
@
The significance of the effects as estimated by this random effects
model and by the GEE model described in Section~\ref{ALDII:resp}
is generally similar,
but, as expected from our previous discussion, the estimated coefficients
are substantially larger. While the estimated effect of treatment
on a randomly sampled individual, given the set of observed covariates,
is estimated by the marginal model using GEE to increase the log-odds
of being disease free by $\Sexpr{round(coef(resp_gee2)["trttrt"], 3)}$,
the corresponding estimate from
the random effects model is
$\Sexpr{round(fixef(resp_lmer)["trttrt"], 3)}$.
These are not inconsistent
results but reflect the fact that the two models estimate
different parameters. The random effects estimate is conditional
upon the patient's random effect, a quantity that is rarely known
in practice. Were we to examine the log-odds of the average predicted
probabilities with and without treatment (averaged over the random
effects), this would give an estimate comparable to that from
the marginal model.
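The following lines sketch this averaging idea by Monte Carlo
integration over the estimated random effects distribution. The
computation is an illustration of ours, not part of the original
analysis; it fixes all covariates except treatment at their sample
means:
\begin{Schunk}
\begin{Sinput}
R> ### draw random intercepts from their estimated distribution
R> sd_u <- attr(VarCorr(resp_lmer)$subject, "stddev")
R> u <- rnorm(10000, mean = 0, sd = sd_u)
R> ### linear predictors without and with treatment
R> x <- colMeans(model.matrix(resp_lmer))
R> x["trttrt"] <- 0
R> eta0 <- sum(x * fixef(resp_lmer))
R> eta1 <- eta0 + fixef(resp_lmer)["trttrt"]
R> ### average the predicted probabilities over the random effects
R> p0 <- mean(plogis(eta0 + u))
R> p1 <- mean(plogis(eta1 + u))
R> qlogis(p1) - qlogis(p0)    ### marginal log-odds ratio
\end{Sinput}
\end{Schunk}
The resulting marginal log-odds ratio is attenuated towards zero
compared with the conditional coefficient and is therefore closer to
the GEE estimate.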
<>=
su <- summary(resp_lmer)
if (!interactive()) {
summary <- function(x) {
cat("\n...\n")
cat("Fixed effects:\n")
lme4V <- packageDescription("lme4")$Version
if (compareVersion("0.999999-2", lme4V) >= 0) {
printCoefmat(su@coefs)
} else {
printCoefmat(su$coefficients)
}
cat("\n...\n")
}
}
@
\renewcommand{\nextcaption}{\R{} output of the \Rcmd{summary} method
for the \Robject{resp\_lmer} model (abbreviated).
\label{ALDII-resp-lmer-summary}}
\SchunkLabel
<>=
summary(resp_lmer)
@
\SchunkRaw
\clearpage
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/Ch_cluster_analysis.Rnw 0000644 0001762 0000144 00000043550 14416236367 017740 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes needed
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Chapter Cluster Analysis}
%%\VignetteDepends{scatterplot3d,mclust,mvtnorm,lattice}
\setcounter{chapter}{20}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
%% lower png resolution for vignettes
\SweaveOpts{resolution = 100}
<>=
library("mclust")
library("mvtnorm")
mai <- par("mai")
options(SweaveHooks = list(rmai = function() { par(mai = mai * c(1,1,1,2))}))
data("pottery", package = "HSAUR3")
@
\chapter[Cluster Analysis]{Cluster Analysis: Classifying Romano-British Pottery and
Exoplanets \label{CA}}
\section{Introduction}
\section{Cluster Analysis}
\section{Analysis Using \R{}}
\subsection{Classifying Romano-British Pottery}
We start our analysis by computing the dissimilarity matrix
containing the Euclidean distance of the chemical measurements
on all $\Sexpr{nrow(pottery)}$ pots. The resulting
$\Sexpr{nrow(pottery)} \times \Sexpr{nrow(pottery)}$ matrix
can be inspected by an \stress{image plot}, here obtained from
\index{Image plot}
function \Rcmd{levelplot} available in package \Rpackage{lattice}
\citep{PKG:lattice, HSAUR:Sarkar2008}. Such a plot associates
each cell of the dissimilarity matrix with a color or a gray
value. We choose a very dark gray for cells with distance zero (i.e.,
the diagonal elements of the dissimilarity matrix) and pale values
for cells with greater Euclidean distance. Figure~\ref{CA-pottery-distplot}
leads to the impression that there are at least three distinct groups
with small within-cluster differences (the dark rectangles), whereas
much larger distances can be observed for all other cells.
\begin{figure}
\begin{center}
<>=
pottery_dist <- dist(pottery[, colnames(pottery) != "kiln"])
library("lattice")
levelplot(as.matrix(pottery_dist), xlab = "Pot Number",
ylab = "Pot Number")
@
<>=
trellis.par.set(standard.theme(color = FALSE))
plot(levelplot(as.matrix(pottery_dist), xlab = "Pot Number", ylab = "Pot Number"))
@
\caption{Image plot of the dissimilarity matrix of the \Robject{pottery} data.
\label{CA-pottery-distplot}}
\end{center}
\end{figure}
We now construct three series of partitions using single, complete, and
average linkage hierarchical clustering as introduced in Subsections~\ref{CA:HC} and \ref{CA:diss}. The function \Rcmd{hclust}
performs all three procedures based on the
dissimilarity matrix of the data; its \Rcmd{method} argument is used
to specify how the distance between two clusters is assessed. The
corresponding \Rcmd{plot} method draws a dendrogram; the code and
results are given in Figure~\ref{CA-pottery-hclust}. Again,
all three dendrograms lead to the impression that three clusters
fit the data best (although this judgement is very informal).
\begin{figure}
\begin{center}
<>=
pottery_single <- hclust(pottery_dist, method = "single")
pottery_complete <- hclust(pottery_dist, method = "complete")
pottery_average <- hclust(pottery_dist, method = "average")
layout(matrix(1:3, ncol = 3))
plot(pottery_single, main = "Single Linkage",
sub = "", xlab = "")
plot(pottery_complete, main = "Complete Linkage",
sub = "", xlab = "")
plot(pottery_average, main = "Average Linkage",
sub = "", xlab = "")
@
\caption{Hierarchical clustering of \Robject{pottery} data and resulting dendrograms.
\label{CA-pottery-hclust}}
\end{center}
\end{figure}
From the \Robject{pottery\_average} object representing
the average linkage hierarchical clustering, we derive
the three-cluster solution by cutting the dendrogram at a
height of four (which, based on the right display in
Figure~\ref{CA-pottery-hclust},
leads to a partition of the data into three groups). Our interest
is now a comparison with the kiln sites at which the pottery was
found.
<>=
pottery_cluster <- cutree(pottery_average, h = 4)
xtabs(~ pottery_cluster + kiln, data = pottery)
@
The contingency table shows that cluster 1 contains all
pots found at kiln site number one, cluster 2 contains all pots
from kiln sites number two and three, and cluster 3 collects the
ten pots from kiln sites four and five. In fact, the five
kiln sites come from three different regions: site one; sites two and
three; and sites four and five. The clusters therefore correspond to
pots from three different regions.
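This correspondence can be verified directly by mapping the kiln
sites to their regions and cross-tabulating the result against the
cluster labels; the region coding below is a small sketch of ours,
derived from the grouping just described:
\begin{Schunk}
\begin{Sinput}
R> ### sites 1 | 2,3 | 4,5 define the three regions
R> region <- c(1, 2, 2, 3, 3)[as.integer(pottery$kiln)]
R> table(pottery_cluster, region)
\end{Sinput}
\end{Schunk}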
\subsection{Classifying Exoplanets}
\begin{figure}
\begin{center}
<>=
data("planets", package = "HSAUR3")
library("scatterplot3d")
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen + ifelse(planets$eccen == 0,
0.001, 0)),
type = "h", angle = 55, pch = 16,
y.ticklabs = seq(0, 10, by = 2),
y.margin.add = 0.1, scale.y = 0.7,
xlab = "log(mass)", ylab = "log(period)",
zlab = "log(eccen)")
@
\caption{3D scatterplot of the logarithms of the three variables available
for each of the exoplanets. \label{CA-planets-scatter}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
rge <- apply(planets, 2, max) - apply(planets, 2, min)
planet.dat <- sweep(planets, 2, rge, FUN = "/")
n <- nrow(planet.dat)
wss <- rep(0, 10)
wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var))
for (i in 2:10)
wss[i] <- sum(kmeans(planet.dat,
centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of groups",
ylab = "Within groups sum of squares")
@
\caption{Within-cluster sum of squares for different numbers of clusters for
the exoplanet data. \label{CA-planets-ss}}
\end{center}
\end{figure}
Sadly, Figure~\ref{CA-planets-ss} gives no completely convincing verdict on
the number of groups we should consider but, with a little imagination, `little elbows' can be spotted at the three- and five-group solutions. %%'
We can find the number of planets in each group
using
<>=
planet_kmeans3 <- kmeans(planet.dat, centers = 3)
table(planet_kmeans3$cluster)
@
The centers of the clusters for the untransformed data can be computed using
a small convenience function
<>=
ccent <- function(cl) {
f <- function(i) colMeans(planets[cl == i,])
x <- sapply(sort(unique(cl)), f)
colnames(x) <- sort(unique(cl))
return(x)
}
@
which, applied to the three-cluster solution obtained by $k$-means, gives
<>=
ccent(planet_kmeans3$cluster)
@
for the three-cluster solution and, for the five-cluster solution, using
<>=
planet_kmeans5 <- kmeans(planet.dat, centers = 5)
table(planet_kmeans5$cluster)
ccent(planet_kmeans5$cluster)
@
\subsection{Model-based Clustering in \R{}}
We now proceed to apply model-based clustering to the \Robject{planets} data.
\R{} functions for model-based clustering are available in package
\Rpackage{mclust} \citep{PKG:mclust,HSAUR:FraleyRaftery2002}.
Here we use the \Rcmd{Mclust} function since this
selects both the most appropriate model for the data \stress{and}
the optimal number of groups based on the values of the BIC
computed over several models and a range of
values for the number of groups. The necessary code is:
<>=
library("mclust")
planet_mclust <- Mclust(planet.dat)
@
and we first examine a plot of BIC values using the \R{} code
that is displayed on top of Figure~\ref{CA-mclust1}.
In this diagram the different plotting symbols refer
to different model assumptions about the shape of clusters:
\begin{description}
\item[EII] spherical, equal volume,
\item[VII] spherical, unequal volume,
\item[EEI] diagonal, equal volume and shape,
\item[VEI] diagonal, varying volume, equal shape,
\item[EVI] diagonal, equal volume, varying shape,
\item[VVI] diagonal, varying volume and shape,
\item[EEE] ellipsoidal, equal volume, shape, and orientation,
\item[EEV] ellipsoidal, equal volume and equal shape,
\item[VEV] ellipsoidal, equal shape,
\item[VVV] ellipsoidal, varying volume, shape, and orientation
\end{description}
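Should only a subset of these models be of interest, the search can
be restricted via the \Rarg{modelNames} and \Rarg{G} arguments of
\Rcmd{Mclust}; the following line is merely an illustration of ours
and is not needed for the analysis:
\begin{Schunk}
\begin{Sinput}
R> ### consider only spherical and diagonal models, 1 to 9 groups
R> Mclust(planet.dat, G = 1:9,
+      modelNames = c("EII", "VII", "VVI"))
\end{Sinput}
\end{Schunk}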
\begin{figure}
\begin{center}
<>=
plot(planet_mclust, planet.dat, what = "BIC", col = "black",
ylab = "-BIC", ylim = c(0, 350))
@
\caption{Plot of BIC values for a variety of models and a range of number of
clusters. \label{CA-mclust1}}
\end{center}
\end{figure}
The BIC selects model VVI (diagonal, varying volume and shape) with
three clusters as the best solution as can be
seen from the \Rcmd{print} output:
<>=
print(planet_mclust)
@
This solution can be displayed graphically as a scatterplot matrix;
the plot is shown in Figure~\ref{CA-planets-mclust-scatter}.
Figure~\ref{CA-planets-mclust-scatterclust} depicts the clustering solution
in three-dimensional space.
\begin{figure}
\begin{center}
<>=
clPairs(planet.dat,
classification = planet_mclust$classification,
symbols = 1:3, col = "black")
@
\caption{Scatterplot matrix of planets data showing a three-cluster solution
from \Rcmd{Mclust}. \label{CA-planets-mclust-scatter}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
scatterplot3d(log(planets$mass), log(planets$period),
log(planets$eccen + ifelse(planets$eccen == 0,
0.001, 0)),
type = "h", angle = 55, scale.y = 0.7,
pch = planet_mclust$classification,
y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1,
xlab = "log(mass)", ylab = "log(period)",
zlab = "log(eccen)")
@
\caption{3D scatterplot of planets data showing a three-cluster solution
from \Rcmd{Mclust}. \label{CA-planets-mclust-scatterclust}}
\end{center}
\end{figure}
The number of planets in each cluster and the mean vectors of the three clusters
for the untransformed data can now be inspected by using
<>=
table(planet_mclust$classification)
ccent(planet_mclust$classification)
@
Cluster 1 consists of planets about the same size as Jupiter
with very short periods and small eccentricities (similar to the first
cluster of the $k$-means solution). Cluster 2 consists of slightly
larger planets with moderate periods and large eccentricities,
and cluster 3 contains the very large planets with very large
periods. These two clusters do not match those found by the $k$-means
approach.
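One quick way of checking this lack of agreement (a sketch of ours,
reusing the objects created above) is to cross-tabulate the two
three-cluster partitions:
\begin{Schunk}
\begin{Sinput}
R> ### rows: k-means clusters, columns: Mclust clusters
R> table(kmeans = planet_kmeans3$cluster,
+        Mclust = planet_mclust$classification)
\end{Sinput}
\end{Schunk}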
\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}
\end{document}
HSAUR3/vignettes/tables/ 0000755 0001762 0000144 00000000000 14172224353 014504 5 ustar ligges users HSAUR3/vignettes/tables/exMDS.tex 0000644 0001762 0000144 00000000443 14656356403 016220 0 ustar ligges users \begin{eqnarray*}
s_{ij} = \left\{ \begin{array}{lcl}
9 & \text{if} & i = j \\
8 & \text{if} & 1 \le | i - j | \le 3 \\
7 & \text{if} & 4 \le | i - j | \le 6 \\
& \cdots & \\
1 & \text{if} & 22 \le | i - j | \le 24 \\
0 & \text{if} & | i - j | \ge 25 \\
\end{array} \right.
\end{eqnarray*}
HSAUR3/vignettes/tables/CA_perm.tex 0000644 0001762 0000144 00000000576 14656356403 016555 0 ustar ligges users
\begin{center}
\begin{longtable}{rrl}
\caption{Number of possible partitions depending on the sample size $n$ and
number of clusters $k$. \label{CA:perm}} \\
$n$ & $k$ & Number of possible partitions \\ \hline
$15$ & $3$ & $2,375,101$ \\
$20$ & $4$ & $45,232,115,901$ \\
$25$ & $8$ & $690,223,721,118,368,580$ \\
$100$ & $5$ & $10^{68}$ \\
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/PCA_tab.tex 0000644 0001762 0000144 00000000560 14656356403 016471 0 ustar ligges users
\begin{center}
\begin{longtable}{cccccc}
\caption{Correlations for calculus measurements for the six anterior
mandibular teeth.} \\
\hline
1.00 & & & & & \\
0.54 & 1.00 & & & & \\
0.34 & 0.65 & 1.00 & & & \\
0.37 & 0.65 & 0.84 & 1.00 & & \\
0.36 & 0.59 & 0.67 & 0.80 & 1.00 & \\
0.62 & 0.49 & 0.43 & 0.42 & 0.55 & 1.00 \\
\hline
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/PCA_tab1.tex 0000644 0001762 0000144 00000000431 14172224353 016536 0 ustar ligges users \begin{table}
\begin{center}
\begin{tabular}{cccccc}
1.00 & & & & & \\\
0.54 & 1.00 & & & & \\\
0.34 & 0.65 & 1.00 & & & \\\
0.37 & 0.65 & 0.84 & 1.00 & & \\\
0.36 & 0.59 & 0.67 & 0.80 & 1.00 & \\\
0.62 & 0.49 & 0.43 & 0.42 & 0.55 & 1.00 \\\
\end{tabular}
\end{center}
\end{table}
HSAUR3/vignettes/tables/CI_rtimesc.tex 0000644 0001762 0000144 00000001234 14656356403 017260 0 ustar ligges users
\begin{center}
\begin{longtable}{cc|ccc|c}
\caption{The general $r \times c$ table. \label{SI:rtimesc}} \\
& & & $y$ & & \\\
& & $1$ & $\dots$ & $c$ & \\ \hline
& $1$ & $n_{11}$ & $\dots$ & $n_{1c}$ & $n_{1 \cdot}$ \\\
& $2$ & $n_{21}$ & $\dots$ & $n_{2c}$ & $n_{2 \cdot}$ \\\
$x$ & $\vdots$ & $\vdots$ & $\dots$ & $\vdots$ & $\vdots$ \\\
& $r$ & $n_{r1}$ & $\dots$ & $n_{rc}$ & $n_{r \cdot}$ \\ \hline
& & $n_{\cdot 1}$ & $\dots$ & $n_{\cdot c}$ & $n$ \\\
\end{longtable}
\end{center} HSAUR3/vignettes/tables/MLR-ANOVA-tab.tex 0000644 0001762 0000144 00000000677 14656356403 017311 0 ustar ligges users
\begin{center}
\begin{longtable}{lccc}
\caption{Analysis of variance table for the multiple linear regression model.
\label{MLR-ANOVA-tab}} \\
Source of variation & Sum of squares & Degrees of freedom \\ \hline
Regression & $\sum\limits_{i = 1}^n (\hat{y}_i - \bar{y})^2$ & $q$ \\
Residual & $\sum\limits_{i = 1}^n (\hat{y}_i - y_i)^2$ & $n - q - 1$ \\
Total & $\sum\limits_{i = 1}^n (y_i - \bar{y})^2$ & $n - 1$ \\
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/SI_mcnemar.tex 0000644 0001762 0000144 00000000420 14656356403 017250 0 ustar ligges users
\begin{center}
\begin{longtable}{cccc}
\caption{Frequencies in matched samples data. \label{SI:mcnemartab}} \\
& & \multicolumn{2}{c}{Sample 1} \\
& & present & absent \\
Sample 2 & present & $a$ & $b$ \\
& absent & $c$ & $d$ \\
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/Lanza.tex 0000644 0001762 0000144 00000000647 14656356403 016313 0 ustar ligges users
\begin{center}
\begin{longtable}{ll}
\caption{Classification system for the response variable. \label{CI:scores}}
\\
Classification & Endoscopy Examination \\ \hline
1 & No visible lesions \\
2 & One haemorrhage or erosion \\
3 & 2-10 haemorrhages or erosions \\
4 & 11-25 haemorrhages or erosions \\
5 & More than 25 haemorrhages or erosions \\
& or an invasive ulcer of any size\\ \hline
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/MLR-Xtab.tex 0000644 0001762 0000144 00000000473 14656356403 016571 0 ustar ligges users \begin{eqnarray*}
\X = \left( \begin{array}{ccccc}
1 & x_{11} & x_{12} & \dots & x_{1q} \\
1 & x_{21} & x_{22} & \dots & x_{2q} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x_{n1} & x_{n2} & \dots & x_{nq} \\
\end{array} \right).
\end{eqnarray*}
HSAUR3/vignettes/tables/rec.tex 0000644 0001762 0000144 00000001200 14660150060 015763 0 ustar ligges users \begin{tabular}{llll}
\Rpackage{Matrix} & \Rpackage{lattice} & \Rpackage{mgcv} & \Rpackage{survival}\\
\Rpackage{KernSmooth} & \Rpackage{MASS} & \Rpackage{base} & \Rpackage{boot}\\
\Rpackage{class} & \Rpackage{cluster} & \Rpackage{codetools} & \Rpackage{compiler}\\
\Rpackage{datasets} & \Rpackage{foreign} & \Rpackage{grDevices} & \Rpackage{graphics}\\
\Rpackage{grid} & \Rpackage{methods} & \Rpackage{nlme} & \Rpackage{nnet}\\
\Rpackage{parallel} & \Rpackage{rpart} & \Rpackage{spatial} & \Rpackage{splines}\\
\Rpackage{stats} & \Rpackage{stats4} & \Rpackage{tcltk} & \Rpackage{tools}\\
\Rpackage{utils} & & & \\
\end{tabular}
HSAUR3/vignettes/tables/MA_table.tex 0000644 0001762 0000144 00000000310 14656356403 016675 0 ustar ligges users
\begin{center}
\begin{longtable}{cccc}
& & \multicolumn{2}{c}{response} \\
& & success & failure \\
group & control & $a$ & $b$ \\
& treatment & $c$ & $d$ \\
\end{longtable}
\end{center}
HSAUR3/vignettes/tables/SI_rtimesc.tex 0000644 0001762 0000144 00000001234 14656356403 017300 0 ustar ligges users
\begin{center}
\begin{longtable}{cc|ccc|c}
\caption{The general $r \times c$ table. \label{SI:rtimesc}} \\
& & & $y$ & & \\\
& & $1$ & $\dots$ & $c$ & \\ \hline
& $1$ & $n_{11}$ & $\dots$ & $n_{1c}$ & $n_{1 \cdot}$ \\\
& $2$ & $n_{21}$ & $\dots$ & $n_{2c}$ & $n_{2 \cdot}$ \\\
$x$ & $\vdots$ & $\vdots$ & $\dots$ & $\vdots$ & $\vdots$ \\\
& $r$ & $n_{r1}$ & $\dots$ & $n_{rc}$ & $n_{r \cdot}$ \\ \hline
& & $n_{\cdot 1}$ & $\dots$ & $n_{\cdot c}$ & $n$ \\\
\end{longtable}
\end{center} HSAUR3/vignettes/Ch_multiple_linear_regression.Rnw 0000644 0001762 0000144 00000056065 14416236367 022006 0 ustar ligges users
\documentclass{chapman}
%%% copy Sweave.sty definitions
%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave}
\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}
%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
fontshape=it,
fontsize=\small}
\rawSinput
}
%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
\renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
\end{figure} }
\DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline,
samepage = true,
fontfamily=courier,
fontshape=it,
fontsize=\relsize{-1}}
}
%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%% numbers=left
}
\newcommand{\numberSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}
%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@{\fontseries{b}\selectfont #1} package} {\fontseries{b}\selectfont #1}}
\newcommand{\rpackage}[1]{{\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}
%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}}
\newcommand{\stress}[1]{\textit{#1}}
\newcommand{\booktitle}[1]{\textit{#1}} %%'
%%% Math symbols
\usepackage{amstext}
\usepackage{amsmath}
\newcommand{\E}{\mathsf{E}}
\newcommand{\Var}{\mathsf{Var}}
\newcommand{\Cov}{\mathsf{Cov}}
\newcommand{\Cor}{\mathsf{Cor}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\C}{\mathbf{C}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\bS}{\mathbf{S}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\L}{L}
\renewcommand{\P}{\mathsf{P}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\m}{\mathbf{m}}
\newcommand{\argmin}{\operatorname{argmin}\displaylimits}
\newcommand{\argmax}{\operatorname{argmax}\displaylimits}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bbeta}{\mathbf{\beta}}
%%% links
\usepackage{hyperref}
\hypersetup{%
pdftitle = {A Handbook of Statistical Analyses Using R (3rd Edition)},
pdfsubject = {Book},
pdfauthor = {Torsten Hothorn and Brian S. Everitt},
colorlinks = {black},
linkcolor = {black},
citecolor = {black},
urlcolor = {black},
hyperindex = {true},
linktocpage = {true},
}
%%% captions & tables
%% : conflicts with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%%
\usepackage{longtable}
\usepackage[figuresright]{rotating}
%%% R symbol in chapter 1
\usepackage{wrapfig}
%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse
%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}
%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\stepcounter{exercise} \item{Ex.~\arabic{chapter}.\arabic{exercise} }}
%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}
%%% for manual corrections
%\renewcommand{\baselinestretch}{2}
%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}
%%% color
\usepackage{color}
%%% hyphenations
\hyphenation{drop-out}
\hyphenation{mar-gi-nal}
%%% new bidirectional quotes needed
\usepackage[utf8]{inputenc}
%\usepackage{setspace}
\definecolor{sidebox_todo}{rgb}{1,1,0.2}
\newcommand{\todo}[1]{
\hspace{0pt}%
\marginpar{%
\fcolorbox{black}{sidebox_todo}{%
\parbox{\marginparwidth} {
\raggedright\sffamily\footnotesize{TODO: #1}%
}
}%
}
}
\begin{document}
%% Title page
\title{A Handbook of Statistical Analyses Using \R{} --- 3rd Edition}
\author{Torsten Hothorn and Brian S. Everitt}
\maketitle
%%\VignetteIndexEntry{Multiple Linear Regression}
%%\VignetteDepends{wordcloud}
\setcounter{chapter}{5}
\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE}
<>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
"package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+ ",
width = 63, # digits = 4,
show.signif.stars = FALSE,
SweaveHooks = list(leftpar = function()
par(mai = par("mai") * c(1, 1.05, 1, 1)),
bigleftpar = function()
par(mai = par("mai") * c(1, 1.7, 1, 1))))
HSAURpkg <- require("HSAUR3")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR3"))
rm(HSAURpkg)
### hm, R-2.4.0 --vanilla seems to need this
a <- Sys.setlocale("LC_ALL", "C")
###
book <- TRUE
refs <- cbind(c("AItR", "DAGD", "SI", "CI", "ANOVA", "MLR", "GLM",
"DE", "RP", "GAM", "SA", "ALDI", "ALDII", "SIMC", "MA", "PCA",
"MDS", "CA"), 1:18)
ch <- function(x) {
ch <- refs[which(refs[,1] == x),]
if (book) {
return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
} else {
return(paste("Chapter~", ch[2], sep = ""))
}
}
if (file.exists("deparse.R"))
source("deparse.R")
setHook(packageEvent("lattice", "attach"), function(...) {
lattice.options(default.theme =
function()
standard.theme("pdf", color = FALSE))
})
@
\pagestyle{headings}
<>=
book <- FALSE
@
<>=
library("wordcloud")
@
\chapter[Simple and Multiple Linear Regression]{Simple and Multiple Linear Regression: \\ How Old is the Universe and Cloud Seeding \label{MLR}}
\section{Introduction}
\index{Age of the Universe}
\cite{HSAUR:Freedmanetal2001} give the relative velocity and the distance of $24$ galaxies,
according to measurements made using the Hubble Space Telescope -- the data
are contained in the \Rpackage{gamair} package accompanying \cite{HSAUR:Wood2006}, see
Table~\ref{MLR-hubble-tab}.
Velocities are assessed by measuring the Doppler red shift in the spectrum of
light observed from the galaxies concerned, although some correction
for `local' velocity components is required. Distances are measured
using the known relationship between the period of Cepheid variable
stars and their luminosity. How can these data be used to estimate
the age of the universe? Here we shall show how this can be done
using simple linear regression.
<>=
data("hubble", package = "gamair")
names(hubble) <- c("galaxy", "velocity", "distance")
toLatex(HSAURtable(hubble, package = "gamair"), pcol = 2,
caption = paste("Distance and velocity for 24 galaxies."),
label = "MLR-hubble-tab")
@
\vspace*{-1cm}
\textit{Source}: From Freedman W. L., et al., \textit{The Astrophysical Journal},
553, 47--72, 2001. With permission.
\vspace*{1cm}
\index{Cloud seeding}
{\tabcolsep3.5pt
<>=
data("clouds", package = "HSAUR3")
names(clouds) <- c("seeding", "time", "sne", "cloudc", "prewet", "EM", "rain")
toLatex(HSAURtable(clouds), pcol = 1,
caption = paste("Cloud seeding experiments in Florida -- see text for",
"explanations of the variables. Note that the \\Robject{clouds} data set has slightly different variable names."),
label = "MLR-clouds-tab")
@
}
Weather modification, or cloud seeding, is the treatment of individual
clouds or storm systems with various inorganic and organic materials
in the hope of achieving an increase in rainfall. Introduction
of such material into a cloud that contains supercooled water,
that is, liquid water colder than zero degrees Celsius,
has the aim of
inducing freezing, with the consequent ice particles growing
at the expense of liquid droplets and becoming heavy enough to
fall as rain from clouds that otherwise would produce none.
The data shown in Table~\ref{MLR-clouds-tab} were collected in the summer
of 1975 from an experiment to investigate the use of massive
amounts of silver iodide ($100$ to $1000$ grams per cloud) in cloud
seeding to increase rainfall \citep{HSAUR:Woodleyetal1977}.
In the experiment, which was conducted
in an area of Florida, 24 days were judged suitable for seeding
on the basis that a measured suitability criterion, denoted \stress{S-Ne},
was not less than $1.5$. Here \stress{S} is the `seedability', %'
the difference between the maximum height of a cloud if seeded
and the same cloud if not seeded predicted by a suitable cloud
model, and \stress{Ne} is the number of hours between $1300$
and $1600$ G.M.T. with $10$ centimeter echoes in the target; this
quantity biases the decision for experimentation against naturally
rainy days. Consequently, optimal days for seeding are those
on which seedability is large and the natural rainfall early
in the day is small.
On suitable days, a decision was taken at random as to whether
to seed or not. For each day the following variables were measured:
\begin{description}
\item[\Robject{seeding}] a factor indicating whether seeding action occurred (yes or no),
\item[\Robject{time}] number of days after the first day of the experiment,
\item[\Robject{cloudc}] the percentage cloud cover in the experimental area,
measured using radar,
\item[\Robject{prewet}] the total rainfall in the target area one hour before seeding
(in cubic meters $\times 10^{7}$),
\item[\Robject{EM}] a factor showing whether the radar echo was moving or
stationary,
\item[\Robject{rain}] the amount of rain in cubic meters $\times 10^{7}$,
\item[\Robject{sne}] suitability criterion, see above.
\end{description}
The objective in analyzing these data is to see how rainfall
is related to the explanatory variables and, in particular, to
determine the effectiveness of seeding. The method to be used
is \stress{multiple linear regression}.
\section{Simple Linear Regression}
\section{Multiple Linear Regression \label{MLR-MLR}}
\subsection{Regression Diagnostics}
\section{Analysis Using \R{}}
\subsection{Estimating the Age of the Universe}
Prior to applying a simple regression to the data it will
be useful to look at a plot to assess their major features.
The \R{} code given in Figure~\ref{MLR-hubble-plot} produces
a scatterplot of velocity and distance.
\begin{figure}
\begin{center}
<>=
plot(velocity ~ distance, data = hubble)
@
\caption{Scatterplot of velocity and distance. \label{MLR-hubble-plot}}
\end{center}
\end{figure}
The diagram shows a clear, strong relationship between velocity
and distance. The next step is to fit a simple linear regression model
to the data, but in this case the nature of the data requires a model
without an intercept because if distance is zero, so is relative speed.
So the model to be fitted to these data is
\begin{eqnarray*}
\text{velocity} = \beta_1 \text{distance} + \varepsilon.
\end{eqnarray*}
This is essentially what astronomers call Hubble's Law and
$\beta_1$ is known as Hubble's constant; $\beta_1^{-1}$ gives
an approximate age of the universe.
To fit this model we are estimating $\beta_1$ using formula
(\ref{MLR:beta1}). Although this
operation is rather easy
<>=
sum(hubble$distance * hubble$velocity) /
sum(hubble$distance^2)
@
it is more convenient to apply \R's linear modeling function
<>=
hmod <- lm(velocity ~ distance - 1, data = hubble)
@
Note that the model formula specifies a model without intercept.
We can now extract the estimated model coefficients via
<>=
coef(hmod)
@
and add this estimated regression line to the scatterplot; the
result is shown in Figure~\ref{MLR-hubble-lmplot}. In addition,
we produce a scatterplot of the residuals $y_i - \hat{y}_i$ against
fitted values $\hat{y}_i$ to assess the quality of the model fit.
It seems that for higher distance values the variance of velocity
increases; however, we are interested only in the estimated
parameter $\hat{\beta}_1$, which remains valid under variance
heterogeneity (in contrast to $t$-tests and associated $p$-values).
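If standard errors valid under this variance heterogeneity are
nevertheless required, a sandwich estimator could be used. The
following sketch of ours relies on the \Rpackage{sandwich} package
(assumed to be installed) and is not part of the analysis presented
here:
\begin{Schunk}
\begin{Sinput}
R> ### heteroscedasticity-consistent standard error of the slope
R> library("sandwich")
R> sqrt(diag(vcovHC(hmod)))
\end{Sinput}
\end{Schunk}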
Now we can use the estimated value of $\beta_1$ to find an approximate value
for the age of the universe. The Hubble constant itself has units of
$\text{km} \times \text{sec}^{-1} \times \text{Mpc}^{-1}$. A mega-parsec (Mpc)
is $3.09 \times 10^{19}$km, so we need to divide the estimated value of $\beta_1$
by this amount in order to obtain Hubble's constant with units of $\text{sec}^{-1}$.
The approximate age of the universe in seconds will then be the inverse of
this calculation. Carrying out the necessary computations
<>=
Mpc <- 3.09 * 10^19
ysec <- 60^2 * 24 * 365.25
Mpcyear <- Mpc / ysec
1 / (coef(hmod) / Mpcyear)
@
gives an estimated age of roughly $12.8$ billion years.
\begin{figure}
\begin{center}
<>=
layout(matrix(1:2, ncol = 2))
plot(velocity ~ distance, data = hubble)
abline(hmod)
plot(hmod, which = 1)
@
\caption{Scatterplot of velocity and distance with estimated
regression line (left) and plot of residuals against fitted values (right).
\label{MLR-hubble-lmplot}}
\end{center}
\end{figure}
\subsection{Cloud Seeding}
Again, a graphical display highlighting the most important
aspects of the data will be helpful.
Here we will construct boxplots of the rainfall in each category
of the dichotomous explanatory variables and scatterplots of
rainfall against each of the continuous explanatory variables.
\begin{figure}
\begin{center}
<>=
data("clouds", package = "HSAUR3")
layout(matrix(1:2, nrow = 2))
bxpseeding <- boxplot(rain ~ seeding, data = clouds,
ylab = "Rainfall", xlab = "Seeding")
bxpecho <- boxplot(rain ~ EM, data = clouds,
ylab = "Rainfall", xlab = "Echo Motion")
@
<>=
layout(matrix(1:2, nrow = 2))
bxpseeding <- boxplot(rain ~ seeding, data = clouds,
ylab = "Rainfall", xlab = "Seeding")
bxpecho <- boxplot(rain ~ EM, data = clouds,
ylab = "Rainfall", xlab = "Echo Motion")
@
\caption{Boxplots of \Robject{rain}. \label{MLR-rainfall-boxplot}}
\end{center}
\end{figure}
\begin{figure}
\begin{center}
<>=
layout(matrix(1:4, nrow = 2))
plot(rain ~ time, data = clouds)
plot(rain ~ cloudc, data = clouds)
plot(rain ~ sne, data = clouds, xlab="S-Ne criterion")
plot(rain ~ prewet, data = clouds)
@
\caption{Scatterplots of \Robject{rain} against the continuous
covariates. \label{MLR-rainfall-scplot}}
\end{center}
\end{figure}
Both the boxplots (Figure~\ref{MLR-rainfall-boxplot}) and the scatterplots
(Figure~\ref{MLR-rainfall-scplot}) show some evidence
of outliers. The row names of the extreme observations in the
\Robject{clouds} \Rclass{data.frame} can be identified via
<>=
rownames(clouds)[clouds$rain %in% c(bxpseeding$out,
bxpecho$out)]
@
where \Robject{bxpseeding} and \Robject{bxpecho} are variables
created by \Rcmd{boxplot} in Figure~\ref{MLR-rainfall-boxplot}.
For now we shall not remove these observations but will bear
in mind during the modeling process that they may cause problems.
In this example it is sensible to assume that the effect of
some of the other explanatory variables is modified by seeding
and therefore to consider a model that includes seeding as a
covariate and, furthermore, allows interaction terms
\index{Interaction}
for \Robject{seeding} with each of the covariates except \Robject{time}.
This model can be described by the \Rclass{formula}
<>=
clouds_formula <- rain ~ seeding +
seeding:(sne + cloudc + prewet + EM) +
time
@
and the design matrix $\X^\star$ can be computed via
<>=
Xstar <- model.matrix(clouds_formula, data = clouds)
@
By default, treatment contrasts have been applied to the dummy codings of
the factors \Robject{seeding} and \Robject{EM} as can be seen from
the inspection of the \Robject{contrasts} attribute of the model matrix
<>=
attr(Xstar, "contrasts")
@
The default contrasts can be changed via the \Rarg{contrasts.arg}
argument to \Rcmd{model.matrix} or the \Robject{contrasts} argument to the
fitting function, for example \Rcmd{lm} or \Rcmd{aov} as shown in
\Sexpr{ch("ANOVA")}.
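For instance, sum-to-zero contrasts for both factors could be
requested as follows; this is an illustration of ours, while the
analysis below keeps the default treatment contrasts:
\begin{Schunk}
\begin{Sinput}
R> ### sum-to-zero instead of treatment contrasts for both factors
R> Xsum <- model.matrix(clouds_formula, data = clouds,
+      contrasts.arg = list(seeding = "contr.sum",
+          EM = "contr.sum"))
R> attr(Xsum, "contrasts")
\end{Sinput}
\end{Schunk}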
However, such internals are hidden and performed by high-level model-fitting
functions such as \Rcmd{lm} which will be used to fit the linear model
defined by the \Rclass{formula} \Robject{clouds\_formula}:
<>=
clouds_lm <- lm(clouds_formula, data = clouds)
class(clouds_lm)
@
The result of the model fitting is an object of class \Rclass{lm} for which
a \Rcmd{summary} method showing the conventional regression analysis
output is available. The output in Figure~\ref{MLR-clouds-summary}
shows the estimates $\hat{\beta}^\star$ with corresponding standard errors
and $t$-statistics as well as the $F$-statistic with associated $p$-value.
\renewcommand{\nextcaption}{\R{} output of the linear model fit
for the \Robject{clouds} data.
\label{MLR-clouds-summary}}
\SchunkLabel
<>=
summary(clouds_lm)
@
\SchunkRaw
Many methods are available for extracting components of the fitted model.
The estimates $\hat{\beta}^\star$ can be assessed via
\newpage
<>=
betastar <- coef(clouds_lm)
betastar
@
and the corresponding covariance matrix $\Cov(\hat{\beta}^\star)$ is available
from the \Rcmd{vcov} method
<>=
Vbetastar <- vcov(clouds_lm)
@
where the square roots of the diagonal elements are the standard errors as
shown in Figure~\ref{MLR-clouds-summary}
<>=
sqrt(diag(Vbetastar))
@
\begin{figure}
\begin{center}
<