readstata13/0000755000176200001440000000000013302354454012371 5ustar liggesusersreadstata13/inst/0000755000176200001440000000000013077075424013354 5ustar liggesusersreadstata13/inst/extdata/0000755000176200001440000000000013077075572015012 5ustar liggesusersreadstata13/inst/extdata/missings.dta0000644000176200001440000000242713077075424017341 0ustar liggesusers
118LSF11 Jul 2016 23:28
<7Cf missings%9.0g=  K K =?d=  =KB=KB L L?d    KA==K (08@HPX`hpx
readstata13/inst/extdata/underscore.do0000644000176200001440000000020213077075424017475 0ustar liggesusersclear all set obs 2 gen v_1 = _n gen v_2 = _n gen long_name_multiple_underscores = _n compress save "underscore.dta", replace readstata13/inst/extdata/statacar.dta0000644000176200001440000002751713077075424017316 0ustar liggesusers
118LSF  6 Sep 2016 14:04
P UC),*-C/O/ide20brandsmodeltypehpdroommaxknmileageecarhldateg_circleldatecalmodelStrL%8.0g%8.0gc%8.0g%8.0g%20sc%8.0g%8.0gc%8.0g%%20s%8.0g%6.2f%8.0g%13.0g%8.0g%10.0g%9.0g%8.0g%tbsp500%tdg%9sgakenakeodeltype_enurationivisionivisionivisionriginriginNumeric IDXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKBrand of carXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKCar modelarXKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKCar classificationKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKHorse PowercationKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKMaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaximum speedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKLaunch datedtionKXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKLaunch date (calendar)KXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLKaunch date (calendar)KXnKXHLKXXHLՐKpdHL KpdqL XL  XX ՐK7HLHLK_dta⥂__YZ@@PE`_E` _غZZ@|[_lang_cen_lang_v_en ^ZX ^Zenldatecal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Einführungsdatum (Kalender)ldateal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Einführungsdatummaxeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Höchstgeschwindigkeithpeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Pferdestärken typeal__YZ@@PE`_E` _غZZ@|[_lang_l_de!"#$%&'()*+,-./:;<=>?@[\]^type_detypeal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Klassifikation modelal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Automodellbrandal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Herstellermarkeidndal__YZ@@PE`_E` _غZZ@|[_lang_v_de!"#$%&'()*+,-./:;<=>?@[\]^Numerische ID_dtaal__YZ@@PE`_E` _غZZ@|[_lang_listRpj0Rpj2en deMeyerordSpeed Start 2000e@f@33#AjF MeyerrdHappy Family!@b b@33@oF AkikoitSusumu 1 #yE@-̬]@XjF AkikonturySusumu 3 (;@P33333_@@mF HutchectraLumberjack 3000q=@fffffc@33cAnlF EriksonbreE-Car 2000+R.@d@lF EriksonreeMaxinator"07@Y#~dtjF EriksonleEMimizer *;@#jF GSO Speed Start 2000GSO  Happy FamilyGSO  Susumu 1GSO  Susumu 3GSO Lumberjack 3000GSO  E-Car 2000GSO  MaxinatorGSO Mimizerptype_deXHLX'' 84#0GeländewagenSportwagenStadtautoFamilienautomaxminftype_enXHLX'' .* &Off-RoadRoadsterCity carFamily carmaxmin
readstata13/inst/extdata/sp500.stbcal0000644000176200001440000000073613077075424017055 0ustar liggesusers* Business calendar "sp500" created by -bcal create- * Created/replaced on 18 Nov 2014 version 12.1 purpose "S&P 500 for 2001" dateformat ymd range 2001jan02 2001dec31 centerdate 2001jan02 omit dayofweek (Sa Su) omit date 2001jan15 omit date 2001feb19 omit date 2001apr13 omit date 2001may28 omit date 2001jul04 omit date 2001sep03 omit date 2001sep11 omit date 2001sep12 omit date 2001sep13 omit date 2001sep14 omit date 2001nov22 omit date 2001dec25 readstata13/inst/extdata/missings_msf.dta0000644000176200001440000000256613077075424020212 0ustar liggesusers
117MSF05 Sep 2016 20:29
 I >Mjvbins%9.0g%9.0g%9.0g%2s?1
readstata13/inst/extdata/statacar.do0000644000176200001440000000363713077075424017145 0ustar liggesusers clear all input int(id) str20 brand str20 model long(type) int(hp) double(max) float(mileage) byte(ecar) long(ldate) str20(ldatecal) 1 "Meyer" "Speed Start 2000" 2 150 176.5 10.2 0 1 2001-01-03 2 "Meyer" "Happy Family" 4 98 145 5.6 0 247 2001-12-31 3 "Akiko" "Susumu 1" 3 45 118.7 -1 0 14 2001-01-23 4 "Akiko" "Susumu 3" 4 80 127.3 6.8 0 134 2001-07-16 5 "Hutch" "Lumberjack 3000" 1 180 156.2 14.2 0 110 2001-06-11 6 "Erikson" "E-Car 2000" 3 . . -2 1 100 2001-05-25 7 "Erikson" "Maxinator" 2147483620 32740 8.988e+307 1.701e+38 100 19 2001-01-30 7 "Erikson" "Mimizer" -2147483647 -32767 -1.798e+308 -1.701e+38 -127 1 2001-01-03 end gen ldatecal2 = date(ldatecal, "YMD") generate strL modelStrL = model drop ldatecal rename ldatecal2 ldatecal // bcal uses a special format. // %tb for business calendar and following the calendar name format ldatecal %td format ldate %tbsp500 // missings replace mileage = .a if mileage ==-1 // no info replace mileage = .b if mileage ==-2 // not applicable // Label en label language en, rename label var id "Numeric ID" label var brand "Brand of car" label var type "Car classification" label var model "Car model" label var hp "Horse Power" label var max "Maximum speed" label var ldate "Launch date" label var ldatecal "Launch date (calendar)" label define type_en 1 "Off-Road" 2 "Roadster" 3 "City car" 4 "Family car" 2147483620 "max" -2147483647 "min", modify label value type type_en // Label de label language de, new label var id "Numerische ID" label var brand "Herstellermarke" label var type "Klassifikation" label var model "Automodell" label var hp "Pferdestrken" label var max "Hchstgeschwindigkeit" label var ldate "Einfhrungsdatum" label var ldatecal "Einfhrungsdatum (Kalender)" label define type_de 1 "Gelndewagen" 2 "Sportwagen" 3 "Stadtauto" 4 "Familienauto" 2147483620 "max" -2147483647 "min", modify label value type type_de label language en save "statacar.dta", replace readstata13/inst/extdata/encodecp.dta0000644000176200001440000000121713077075424017261 0ustar liggesusersst9;9t9ߖJ 1 Sep 2016 17:16numchr%8.0g%9snumlabeltmp/sd04321.000000"cp.dta"acter vtmp/sd04321.000000"cp.dta"acter vEUROEGnumlabel;9t9 EUROEreadstata13/inst/extdata/encode.do0000644000176200001440000000037113077075424016570 0ustar liggesusersclear all set obs 6 gen int num = _n label variable num äöüß label define numlabel 1 "ä" 2 "ö" 3 "ü" 4 "ß" 5 "€" 6 "Œ" label values num numlabel // create character variable from labels decode num, gen(chr) save "encode.dta", replace readstata13/inst/extdata/missings.do0000644000176200001440000000057713077075424017177 0ustar liggesusersclear all set obs 27 gen missing = _n mvdecode missing, mv( 1 = . \ 2 = .a \ 3 = .b \ 4 = .c \ 5 = .d \ 6 = .e \ 7 = .f \ /// 8 = .g \ 9 = .h \ 10 = .i \ 11 = .j \ 12 = .k \ 13 = .l \ 14 = .m \ /// 15 = .n \ 16 = .o \ 17 = .p \ 18 = .q \ 19 = .r \ 20 = .s \ 21 = .t \ /// 22 = .u \ 23 = .v \ 24 = .w \ 25 = .x \ 26 = .y \ 27 = .z ) save "missings.dta", replace readstata13/inst/extdata/gen_fac.do0000644000176200001440000000016613077075424016717 0ustar liggesusersclear all set obs 2 gen v1 = _n label define v1 1 "one" label values v1 v1 compress save "gen_fac.dta", replace readstata13/inst/extdata/underscore.dta0000644000176200001440000000466113077075424017660 0ustar liggesusers
118LSF 1 Sep 2016 15:07
@]C f y v_1_name_multiple_underscoresv_2_name_multiple_underscoreslong_name_multiple_underscores%9.0g%9.0g%9.0go==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooKo==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooKo==K=nK=oPo==oՐKp?oKp?qLXL  ==ՐK?ooK
readstata13/inst/extdata/encode.dta0000644000176200001440000000404713077075424016742 0ustar liggesusers
118LSF 1 Sep 2016 17:13
>Up 'numchr%8.0g%9snumlabeläöüßcVpcVKpcVnKpcV::pcVpcV:ՐK:KqLXL  pcVpcVՐK::KäöüßcVpcVKpcVnKpcV::pcVpcV:ՐK:KqLXL  pcVpcVՐK::Käöü߀ŒKnumlabelpcVpcVՐK:pcV äöü߀Œ
readstata13/inst/extdata/nonint.dta0000644000176200001440000000060713077075424017010 0ustar liggesuserssc`-g12 Jul 2016 00:54v1%10.0gv1c&D0cpD[\]^?333333?v1pJ/ponereadstata13/inst/extdata/test.zip0000644000176200001440000000370513077075572016522 0ustar liggesusersPKx_JݳO/ statacar.dtaUT {X{Xux dZ[hU4^4*1f2Nǭ5m$izvtgvgfFxyPbAPj"*ZDTDlPAQ,'mEk6f??B"bdcb[g1R! 02ˬ1l.C6mf `LE)욌_Fx6m6 ZoַjC/K ]⡂)UaIo :*fAo^(W unYwVy+{\tlg{qPpl:LlߥJg& &KG c8DRm3mF=(ll^dxh?#a-` F?6 1dC,7 %~`,$#Q{i+OB B‘ix( uof~OxOGu™5ˏ8 軄}9x\sa*(0 Е)[I>J /v40" W"#C#u"58'FlnQ=׎o`n+z8v}SїUͫ04r;a0]Bu:IU:? %URQ=MbΦ;&ye+{]֞SkQTs^fug&D48g-v,1 C> 6h=-(`lCYuF *歬w^r 8o#,pO. +B B?&j;U_ޚ:BJr(4!Ptt1uukO DIC@-gm*、m/?t4#Ch0D*"Edo0q/nۗ-_r=zYpzcyO ᡇy :/ :.V8U XU`U`gcX sa',G4&6co\e(nD:^4W,:Ͱɉit{-(edc|Ff `!ݔi86sxA0; E͖]^vp -WYoEPoX3 d`+|bdT|z=m,D$8t3eYtb`k0eҔR6B¸y;NĞ#%i)͘}LsN:%?\K-+E +k|sI:kY2_(ˇE2̲o;8,ܲQ`dW~JsU& lqe;d7rlφ f9(.Yǔ5KR@*m7ͣ'. XkJN4>ur]KOKaF B`Glu'#&2u }UР"k!(aWm8*c*E*1(-Pܔk< &ҧ6'fVZ7J*/:Bjɳt}&cṇ`؛texP~t^ws<͆X9i %Y)y>7`{:ݳG6 lrq~,ܖ\v|?PKx_JݳO/ statacar.dtaUT{Xux dPKR]readstata13/inst/extdata/missings_lsf.dta0000644000176200001440000000256613077075424020211 0ustar liggesusers
117LSF05 Sep 2016 20:27
I >Mjvbins%9.0g%9.0g%9.0g%2s?1
readstata13/inst/extdata/gen_fac.dta0000644000176200001440000000252413077075424017065 0ustar liggesusers
118LSF12 Jul 2016 00:22
<7CfuHTv1%9.0gv1?c`)`)K`)K`)?c?d?c`)`)?cKPO?cKPO L L?d  `)`)K"?c?cKv1K?d`)?d`)K?c`)one
readstata13/inst/extdata/nonint.do0000644000176200001440000000020413077075424016633 0ustar liggesusersclear all set obs 2 gen double v1 = _n recode v1 2 = 1.2 label define v1 1 "one" label values v1 v1 save "nonint.dta", replace readstata13/inst/include/0000755000176200001440000000000013302210140014750 5ustar liggesusersreadstata13/inst/include/read_pre13_dta.h0000644000176200001440000000171013302177220017710 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READPRE13DTA_H #define READPRE13DTA_H Rcpp::List read_pre13_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows, const Rcpp::CharacterVector selectcols); #endif readstata13/inst/include/read_data.h0000644000176200001440000000215113302177220017037 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READDATA_H #define READDATA_H Rcpp::List read_data(FILE * file, const Rcpp::IntegerVector vartype_kk, const bool missing, const int8_t release, const uint64_t nn, uint32_t kk, const Rcpp::IntegerVector vartype_sj, const std::string byteorder, const bool swapit); #endif readstata13/inst/include/readstata.h0000644000176200001440000001322413302210140017073 0ustar liggesusers/* * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READSTATA_H #define READSTATA_H #include #include #include #include #include #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) /* Test for GCC < 4.9.0 */ #if GCC_VERSION < 40900 & !__clang__ typedef signed char int8_t; typedef unsigned char uint8_t; typedef signed short int16_t; typedef unsigned short uint16_t; typedef signed int int32_t; typedef unsigned int uint32_t; #else #include #endif #ifdef __APPLE__ # define off64_t off_t # define fseeko64 fseeko #endif #include "read_dta.h" #include "read_pre13_dta.h" #include "statadefines.h" #include "swap_endian.h" template T readbin( T t , FILE * file, bool swapit) { if (fread(&t, sizeof(t), 1, file) != 1) { if (feof(file)) return 0; // this is expected after reading the labeltable } else if (ferror(file)){ Rcpp::warning("num: a binary read error occurred."); } if (swapit==0) return(t); else return(swap_endian(t)); } template T readuint48( T t , FILE * file, bool swapit) { char uint48[6]; if (fread(uint48, sizeof(uint48), 1, file) != 1) { if (feof(file)) return 0; // this is expected after reading the labeltable } else if (ferror(file)){ Rcpp::warning("num: a binary read error occurred."); } t = *(uint64_t *)&uint48; if (swapit==0) return(t); else return(swap_endian(t)); } static void readstring(std::string &mystring, FILE * fp, int nchar) { if (!fread(&mystring[0], nchar, 1, fp)) Rcpp::warning("char: a binary read error occurred"); } inline void test(std::string testme, FILE * file) { std::string test(testme.size(), '\0'); readstring(test,file, test.size()); if (testme.compare(test)!=0) { fclose(file); Rcpp::warning("\n testme:%s \n test: %s\n", testme.c_str(), test.c_str()); Rcpp::stop("When attempting to read %s: Something went wrong!", testme.c_str()); } } template static void writebin(T t, std::fstream& dta, bool swapit) { if (swapit==1){ T t_s = swap_endian(t); dta.write((char*)&t_s, sizeof(t_s)); } else { dta.write((char*)&t, sizeof(t)); } } template static void writestr(std::string val_s, T len, std::fstream& dta) { std::stringstream val_stream; val_stream << std::left << std::setw(len) << std::setfill('\0') << val_s; std::string val_strl = val_stream.str(); dta.write(val_strl.c_str(),val_strl.length()); } inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) { uint32_t k = vartype.size(); Rcpp::IntegerVector rlen(k); // calculate row length in byte for (uint32_t i=0; i(ms) Rcpp::Rcout << "Variable " << ms << " was not found in dta-file." << std::endl; } // report position for found cases mm = Rcpp::match(y, x); return(mm); } // calculate the maximum jump. This calculates the maximum space we can skip if // reading only a single variable. Before we skipped over each variable. Now we // skip over them combined. Therefore if a value in x is positive push it // into a new vector. If negative, sum the length up. inline Rcpp::IntegerVector calc_jump(Rcpp::IntegerVector x) { Rcpp::IntegerVector y; int64_t val = 0; bool last = 0; uint32_t k = x.size(); for (uint32_t i=0; i 0) & (last == 0)) y.push_back(val); val = value; y.push_back(val); last = 1; } if ((i+1 == k) & (last == 0)) { y.push_back(val); } } return(y); } #endif readstata13/inst/include/read_dta.h0000644000176200001440000000202113302177220016672 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef READDTA_H #define READDTA_H Rcpp::List read_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows, const Rcpp::CharacterVector selectcols, const bool strlexport, const Rcpp::CharacterVector strlpath); #endif readstata13/inst/include/swap_endian.h0000644000176200001440000000224213077075424017440 0ustar liggesusers#ifndef SWAP_ENDIAN #define SWAP_ENDIAN /*#include */ #include #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) /* Test for GCC < 4.8.0 */ #if GCC_VERSION < 40800 & !__clang__ static inline unsigned short __builtin_bswap16(unsigned short a) { return (a<<8)|(a>>8); } #endif template T swap_endian(T t) { if (typeid(T) == typeid(int16_t)) return __builtin_bswap16(t); if (typeid(T) == typeid(uint16_t)) return __builtin_bswap16(t); if (typeid(T) == typeid(int32_t)) return __builtin_bswap32(t); if (typeid(T) == typeid(uint32_t)) return __builtin_bswap32(t); if (typeid(T) == typeid(int64_t)) return __builtin_bswap64(t); if (typeid(T) == typeid(uint64_t)) return __builtin_bswap64(t); union v { double d; float f; uint32_t i32; uint64_t i64; } val; if (typeid(T) == typeid(float)){ val.f = t; val.i32 = __builtin_bswap32(val.i32); return val.f; } if (typeid(T) == typeid(double)){ val.d = t; val.i64 = __builtin_bswap64(val.i64); return val.d; } else return t; } #endif readstata13/inst/include/statadefines.h0000644000176200001440000000445213302177220017613 0ustar liggesusers/* * Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #ifndef STATADEFINES #define STATADEFINES /* Test for a little-endian machine */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define sbyteorder "LSF" #define SBYTEORDER 2 #else #define sbyteorder "MSF" #define SBYTEORDER 1 #endif #define swapit FALSE /*Define missings*/ #define STATA_BYTE_NA_MIN -127 #define STATA_BYTE_NA_MAX +100 #define STATA_BYTE_NA +101 #define STATA_BYTE_NA_104 +127 // guess. #define STATA_SHORTINT_NA_MIN -32767 #define STATA_SHORTINT_NA_MAX +32740 #define STATA_SHORTINT_NA +32741 #define STATA_INT_NA_MIN -2147483647 #define STATA_INT_NA_MAX +2147483620 #define STATA_INT_NA +2147483621 #define STATA_INT_NA_108 2147483647 #define STATA_FLOAT_NA_MAX (1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+14/pow(16.0,6))*pow(2.0,126) #define STATA_FLOAT_NA_MIN -STATA_FLOAT_NA_MAX #define STATA_FLOAT_NA 1+pow(2.0,127) #define STATA_DOUBLE_NA_MAX (1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+15/pow(16.0,6)+15/pow(16.0,7)+15/pow(16.0,8)+15/pow(16.0,9)+15/pow(16.0,10)+15/pow(16.0,11)+15/pow(16.0,12)+15/pow(16.0,13))*pow(2.0,1022) #define STATA_DOUBLE_NA_MIN -1*(1+15/pow(16.0,1)+15/pow(16.0,2)+15/pow(16.0,3)+15/pow(16.0,4)+15/pow(16.0,5)+15/pow(16.0,6)+15/pow(16.0,7)+15/pow(16.0,8)+15/pow(16.0,9)+15/pow(16.0,10)+15/pow(16.0,11)+15/pow(16.0,12)+15/pow(16.0,13))*pow(2.0,1023) #define STATA_DOUBLE_NA pow(2.0,1023) #define STATA_BYTE 65530 #define STATA_SHORTINT 65529 #define STATA_INT 65528 #define STATA_FLOAT 65527 #define STATA_DOUBLE 65526 #define STATA_STR 2045 #define STATA_SHORT_STR 244 #define STATA_STRL 32768 #endif readstata13/tests/0000755000176200001440000000000013077075424013541 5ustar liggesusersreadstata13/tests/testthat.R0000644000176200001440000000010213077075424015515 0ustar liggesuserslibrary(testthat) library(readstata13) test_check("readstata13") readstata13/tests/testthat/0000755000176200001440000000000013302354454015373 5ustar liggesusersreadstata13/tests/testthat/test_read.R0000644000176200001440000001133513302177220017465 0ustar liggesuserslibrary(readstata13) context("Reading datasets") datacompare <- function(x, y) { res <- unlist(Map(all.equal, x, y)) # with all(unlist(res)) if not TRUE, a warning is thrown res <- all(unlist(lapply(res, isTRUE))) res } #### missings #### # missings.do creates missings.dta # missings.dta contains variable missings containing ., .a, .b, ..., .z # # Note: prior Stata 8 there was only a single missing value dd <- data.frame(missings = as.numeric(rep(NA, 27))) missings <- system.file("extdata", "missings.dta", package="readstata13") dd118 <- read.dta13(missings, missing.type = FALSE) dd118_m <- read.dta13(missings, missing.type = TRUE) mvals <- attr(dd118_m, "missing")$missings test_that("missings", { expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd118_m)) expect_identical(mvals, as.numeric(0:26)) }) # rm(list = files) #### missings msf/lsf #### dd <- data.frame(b = as.logical(c(1,NA)), i=as.integer(c(1,NA)), n=as.numeric(c(1,NA)), s=c("1", ""), stringsAsFactors = FALSE) dd$b <- as.integer(dd$b) missings_msf <- system.file("extdata", "missings_msf.dta", package="readstata13") missings_lsf <- system.file("extdata", "missings_lsf.dta", package="readstata13") dd_msf <- read.dta13(missings_msf) dd_lsf <- read.dta13(missings_lsf) test_that("missings msf/lsf", { expect_true(datacompare(dd, dd_msf)) expect_true(datacompare(dd, dd_lsf)) }) #### generate factors TRUE #### dd <- data.frame(v1 = as.numeric(1:2)) dd$v1 <- factor(x = dd$v1, levels = 1:2, labels = c("one", "2")) gen_fac <- system.file("extdata", "gen_fac.dta", package="readstata13") dd118 <- read.dta13(gen_fac, convert.factors = TRUE, generate.factors = TRUE) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) # rm(list = files) #### generate factors FALSE #### dd <- data.frame(v1 = as.numeric(1:2)) gen_fac <- system.file("extdata", "gen_fac.dta", package="readstata13") suppressWarnings(dd118 <- read.dta13(gen_fac, convert.factors = TRUE, generate.factors = FALSE)) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### convert.underscore = TRUE #### dd <- data.frame(v.1 = as.numeric(1:2), v.2 = as.numeric(1:2), long.name.multiple.underscores = as.numeric(1:2)) underscore <- system.file("extdata", "underscore.dta", package="readstata13") dd118 <- read.dta13(underscore, convert.underscore = T) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### convert.underscore = FALSE #### dd <- data.frame(v.1 = as.numeric(1:2), v.2 = as.numeric(1:2), long_name_multiple_underscores = as.numeric(1:2)) underscore <- system.file("extdata", "underscore.dta", package="readstata13") dd118 <- read.dta13(underscore, convert.underscore = F) test_that("generate.factors TRUE", { expect_true(datacompare(dd, dd118)) }) #### noint.factors TRUE #### dd <- data.frame(v1 = as.numeric(1:2)) dd$v1 <- factor(x = dd$v1, levels = 1:2, labels = c("one", "1.2")) nonint <- system.file("extdata", "nonint.dta", package="readstata13") dd118 <- read.dta13(nonint, convert.factors = TRUE, generate.factors = TRUE, nonint.factors = TRUE) test_that("nonint.factors TRUE", { expect_true(datacompare(dd, dd118)) }) # rm(list = files) #### encoding TRUE #### umlauts <- c("ä","ö","ü","ß","€","Œ") Encoding(umlauts) <- "UTF-8" ddcp <- dd <- data.frame(num = factor(1:6, levels = 1:6, labels = umlauts), chr = umlauts, stringsAsFactors = FALSE) # Dataset in CP1252 levels(ddcp$num)[5:6] <- c("EUR","OE") ddcp$chr[5:6] <- c("EUR","OE") # Stata 14 encode <- system.file("extdata", "encode.dta", package="readstata13") # Stata 12 encodecp <- system.file("extdata", "encodecp.dta", package="readstata13") ddutf_aE <- read.dta13(encode, convert.factors = TRUE, generate.factors = TRUE, encoding="UTF-8") # On windows the last two characters will fail on default (not in latin1) dd_aE <- read.dta13(encode, convert.factors = TRUE, generate.factors = TRUE) ddcp_aE <- read.dta13(encodecp, convert.factors = TRUE, generate.factors = TRUE) test_that("encoding CP1252", { expect_true(datacompare(ddcp, ddcp_aE)) }) test_that("encoding UTF-8 (Stata 14)", { expect_true(datacompare(dd$chr[1:4], dd_aE$chr[1:4])) expect_true(datacompare(dd, ddutf_aE)) }) test_that("Reading of strls", { strl <- system.file("extdata", "statacar.dta", package="readstata13") ddstrlf <- read.dta13(strl, replace.strl = F) ddstrlfref <- paste0("11_", 1:8) expect_equal(ddstrlf$modelStrL, ddstrlfref) ddstrl <- read.dta13(strl, replace.strl = T) expect_equal(ddstrl$model, ddstrl$modelStrL) }) readstata13/tests/testthat/test_save.R0000644000176200001440000011230713302177220017511 0ustar liggesuserslibrary(readstata13) context("Saving datasets") # ToDo: Fix this. # load(system.file("extdata/statacar.RData", package="readstata13")) # # saveandload <- function(x, ...) { # file <- tempfile(pattern="readstata13_", fileext=".dta") # save.dta13(x, file=file, ...) # all(unlist(Map(identical, x, read.dta13(file)))) # } # # test_that("Saved file is identical: Version 118", { # expect_true(saveandload(statacar, version="118", convert.factors=T)) # }) datacompare <- function(x, y) { all(unlist(Map(all.equal, x, y))) } namescompare <- function(x, y){ all(identical(names(x), names(y))) } files <- c("dd118", "dd117", "dd115", "dd114", "dd113", "dd112", "dd111", "dd110", "dd108", "dd107", "dd106", "dd105", "dd104", "dd103", "dd102", "dd") data(mtcars) #### version #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_15mp.dta", version = "15mp") save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd15mp<- read.dta13("data/dta_15mp.dta") dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("version", { expect_true(datacompare(dd, dd15mp)) expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### compress #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119, compress = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, compress = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, compress = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, compress = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, compress = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, compress = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, compress = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, compress = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, compress = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, compress = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, compress = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, compress = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, compress = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, compress = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, compress = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, compress = TRUE) dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("compress", { expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.factors TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.factors = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.factors = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.factors = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.factors = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.factors = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.factors = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = TRUE) # save.dta13(dd, "data/dta_106.dta", version = 106, convert.factors = TRUE) # save.dta13(dd, "data/dta_105.dta", version = 105, convert.factors = TRUE) # save.dta13(dd, "data/dta_104.dta", version = 104, convert.factors = TRUE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = TRUE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = TRUE) dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") no factors # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") # rm -r unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) no factors # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.factors FALSE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars dd$am <- factor(x = dd$am, levels = c(0,1), labels = c("auto", "man")) save.dta13(dd, "data/dta_119.dta", version = 119, convert.factors = FALSE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.factors = FALSE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.factors = FALSE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.factors = FALSE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.factors = FALSE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.factors = FALSE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.factors = FALSE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.factors = FALSE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.factors = FALSE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.factors = FALSE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.factors = FALSE) # save.dta13(dd, "data/dta_106.dta", version = 106, convert.factors = FALSE) # save.dta13(dd, "data/dta_105.dta", version = 105, convert.factors = FALSE) # no factors | expect_warning ? # save.dta13(dd, "data/dta_104.dta", version = 104, convert.factors = FALSE) # save.dta13(dd, "data/dta_103.dta", version = 103, convert.factors = FALSE) # save.dta13(dd, "data/dta_102.dta", version = 102, convert.factors = FALSE) dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") no factors | expect_warning ? # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") # add one (because of stupid factor) dd <- mtcars dd$am <- dd$am + 1 # rm -r unlink("data", recursive = TRUE) test_that("convert.factors TRUE", { expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) no factors # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### add rownames TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119, add.rownames = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, add.rownames = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, add.rownames = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, add.rownames = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, add.rownames = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, add.rownames = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, add.rownames = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, add.rownames = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, add.rownames = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, add.rownames = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, add.rownames = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, add.rownames = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, add.rownames = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, add.rownames = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, add.rownames = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, add.rownames = TRUE) dd119 <- read.dta13("data/dta_119.dta", add.rownames = TRUE) dd118 <- read.dta13("data/dta_118.dta", add.rownames = TRUE) dd117 <- read.dta13("data/dta_117.dta", add.rownames = TRUE) dd115 <- read.dta13("data/dta_115.dta", add.rownames = TRUE) dd114 <- read.dta13("data/dta_114.dta", add.rownames = TRUE) dd113 <- read.dta13("data/dta_113.dta", add.rownames = TRUE) dd112 <- read.dta13("data/dta_112.dta", add.rownames = TRUE) dd111 <- read.dta13("data/dta_111.dta", add.rownames = TRUE) dd110 <- read.dta13("data/dta_110.dta", add.rownames = TRUE) dd108 <- read.dta13("data/dta_108.dta", add.rownames = TRUE) dd107 <- read.dta13("data/dta_107.dta", add.rownames = TRUE) dd106 <- read.dta13("data/dta_106.dta", add.rownames = TRUE) dd105 <- read.dta13("data/dta_105.dta", add.rownames = TRUE) dd104 <- read.dta13("data/dta_104.dta", add.rownames = TRUE) dd103 <- read.dta13("data/dta_103.dta", add.rownames = TRUE) dd102 <- read.dta13("data/dta_102.dta", add.rownames = TRUE) # rm -r unlink("data", recursive = TRUE) test_that("add.rownames TRUE", { # Check that rownames are identical expect_true(identical(rownames(dd), rownames(dd119))) expect_true(identical(rownames(dd), rownames(dd118))) expect_true(identical(rownames(dd), rownames(dd117))) expect_true(identical(rownames(dd), rownames(dd115))) expect_true(identical(rownames(dd), rownames(dd114))) expect_true(identical(rownames(dd), rownames(dd113))) expect_true(identical(rownames(dd), rownames(dd112))) expect_true(identical(rownames(dd), rownames(dd111))) expect_true(identical(rownames(dd), rownames(dd110))) expect_true(identical(rownames(dd), rownames(dd108))) expect_true(identical(rownames(dd), rownames(dd107))) expect_true(identical(rownames(dd), rownames(dd106))) expect_true(identical(rownames(dd), rownames(dd105))) expect_true(identical(rownames(dd), rownames(dd104))) expect_true(identical(rownames(dd), rownames(dd103))) expect_true(identical(rownames(dd), rownames(dd102))) # Check that data is identical expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### data label TRUE #### dl <- "mtcars data file" if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119, data.label = dl) save.dta13(dd, "data/dta_118.dta", version = 118, data.label = dl) save.dta13(dd, "data/dta_117.dta", version = 117, data.label = dl) save.dta13(dd, "data/dta_115.dta", version = 115, data.label = dl) save.dta13(dd, "data/dta_114.dta", version = 114, data.label = dl) save.dta13(dd, "data/dta_113.dta", version = 113, data.label = dl) save.dta13(dd, "data/dta_112.dta", version = 112, data.label = dl) save.dta13(dd, "data/dta_111.dta", version = 111, data.label = dl) save.dta13(dd, "data/dta_110.dta", version = 110, data.label = dl) save.dta13(dd, "data/dta_108.dta", version = 108, data.label = dl) save.dta13(dd, "data/dta_107.dta", version = 107, data.label = dl) save.dta13(dd, "data/dta_106.dta", version = 106, data.label = dl) save.dta13(dd, "data/dta_105.dta", version = 105, data.label = dl) save.dta13(dd, "data/dta_104.dta", version = 104, data.label = dl) save.dta13(dd, "data/dta_103.dta", version = 103, data.label = dl) # save.dta13(dd, "data/dta_102.dta", version = 102, data.label = dl) # no data label dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("data label", { # Check that rownames are identical expect_equal(dl, attr(dd119, "datalabel")) expect_equal(dl, attr(dd118, "datalabel")) expect_equal(dl, attr(dd117, "datalabel")) expect_equal(dl, attr(dd115, "datalabel")) expect_equal(dl, attr(dd114, "datalabel")) expect_equal(dl, attr(dd113, "datalabel")) expect_equal(dl, attr(dd112, "datalabel")) expect_equal(dl, attr(dd111, "datalabel")) expect_equal(dl, attr(dd110, "datalabel")) expect_equal(dl, attr(dd108, "datalabel")) expect_equal(dl, attr(dd107, "datalabel")) expect_equal(dl, attr(dd106, "datalabel")) expect_equal(dl, attr(dd105, "datalabel")) expect_equal(dl, attr(dd104, "datalabel")) expect_equal(dl, attr(dd103, "datalabel")) # expect_equal(dl, attr(dd102, "datalabel")) }) # rm(list = files) #### convert dates TRUE #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- data.frame( dat = Sys.Date() ) save.dta13(dd, "data/dta_119.dta", version = 119, convert.dates = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.dates = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.dates = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.dates = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.dates = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.dates = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.dates = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.dates = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.dates = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.dates = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.dates = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, convert.dates = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, convert.dates = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, convert.dates = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.dates = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.dates = TRUE) dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("convert.dates TRUE", { # Check that rownames are identical expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### strl save #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") # strLs can be of length any length up to 2 billion characters. Starting with # 2046 a string is handled as a strL dd <- data.frame( dat = c(paste(replicate(2046, "a"), collapse = ""), paste(replicate(2046, "b"), collapse = "")), stringsAsFactors = FALSE) save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) # save.dta13(dd, "data/dta_115.dta", version = 115) # no strl # save.dta13(dd, "data/dta_114.dta", version = 114) # save.dta13(dd, "data/dta_113.dta", version = 113) # save.dta13(dd, "data/dta_112.dta", version = 112) # save.dta13(dd, "data/dta_111.dta", version = 111) # save.dta13(dd, "data/dta_110.dta", version = 110) # save.dta13(dd, "data/dta_108.dta", version = 108) # save.dta13(dd, "data/dta_107.dta", version = 107) # save.dta13(dd, "data/dta_106.dta", version = 106) # save.dta13(dd, "data/dta_105.dta", version = 105) # save.dta13(dd, "data/dta_104.dta", version = 104) # save.dta13(dd, "data/dta_103.dta", version = 103) # save.dta13(dd, "data/dta_102.dta", version = 102) dd119 <- read.dta13("data/dta_119.dta", replace.strl = TRUE) dd118 <- read.dta13("data/dta_118.dta", replace.strl = TRUE) dd117 <- read.dta13("data/dta_117.dta", replace.strl = TRUE) # dd115 <- read.dta13("data/dta_115.dta") # dd114 <- read.dta13("data/dta_114.dta") # dd113 <- read.dta13("data/dta_113.dta") # dd112 <- read.dta13("data/dta_112.dta") # dd111 <- read.dta13("data/dta_111.dta") # dd110 <- read.dta13("data/dta_110.dta") # dd108 <- read.dta13("data/dta_108.dta") # dd107 <- read.dta13("data/dta_107.dta") # dd106 <- read.dta13("data/dta_106.dta") # dd105 <- read.dta13("data/dta_105.dta") # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("replace.strl TRUE", { # Check that rownames are identical expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) # expect_true(datacompare(dd, dd115)) # expect_true(datacompare(dd, dd114)) # expect_true(datacompare(dd, dd113)) # expect_true(datacompare(dd, dd112)) # expect_true(datacompare(dd, dd111)) # expect_true(datacompare(dd, dd110)) # expect_true(datacompare(dd, dd108)) # expect_true(datacompare(dd, dd107)) # expect_true(datacompare(dd, dd106)) # expect_true(datacompare(dd, dd105)) # expect_true(datacompare(dd, dd104)) # expect_true(datacompare(dd, dd103)) # expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### convert.underscore save #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- data.frame(x.1 = 1) save.dta13(dd, "data/dta_119.dta", version = 119, convert.underscore = TRUE) save.dta13(dd, "data/dta_118.dta", version = 118, convert.underscore = TRUE) save.dta13(dd, "data/dta_117.dta", version = 117, convert.underscore = TRUE) save.dta13(dd, "data/dta_115.dta", version = 115, convert.underscore = TRUE) save.dta13(dd, "data/dta_114.dta", version = 114, convert.underscore = TRUE) save.dta13(dd, "data/dta_113.dta", version = 113, convert.underscore = TRUE) save.dta13(dd, "data/dta_112.dta", version = 112, convert.underscore = TRUE) save.dta13(dd, "data/dta_111.dta", version = 111, convert.underscore = TRUE) save.dta13(dd, "data/dta_110.dta", version = 110, convert.underscore = TRUE) save.dta13(dd, "data/dta_108.dta", version = 108, convert.underscore = TRUE) save.dta13(dd, "data/dta_107.dta", version = 107, convert.underscore = TRUE) save.dta13(dd, "data/dta_106.dta", version = 106, convert.underscore = TRUE) save.dta13(dd, "data/dta_105.dta", version = 105, convert.underscore = TRUE) save.dta13(dd, "data/dta_104.dta", version = 104, convert.underscore = TRUE) save.dta13(dd, "data/dta_103.dta", version = 103, convert.underscore = TRUE) save.dta13(dd, "data/dta_102.dta", version = 102, convert.underscore = TRUE) dd119 <- read.dta13("data/dta_119.dta") dd118 <- read.dta13("data/dta_118.dta") dd117 <- read.dta13("data/dta_117.dta") dd115 <- read.dta13("data/dta_115.dta") dd114 <- read.dta13("data/dta_114.dta") dd113 <- read.dta13("data/dta_113.dta") dd112 <- read.dta13("data/dta_112.dta") dd111 <- read.dta13("data/dta_111.dta") dd110 <- read.dta13("data/dta_110.dta") dd108 <- read.dta13("data/dta_108.dta") dd107 <- read.dta13("data/dta_107.dta") dd106 <- read.dta13("data/dta_106.dta") dd105 <- read.dta13("data/dta_105.dta") dd104 <- read.dta13("data/dta_104.dta") dd103 <- read.dta13("data/dta_103.dta") dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) names(dd) <- "x_1" test_that("convert.underscore TRUE", { # check numerics expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) # check names expect_true(namescompare(dd, dd119)) expect_true(namescompare(dd, dd118)) expect_true(namescompare(dd, dd117)) expect_true(namescompare(dd, dd115)) expect_true(namescompare(dd, dd114)) expect_true(namescompare(dd, dd113)) expect_true(namescompare(dd, dd112)) expect_true(namescompare(dd, dd111)) expect_true(namescompare(dd, dd110)) expect_true(namescompare(dd, dd108)) expect_true(namescompare(dd, dd107)) expect_true(namescompare(dd, dd106)) expect_true(namescompare(dd, dd105)) expect_true(namescompare(dd, dd104)) expect_true(namescompare(dd, dd103)) expect_true(namescompare(dd, dd102)) }) # rm(list = files) #### select.rows #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd119 <- read.dta13("data/dta_119.dta", select.rows = 5) dd118 <- read.dta13("data/dta_118.dta", select.rows = 5) dd117 <- read.dta13("data/dta_117.dta", select.rows = 5) dd115 <- read.dta13("data/dta_115.dta", select.rows = 5) dd114 <- read.dta13("data/dta_114.dta", select.rows = 5) dd113 <- read.dta13("data/dta_113.dta", select.rows = 5) dd112 <- read.dta13("data/dta_112.dta", select.rows = 5) dd111 <- read.dta13("data/dta_111.dta", select.rows = 5) dd110 <- read.dta13("data/dta_110.dta", select.rows = 5) dd108 <- read.dta13("data/dta_108.dta", select.rows = 5) dd107 <- read.dta13("data/dta_107.dta", select.rows = 5) dd106 <- read.dta13("data/dta_106.dta", select.rows = 5) dd105 <- read.dta13("data/dta_105.dta", select.rows = 5) dd104 <- read.dta13("data/dta_104.dta", select.rows = 5) dd103 <- read.dta13("data/dta_103.dta", select.rows = 5) dd102 <- read.dta13("data/dta_102.dta", select.rows = 5) unlink("data", recursive = TRUE) dd <- dd[1:5,] test_that("select.rows = 5", { # check numerics expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd119 <- read.dta13("data/dta_119.dta", select.rows = c(5,10)) dd118 <- read.dta13("data/dta_118.dta", select.rows = c(5,10)) dd117 <- read.dta13("data/dta_117.dta", select.rows = c(5,10)) dd115 <- read.dta13("data/dta_115.dta", select.rows = c(5,10)) dd114 <- read.dta13("data/dta_114.dta", select.rows = c(5,10)) dd113 <- read.dta13("data/dta_113.dta", select.rows = c(5,10)) dd112 <- read.dta13("data/dta_112.dta", select.rows = c(5,10)) dd111 <- read.dta13("data/dta_111.dta", select.rows = c(5,10)) dd110 <- read.dta13("data/dta_110.dta", select.rows = c(5,10)) dd108 <- read.dta13("data/dta_108.dta", select.rows = c(5,10)) dd107 <- read.dta13("data/dta_107.dta", select.rows = c(5,10)) dd106 <- read.dta13("data/dta_106.dta", select.rows = c(5,10)) dd105 <- read.dta13("data/dta_105.dta", select.rows = c(5,10)) dd104 <- read.dta13("data/dta_104.dta", select.rows = c(5,10)) dd103 <- read.dta13("data/dta_103.dta", select.rows = c(5,10)) dd102 <- read.dta13("data/dta_102.dta", select.rows = c(5,10)) unlink("data", recursive = TRUE) dd <- dd[5:10,] test_that("select.rows = c(5,10)", { # check numerics expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### select.cols #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) save.dta13(dd, "data/dta_104.dta", version = 104) save.dta13(dd, "data/dta_103.dta", version = 103) save.dta13(dd, "data/dta_102.dta", version = 102) dd119 <- read.dta13("data/dta_119.dta", select.cols = c("disp", "drat")) dd118 <- read.dta13("data/dta_118.dta", select.cols = c("disp", "drat")) dd117 <- read.dta13("data/dta_117.dta", select.cols = c("disp", "drat")) dd115 <- read.dta13("data/dta_115.dta", select.cols = c("disp", "drat")) dd114 <- read.dta13("data/dta_114.dta", select.cols = c("disp", "drat")) dd113 <- read.dta13("data/dta_113.dta", select.cols = c("disp", "drat")) dd112 <- read.dta13("data/dta_112.dta", select.cols = c("disp", "drat")) dd111 <- read.dta13("data/dta_111.dta", select.cols = c("disp", "drat")) dd110 <- read.dta13("data/dta_110.dta", select.cols = c("disp", "drat")) dd108 <- read.dta13("data/dta_108.dta", select.cols = c("disp", "drat")) dd107 <- read.dta13("data/dta_107.dta", select.cols = c("disp", "drat")) dd106 <- read.dta13("data/dta_106.dta", select.cols = c("disp", "drat")) dd105 <- read.dta13("data/dta_105.dta", select.cols = c("disp", "drat")) dd104 <- read.dta13("data/dta_104.dta", select.cols = c("disp", "drat")) dd103 <- read.dta13("data/dta_103.dta", select.cols = c("disp", "drat")) dd102 <- read.dta13("data/dta_102.dta", select.cols = c("disp", "drat")) unlink("data", recursive = TRUE) dd <- dd[,c("disp", "drat")] test_that("select.cols = c('disp', 'drat')", { # check numerics expect_true(datacompare(dd, dd119)) expect_true(datacompare(dd, dd118)) expect_true(datacompare(dd, dd117)) expect_true(datacompare(dd, dd115)) expect_true(datacompare(dd, dd114)) expect_true(datacompare(dd, dd113)) expect_true(datacompare(dd, dd112)) expect_true(datacompare(dd, dd111)) expect_true(datacompare(dd, dd110)) expect_true(datacompare(dd, dd108)) expect_true(datacompare(dd, dd107)) expect_true(datacompare(dd, dd106)) expect_true(datacompare(dd, dd105)) expect_true(datacompare(dd, dd104)) expect_true(datacompare(dd, dd103)) expect_true(datacompare(dd, dd102)) }) # rm(list = files) #### expansion.fields #### if (readstata13:::dir.exists13("data")) unlink("data", recursive = TRUE) dir.create("data") dd <- mtcars # create expansion.fields: In stata use command notes: They are constructed as # follows: # # 1. on what is the note : can be _dta or a variable name # 2. string "note" + number of note # 3. the note # initializiation of a one line note on a dta-file is done using: Ordering does # not matter: # # line1: _dta note0 1 # # line2: _dta note1 a note attached to the dta ef <- list( c("_dta", "note1", "note written in R"), c("_dta", "note0", "1"), c("mpg", "note1", "Miles/(US) gallon"), c("mpg", "note0", "1") ) attr(dd, "expansion.fields") <- ef save.dta13(dd, "data/dta_119.dta", version = 119) save.dta13(dd, "data/dta_118.dta", version = 118) save.dta13(dd, "data/dta_117.dta", version = 117) save.dta13(dd, "data/dta_115.dta", version = 115) save.dta13(dd, "data/dta_114.dta", version = 114) save.dta13(dd, "data/dta_113.dta", version = 113) save.dta13(dd, "data/dta_112.dta", version = 112) save.dta13(dd, "data/dta_111.dta", version = 111) save.dta13(dd, "data/dta_110.dta", version = 110) save.dta13(dd, "data/dta_108.dta", version = 108) save.dta13(dd, "data/dta_107.dta", version = 107) save.dta13(dd, "data/dta_106.dta", version = 106) save.dta13(dd, "data/dta_105.dta", version = 105) # save.dta13(dd, "data/dta_104.dta", version = 104) # save.dta13(dd, "data/dta_103.dta", version = 103) # save.dta13(dd, "data/dta_102.dta", version = 102) dd119 <- attr(read.dta13("data/dta_119.dta"), "expansion.fields") dd118 <- attr(read.dta13("data/dta_118.dta"), "expansion.fields") dd117 <- attr(read.dta13("data/dta_117.dta"), "expansion.fields") dd115 <- attr(read.dta13("data/dta_115.dta"), "expansion.fields") dd114 <- attr(read.dta13("data/dta_114.dta"), "expansion.fields") dd113 <- attr(read.dta13("data/dta_113.dta"), "expansion.fields") dd112 <- attr(read.dta13("data/dta_112.dta"), "expansion.fields") dd111 <- attr(read.dta13("data/dta_111.dta"), "expansion.fields") dd110 <- attr(read.dta13("data/dta_110.dta"), "expansion.fields") dd108 <- attr(read.dta13("data/dta_108.dta"), "expansion.fields") dd107 <- attr(read.dta13("data/dta_107.dta"), "expansion.fields") dd106 <- attr(read.dta13("data/dta_106.dta"), "expansion.fields") dd105 <- attr(read.dta13("data/dta_105.dta"), "expansion.fields") # dd104 <- read.dta13("data/dta_104.dta") # dd103 <- read.dta13("data/dta_103.dta") # dd102 <- read.dta13("data/dta_102.dta") unlink("data", recursive = TRUE) test_that("expansinon.fields", { # check numerics expect_equal(ef, dd119) expect_equal(ef, dd118) expect_equal(ef, dd117) expect_equal(ef, dd115) expect_equal(ef, dd114) expect_equal(ef, dd113) expect_equal(ef, dd112) expect_equal(ef, dd111) expect_equal(ef, dd110) expect_equal(ef, dd108) expect_equal(ef, dd107) expect_equal(ef, dd106) expect_equal(ef, dd105) # expect_equal(ef, dd104) # expect_equal(ef, dd103) # expect_equal(ef, dd102) }) readstata13/src/0000755000176200001440000000000013302234744013157 5ustar liggesusersreadstata13/src/read.cpp0000644000176200001440000000364213302234744014603 0ustar liggesusers/* * Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include using namespace Rcpp; // Reads the binary Stata file // // @param filePath The full systempath to the dta file you want to import. // @param missing logical if missings should be converted outside of Rcpp. // @import Rcpp // @export // [[Rcpp::export]] List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols, const bool strlexport, const CharacterVector strlpath) { FILE *file = NULL; // File pointer /* * Open the file in binary mode using the "rb" format string * This also checks if the file exists and/or can be opened for reading * correctly */ if ((file = fopen(filePath, "rb")) == NULL) throw std::range_error("Could not open specified file."); /* * check the first byte. */ std::string fbit(1, '\0'); readstring(fbit, file, fbit.size()); std::string expfbit = "<"; // create df List df(0); if (fbit.compare(expfbit) == 0) df = read_dta(file, missing, selectrows, selectcols, strlexport, strlpath); else df = read_pre13_dta(file, missing, selectrows, selectcols); fclose(file); return df; } readstata13/src/Makevars0000644000176200001440000000016113302234744014651 0ustar liggesusers## -*- mode: makefile; -*- PKG_CPPFLAGS = -I../inst/include -I. PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) readstata13/src/read_data.cpp0000644000176200001440000001362613302234744015577 0ustar liggesusers/* * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" using namespace Rcpp; using namespace std; List read_data(FILE * file, const IntegerVector vartype_kk, const bool missing, const int8_t release, const uint64_t nn, uint32_t kk, const IntegerVector vartype_sj, const std::string byteorder, const bool swapit) { // 1. create the list List df(kk); for (uint32_t i=0; i0) & (type < 2046)) ? STATA_STR : type) { // double case STATA_DOUBLE: { double val_d = 0; val_d = readbin(val_d, file, swapit); if ((missing == 0) && !(val_d == R_NegInf) && ((val_dSTATA_DOUBLE_NA_MAX)) ) REAL(VECTOR_ELT(df,ii))[j] = NA_REAL; else REAL(VECTOR_ELT(df,ii))[j] = val_d; break; } // float case STATA_FLOAT: { float val_f = 0; val_f = readbin(val_f, file, swapit); if ((missing == 0) && ((val_fSTATA_FLOAT_NA_MAX)) ) REAL(VECTOR_ELT(df,ii))[j] = NA_REAL; else REAL(VECTOR_ELT(df,ii))[j] = val_f; break; } // long case STATA_INT: { int32_t val_l = 0; val_l = readbin(val_l, file, swapit); if ((missing == 0) && ((val_lSTATA_INT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,ii))[j] = val_l; break; } // int case STATA_SHORTINT: { int16_t val_i = 0; val_i = readbin(val_i, file, swapit); if ((missing == 0) && ((val_iSTATA_SHORTINT_NA_MAX)) ) INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,ii))[j] = val_i; break; } // byte case STATA_BYTE: { int8_t val_b = 0; val_b = readbin(val_b, file, swapit); if (missing == 0 && ( (val_bSTATA_BYTE_NA_MAX)) ) INTEGER(VECTOR_ELT(df,ii))[j] = NA_INTEGER; else INTEGER(VECTOR_ELT(df,ii))[j] = val_b; break; } // strings with 2045 or fewer characters case STATA_STR: { int32_t len = 0; len = vartype_sj[i]; std::string val_s (len, '\0'); readstring(val_s, file, val_s.size()); as(df[ii])[j] = val_s; break; } // string of any length case STATA_STRL: {// strL 2*4bit or 2 + 6 bit // FixMe: Strl in 118 switch (release) { case 117: { uint32_t v = 0, o = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); stringstream val_stream; val_stream << v << '_' << o; string val_strl = val_stream.str(); as(df[ii])[j] = val_strl; break; } case 118: { int16_t v = 0; int64_t o = 0, z = 0; z = readbin(z, file, swapit); // works for LSF on little- and big-endian if (byteorder.compare("LSF")==0) { v = (int16_t)z; o = (z >> 16); } // works if we read a big-endian file on little-endian if (byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 16) - 1); o = z & ((1 << 16) - 1); } stringstream val_stream; val_stream << v << '_' << o; string val_strl = val_stream.str(); as(df[ii])[j] = val_strl; break; } case 119: { int32_t v = 0; int64_t o = 0, z = 0; z = readbin(z, file, swapit); // works for LSF on little- and big-endian if (byteorder.compare("LSF")==0) { v = (int32_t)z & ((1 << 24) - 1); o = (z >> 24); } // FixMe: works if we read a big-endian file on little-endian if (byteorder.compare("MSF")==0) { v = (z >> 48) & ((1 << 24) - 1); o = z & ((1 << 24) - 1); } stringstream val_stream; val_stream << v << '_' << o; string val_strl = val_stream.str(); as(df[ii])[j] = val_strl; break; } } break; } // case < 0: default: { // skip to the next valid case fseeko64(file, abs(type), SEEK_CUR); break; } } if (type >= 0) ii += 1; checkUserInterrupt(); } } return(df); } readstata13/src/save_dta.cpp0000644000176200001440000004325113302234744015456 0ustar liggesusers/* * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include using namespace Rcpp; using namespace std; // Writes the binary Stata file // // @param filePath The full systempath to the dta file you want to export. // @param dat an R-Object of class data.frame. // @export // [[Rcpp::export]] int stata_save(const char * filePath, Rcpp::DataFrame dat) { uint32_t k = dat.size(); uint64_t n = dat.nrows(); const string timestamp = dat.attr("timestamp"); string datalabel = dat.attr("datalabel"); datalabel[datalabel.size()] = '\0'; CharacterVector valLabels = dat.attr("vallabels"); CharacterVector nvarnames = dat.attr("names"); List chs = dat.attr("expansion.fields"); List formats = dat.attr("formats"); List labeltable = dat.attr("label.table"); List varLabels = dat.attr("var.labels"); List vartypes = dat.attr("types"); const string version = dat.attr("version"); uint8_t const release = atoi(version.c_str()); uint8_t nformatslen = 0, ntimestamp = 0; uint16_t nvarnameslen = 0, nvarLabelslen = 0, nvalLabelslen = 0, ndlabel = 0, lbllen = 0; uint32_t chlen = 0, maxdatalabelsize = 0, maxlabelsize = 32000; switch (release) { case 117: nvarnameslen = 33; nformatslen = 49; nvalLabelslen = 33; nvarLabelslen = 81; maxdatalabelsize = 80; chlen = 33; lbllen = 33; break; case 118: case 119: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; nvarLabelslen = 321; maxdatalabelsize = 320; // in utf8 4 * 80 byte chlen = 129; lbllen = 129; break; } const string head = "
"; const string byteord = ""; const string K = ""; const string num = ""; const string lab = ""; const string endheader = "
"; const string startmap = ""; const string endmap = ""; const string startvart = ""; const string endvart = ""; const string startvarn = ""; const string endvarn = ""; const string startsor = ""; const string endsor = ""; const string startform = ""; const string endform = ""; const string startvalLabel = ""; const string endvalLabel = ""; const string startvarlabel= ""; const string endvarlabel= ""; const string startcharacteristics = ""; const string endcharacteristics = ""; const string startch = ""; const string endch = ""; const string startdata = ""; const string enddata = ""; const string startstrl = ""; const string endstrl = ""; const string startvall = ""; const string endvall = ""; const string startlbl = ""; const string endlbl = ""; string end = "
"; end[end.size()] = '\0'; fstream dta (filePath, ios::out | ios::binary); if (dta.is_open()) { /* Stata 13 uses to store 14 byte positions in a dta-file. This * vector is now created and filled with the correct map positions. At * the end of the creation process, all 14 values are known and map will * be filled with the correct values. */ NumericVector map(14); map(0) = dta.tellg(); writestr(head, head.size(), dta); writestr(version, 3, dta); // 117|118 (e.g. Stata 13|14) writestr(byteord, byteord.size(), dta); writestr(sbyteorder, 3, dta); // LSF writestr(K, K.size(), dta); if (release < 119) writebin((int16_t)k, dta, swapit); if (release == 119) writebin(k, dta, swapit); writestr(num, num.size(), dta); if (release == 117) writebin((int32_t)n, dta, swapit); if ((release == 118) | (release == 119)) writebin(n, dta, swapit); writestr(lab, lab.size(), dta); /* write a datalabel */ if (!datalabel.empty()) { if (datalabel.size() > maxdatalabelsize) { Rcpp::warning("Datalabel to long. Resizing. Max size is %d.", maxdatalabelsize); datalabel.resize(maxdatalabelsize); datalabel[datalabel.size()] = '\0'; } ndlabel = datalabel.size(); if (release == 117) writebin((uint8_t)ndlabel, dta, swapit); if ((release == 118) | (release == 119)) writebin(ndlabel, dta, swapit); writestr(datalabel,datalabel.size(), dta); } else { // empty data label defined by byte(s) of zero uint8_t zero = 0; if (release == 117) { writebin(zero, dta, swapit); } if ((release == 118) | (release == 119)) { writebin(zero, dta, swapit); writebin(zero, dta, swapit); } } /* timestamp size is 0 (= no timestamp) or 17 */ writestr(timest, timest.size(), dta); if (!timestamp.empty()) { ntimestamp = 17; writebin(ntimestamp, dta, swapit); writestr(timestamp, timestamp.size(), dta); }else{ writebin(ntimestamp, dta, swapit); } writestr(endheader, endheader.size(), dta); /* ... */ map(1) = dta.tellg(); writestr(startmap, startmap.size(), dta); for (int32_t i = 0; i <14; ++i) { uint64_t nmap = 0; writebin(nmap, dta, swapit); } writestr(endmap, endmap.size(), dta); /* ... */ map(2) = dta.tellg(); writestr(startvart, startvart.size(), dta); uint16_t nvartype; for (uint32_t i = 0; i < k; ++i) { nvartype = as(vartypes[i]); writebin(nvartype, dta, swapit); } writestr(endvart, endvart.size(), dta); /* ... */ map(3) = dta.tellg(); writestr(startvarn, startvarn.size(), dta); for (uint32_t i = 0; i < k; ++i ) { string nvarname = as(nvarnames[i]); nvarname[nvarname.size()] = '\0'; if (nvarname.size() > nvarnameslen) Rcpp::warning("Varname to long. Resizing. Max size is %d", nvarnameslen - 1); writestr(nvarname, nvarnameslen, dta); } writestr(endvarn, endvarn.size(), dta); /* ... */ map(4) = dta.tellg(); writestr(startsor, startsor.size(), dta); uint64_t big_k = k+1; for (uint64_t i = 0; i < big_k; ++i) { uint16_t nsortlist = 0; writebin(nsortlist, dta, swapit); } writestr(endsor, endsor.size(), dta); /* ... */ map(5) = dta.tellg(); writestr(startform, startform.size(), dta); for (uint32_t i = 0; i < k; ++i ) { string nformats = as(formats[i]); if (nformats.size() >= nformatslen) Rcpp::warning("Formats to long. Resizing. Max size is %d", nformatslen); writestr(nformats, nformatslen, dta); } writestr(endform, endform.size(), dta); /* ... */ map(6) = dta.tellg(); writestr(startvalLabel, startvalLabel.size(), dta); for (uint32_t i = 0; i < k; ++i) { string nvalLabels = as(valLabels[i]); nvalLabels[nvalLabels.size()] = '\0'; if (nvalLabels.size() > nvalLabelslen) Rcpp::warning("Vallabel to long. Resizing. Max size is %d", nvalLabelslen - 1); writestr(nvalLabels, nvalLabelslen, dta); } writestr(endvalLabel, endvalLabel.size(), dta); /* ... */ map(7) = dta.tellg(); writestr(startvarlabel, startvarlabel.size(), dta); for (uint32_t i = 0; i < k; ++i) { if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) { string nvarLabels = as(varLabels[i]); if (nvarLabels.size() > nvarLabelslen) Rcpp::warning("Varlabel to long. Resizing. Max size is %d", nvarLabelslen - 1); nvarLabels[nvarLabels.size()] = '\0'; writestr(nvarLabels, nvarLabelslen, dta); } else { string nvarLabels = ""; nvarLabels[nvarLabels.size()] = '\0'; writestr(nvarLabels, nvarLabelslen, dta); } } writestr(endvarlabel, endvarlabel.size(), dta); /* ... */ map(8) = dta.tellg(); writestr(startcharacteristics, startcharacteristics.size(), dta); /* ... */ if (chs.size()>0){ for (int32_t i = 0; i(chs[i]); string ch1 = as(ch[0]); ch1[ch1.size()] = '\0'; string ch2 = as(ch[1]); ch2[ch2.size()] = '\0'; string ch3 = as(ch[2]); ch3[ch3.size()] = '\0'; uint32_t nnocharacter = chlen*2 + ch3.size() +1; writebin(nnocharacter, dta, swapit); writestr(ch1, chlen, dta); writestr(ch2, chlen, dta); writestr(ch3,ch3.size()+1, dta); writestr(endch, endch.size(), dta); } } writestr(endcharacteristics, endcharacteristics.size(), dta); /* ... */ map(9) = dta.tellg(); writestr(startdata, startdata.size(), dta); IntegerVector V, O; CharacterVector STRL; for(uint64_t j = 0; j < n; ++j) { for (uint32_t i = 0; i < k; ++i) { int const type = vartypes[i]; switch(type < 2046 ? 2045 : type) { // store numeric as Stata double (double) case 65526: { double val_d = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | R_IsNA(val_d) | R_IsNaN(val_d) | std::isinf(val_d) ) val_d = STATA_DOUBLE_NA; writebin(val_d, dta, swapit); break; } // float case 65527: { double val_d = 0; float val_f = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | (R_IsNA(val_d)) | R_IsNaN(val_d) | std::isinf(val_d) ) val_f = STATA_FLOAT_NA; else val_f = (double)(val_d); writebin(val_f, dta, swapit); break; } // store integer as Stata long (int32_t) case 65528: { int32_t val_l = 0; val_l = as(dat[i])[j]; if ( (val_l == NA_INTEGER) | (R_IsNA(val_l)) | R_IsNaN(val_l) | std::isinf(val_l) ) val_l = STATA_INT_NA; writebin(val_l, dta, swapit); break; } // int case 65529: { int16_t val_i = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_i = STATA_SHORTINT_NA; else val_i = val_l; writebin(val_i, dta, swapit); break; } // byte case 65530: { int8_t val_b = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_b = STATA_BYTE_NA; else val_b = val_l; writebin(val_b, dta, swapit); break; } // str case 2045: { int32_t const len = vartypes[i]; string val_s = as(as(dat[i])[j]); if (val_s == "NA") val_s.clear(); writestr(val_s, len, dta); break; } // strL case 32768: { /* Stata uses +1 */ int64_t z = 0; CharacterVector b = as(dat[i]); const string val_strl = as(b[j]); if (!val_strl.empty()) { switch (release) { case 117: { uint32_t v = i+1, o = j+1; writebin(v, dta, swapit); writebin(o, dta, swapit); // push back every v, o and val_strl V.push_back(v); O.push_back(o); break; } case 118: { int16_t v = i+1; int64_t o = j+1; char z[8]; // push back every v, o and val_strl V.push_back(v); O.push_back(o); // z is 'vv-- ----' memcpy(&z[0], &v, sizeof(v)); if (SBYTEORDER == 1) { o <<= 16; } memcpy(&z[2], &o, 6); // z is 'vvoo oooo' dta.write((char*)&z, sizeof(z)); // writestr((char*)&z, sizeof(z), dta); break; } case 119: { int32_t v = i+1; int64_t o = j+1; char z[8]; // push back every v, o and val_strl V.push_back(v); O.push_back(o); // z is 'vv-- ----' memcpy(&z[0], &v, sizeof(v)); if (SBYTEORDER == 1) { o <<= 24; } memcpy(&z[3], &o, 5); // z is 'vvvo oooo' dta.write((char*)&z, sizeof(z)); // writestr((char*)&z, sizeof(z), dta); break; } } STRL.push_back(val_strl); } else { writestr((char*)&z, sizeof(z), dta); } break; } } } } writestr(enddata, enddata.size(), dta); /* ... */ map(10) = dta.tellg(); writestr(startstrl, startstrl.size(), dta); int32_t strlsize = STRL.length(); for(int i =0; i < strlsize; ++i ) { const string gso = "GSO"; int32_t v = V[i]; int64_t o = O[i]; uint8_t t = 129; //Stata binary type, no trailing zero. const string strL = as(STRL[i]); uint32_t len = strL.size(); writestr(gso, gso.size(), dta); writebin(v, dta, swapit); if (release == 117) writebin((uint32_t)o, dta, swapit); if ((release == 118) | (release == 119)) writebin(o, dta, swapit); writebin(t, dta, swapit); writebin(len, dta, swapit); writestr(strL, strL.size(), dta); } writestr(endstrl, endstrl.size(), dta); /* ... */ map(11) = dta.tellg(); writestr(startvall, startvall.size(), dta); if (labeltable.size()>0) { CharacterVector labnames = labeltable.attr("names"); int8_t padding = 0; for (int32_t i=0; i < labnames.size(); ++i) { int32_t txtlen = 0; const string labname = as(labnames[i]); IntegerVector labvalue = labeltable[labname]; int32_t N = labvalue.size(); CharacterVector labelText = labvalue.attr("names"); IntegerVector off; /* * Fill off with offset position and create txtlen */ for (int32_t i = 0; i < labelText.size(); ++i) { string label = as(labelText[i]); uint32_t labellen = label.size()+1; if (labellen > maxlabelsize+1) labellen = maxlabelsize+1; txtlen += labellen; off.push_back ( txtlen-labellen ); } int32_t offI, labvalueI; int32_t nlen = sizeof(N) + sizeof(txtlen) + sizeof(offI)*N + sizeof(labvalueI)*N + txtlen; writestr(startlbl, startlbl.size(), dta); writebin(nlen, dta, swapit); writestr(labname, lbllen, dta); writestr((char*)&padding, 3, dta); writebin(N, dta, swapit); writebin(txtlen, dta, swapit); for (int32_t i = 0; i < N; ++i) { offI = off[i]; writebin(offI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { labvalueI = labvalue[i]; writebin(labvalueI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { string labtext = as(labelText[i]); if (labtext.size() > maxlabelsize) { Rcpp::warning("Label to long. Resizing. Max size is %d", maxlabelsize); labtext.resize(maxlabelsize); // labtext[labtext.size()] = '\0'; } writestr(labtext, labtext.size()+1, dta); } writestr(endlbl, endlbl.size(), dta); } } writestr(endvall, endvall.size(), dta); /* */ map(12) = dta.tellg(); writestr(end, end.size(), dta); /* end-of-file */ map(13) = dta.tellg(); /* seek up to to rewrite it*/ /* ... */ dta.seekg(map(1)); writestr(startmap, startmap.size(), dta); for (int i=0; i <14; ++i) { uint64_t nmap = 0; uint32_t hi = 0, lo = 0; nmap = map(i); hi = (nmap >> 32); lo = nmap; if (SBYTEORDER == 2) { // LSF writebin(lo, dta, swapit); writebin(hi, dta, swapit); } else { // MSF writebin(hi, dta, swapit); writebin(lo, dta, swapit); } } writestr(endmap, endmap.size(), dta); dta.close(); return 0; } else { throw std::range_error("Unable to open file."); return -1; } } readstata13/src/read_pre13_dta.cpp0000644000176200001440000003303513302234744016444 0ustar liggesusers/* * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" #include "read_data.h" using namespace Rcpp; using namespace std; List read_pre13_dta(FILE * file, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols) { int8_t release = 0; rewind(file); release = readbin(release, file, 0); if (release<102 || release == 109 || release>115) stop("First byte: Not a dta-file we can read."); IntegerVector versionIV(1); versionIV(0) = release; /* * byteorder is a 4 byte character e.g. "LSF". MSF referes to big-endian. */ uint16_t ndlabel = 81; uint8_t nvarnameslen = 33; int8_t nformatslen = 49; uint8_t nvalLabelslen = 33; uint16_t nvarLabelslen = 81; int32_t chlen = 33; uint8_t lbllen = 33; switch(release) { case 102: ndlabel = 30; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 103: case 104: ndlabel = 32; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 105: case 106: chlen = 9; ndlabel = 32; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; nvarLabelslen = 32; lbllen = 9; break; case 107: case 108: chlen = 9; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; lbllen = 9; break; case 110: case 111: case 112: case 113: nformatslen = 12; break; } CharacterVector byteorderC(1); IntegerVector byteorderI(1); bool swapit = 0; int8_t byteorder_i = 0; byteorder_i = readbin(byteorder_i, file, 0); // 1 = MSF 2 = LSF swapit = std::abs(SBYTEORDER-byteorder_i); byteorderI(0) = byteorder_i; std::string byteorder(3, '\0'); if (byteorder_i == 1) byteorder = "MSF"; else byteorder = "LSF"; // filetype: unknown? int8_t ft = 0; ft = readbin(ft, file, swapit); int8_t unused = 0; unused = readbin(unused, file, swapit); /* * Number of Variables */ uint16_t k = 0; k = readbin(k, file, swapit); /* * Number of Observations */ uint32_t n = 0; n = readbin(n, file, swapit); // dim to return original dim for partial read files IntegerVector dim(2); dim(0) = n; dim(1) = k; /* * A dataset may have a label e.g. "Written by R". * First we read its length (ndlabel), later the actual label (datalabel). * ndlabel: length of datalabel (excl. binary 0) * datalabel: string max length 80 */ CharacterVector datalabelCV(1); std::string datalabel(ndlabel, '\0'); if (ndlabel > 0) readstring(datalabel, file, datalabel.size()); else datalabel = ""; datalabelCV(0) = datalabel; CharacterVector timestampCV(1); std::string timestamp(18, '\0'); switch (release) { case 102: case 103: case 104: { timestamp = ""; break; } default: { readstring(timestamp, file, timestamp.size()); break; } } timestampCV(0) = timestamp; /* * vartypes. * 0-2045: strf (String: Max length 2045) * 32768: strL (long String: Max length 2 billion) * 65526: double * 65527: float * 65528: long * 65529: int * 65530: byte */ IntegerVector vartype(k); switch (release) { case 102: case 103: case 104: case 105: case 106: case 107: case 108: case 110: case 112: { uint8_t nvartypec = 0; for (uint16_t i=0; i127) vartype[i] = nvartypec - 127; } break; } case 111: case 113: case 114: case 115: { uint8_t nvartype = 0; for (uint16_t i=0; i ... */ List ch = List(); if (release > 104) { int8_t datatype = 0; uint32_t len = 0; datatype = readbin(datatype, file, swapit); if (release <= 108) len = readbin((uint16_t)len, file, swapit); else len = readbin(len, file, swapit); while (!(datatype==0) && !(len==0)) { std::string chvarname(chlen, '\0'); std::string chcharact(chlen, '\0'); std::string nnocharacter(len-chlen*2, '\0'); readstring(chvarname, file, chvarname.size()); readstring(chcharact, file, chcharact.size()); readstring(nnocharacter, file, nnocharacter.size()); // chs vector CharacterVector chs(3); chs[0] = chvarname; chs[1] = chcharact; chs[2] = nnocharacter; // add characteristics to the list ch.push_front( chs ); datatype = readbin(datatype, file, swapit); if (release <= 108) len = readbin((uint16_t)len, file, swapit); else len = readbin(len, file, swapit); } } /* * data. First a list is created with vectors. The vector type is defined by * vartype. Stata stores data columnwise so we loop over it and store the * data in the list of the first step. Third variable- and row-names are * attached and the list type is changed to data.frame. */ /* replace vartypes of Stata 8 - 12 with Stata 13 values. */ // 117 contains new variable types (longer strings and strL) std::replace (vartype.begin(), vartype.end(), 251, STATA_BYTE); std::replace (vartype.begin(), vartype.end(), 252, STATA_SHORTINT); std::replace (vartype.begin(), vartype.end(), 253, STATA_INT); std::replace (vartype.begin(), vartype.end(), 254, STATA_FLOAT); std::replace (vartype.begin(), vartype.end(), 255, STATA_DOUBLE); uint64_t nmin = selectrows(0), nmax = selectrows(1); uint64_t nn = 0; // if selectrows is c(0,0) use full data if ((nmin == 0) && (nmax == 0)){ nmin = 1; nmax = n; } // make sure that n is not greater than nmax or nmin if (n < nmax) nmax = n; if (n < nmin) nmin = n; // sequences of column and row IntegerVector cvec = seq(0, (k-1)); IntegerVector rvec = seq(nmin, nmax); nn = rvec.size(); // use c indexing starting at 0 nmin = nmin -1; nmax = nmax -1; // calculate length of each variable stored in file. Calculate row length IntegerVector rlen = calc_rowlength(vartype); uint64_t rlength = sum(rlen); // check if vars are selected std::string selcols = as(selectcols(0)); bool selectvars = selcols != ""; // select vars: either select every var or only matched cases. This will // return index positions of the selected variables. If non are selected the // index position is cvec IntegerVector select = cvec, nselect; if (selectvars) select = choose(selectcols, varnames); // separate the selected from the not selected cases LogicalVector ll = is_na(select); nselect = cvec[ll == 1]; select = cvec[ll == 0]; uint32_t kk = select.size(); // shrink variables to selected size CharacterVector varnames_kk = varnames[select]; IntegerVector vartype_kk = vartype[select]; IntegerVector vartype_s = vartype; IntegerVector types_kk = types[select]; // replace not selected cases with their negative size values IntegerVector rlen2 = rlen[nselect]; rlen2 = -rlen2; vartype_s[nselect] = rlen2; // Use vartype_s to calulate jump IntegerVector vartype_sj = calc_jump(vartype_s); // 2. fill it with data // skip into the data part fseeko64(file, rlength * nmin, SEEK_CUR); List df = read_data(file, vartype_kk, missing, release, nn, kk, vartype_sj, byteorder, swapit); // skip to end of data part fseeko64(file, rlength * (n - nmax -1), SEEK_CUR); // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames_kk; df.attr("class") = "data.frame"; /* * labels are separated by -tags. Labels may appear in any order e.g. * 2 "female" 1 "male 9 "missing". They are stored as tables. * nlen: length of label. * nlabname: label name. * labn: number of labels in this set (e.g. "male" "female" = 2) * txtlen: length of the label text. * off: offset defines where to read a new label in txtlen. */ List labelList = List(); //put labels into this list if (release>105) { // FixMe: the while statement differs and the final check int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0; std::string tag(5, '\0'); bool haslabel = false; // length of value_label_table nlen = readbin(nlen, file, swapit); if (!(feof(file) || ferror(file))) haslabel = true; while(haslabel) { // name of this label set std::string nlabname(lbllen, '\0'); readstring(nlabname, file, nlabname.size()); //padding fseek(file, 3, SEEK_CUR); // value_label_table for actual label set labn = readbin(labn, file, swapit); txtlen = readbin(txtlen, file, swapit); // offset for each label // off0 : label 0 starts at off0 // off1 : label 1 starts at off1 ... IntegerVector off(labn); for (int i=0; i < labn; ++i) { noff = readbin(noff, file, swapit); off[i] = noff; } // needed for match IntegerVector laborder = clone(off); //laborder.erase(labn+1); IntegerVector labordersort = clone(off); //labordersort.erase(labn+1); std::sort(labordersort.begin(), labordersort.end()); // needs txtlen for loop off.push_back(txtlen); // sort offsets so we can read labels sequentially std::sort(off.begin(), off.end()); // create an index to sort labels along the code values // this is done while factor creation IntegerVector indx(labn); indx = match(laborder,labordersort); // code for each label IntegerVector code(labn); for (int i=0; i < labn; ++i) { val = readbin(val, file, swapit); code[i] = val; } // label text CharacterVector label(labn); for (int i=0; i < labn; ++i) { int lablen = off[i+1]-off[i]; std::string lab (lablen, '\0'); readstring(lab, file, lablen); label[i] = lab; } // sort labels according to indx CharacterVector labelo(labn); for (int i=0; i < labn; ++i) { labelo[i] = label[indx[i]-1]; } // create table for actual label set string const labset = nlabname; code.attr("names") = labelo; // add this set to output list labelList.push_front( code, labset); // length of value_label_table nlen = readbin(nlen, file, swapit); if (feof(file) || ferror(file)) break; } } /* * assign attributes to the resulting data.frame */ formats = formats[select]; valLabels = valLabels[select]; varLabels = varLabels[select]; df.attr("datalabel") = datalabelCV; df.attr("time.stamp") = timestampCV; df.attr("formats") = formats; df.attr("types") = types_kk; df.attr("val.labels") = valLabels; df.attr("var.labels") = varLabels; df.attr("version") = versionIV; df.attr("label.table") = labelList; df.attr("expansion.fields") = ch; df.attr("byteorder") = byteorderI; df.attr("orig.dim") = dim; return df; } readstata13/src/save_pre13_dta.cpp0000644000176200001440000002747413302234744016501 0ustar liggesusers/* * Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include using namespace Rcpp; using namespace std; // Writes the binary Stata file // // @param filePath The full systempath to the dta file you want to export. // @param dat an R-Object of class data.frame. // @export // [[Rcpp::export]] int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat) { uint16_t k = dat.size(); uint32_t n = dat.nrows(); int8_t byteorder = SBYTEORDER; string timestamp = dat.attr("timestamp"); timestamp.resize(18); string datalabel = dat.attr("datalabel"); datalabel[datalabel.size()] = '\0'; CharacterVector valLabels = dat.attr("vallabels"); CharacterVector nvarnames = dat.attr("names"); List chs = dat.attr("expansion.fields"); List formats = dat.attr("formats"); List labeltable = dat.attr("label.table"); List varLabels = dat.attr("var.labels"); List vartypes = dat.attr("types"); int8_t version = as(dat.attr("version")); fstream dta (filePath, ios::out | ios::binary); if (dta.is_open()) { uint32_t ndlabel = 81; uint32_t nformatslen = 49; uint32_t nvarnameslen = 33; uint32_t nvalLabelslen = 33; uint32_t nvarLabelslen = 81; uint32_t chlen = 33; uint32_t maxlabelsize = 32000; uint32_t maxstrsize = 244; if (version<111 || version==112) maxstrsize = 80; switch(version) { case 102: ndlabel = 30; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 103: case 104: ndlabel = 32; nvarnameslen = 9; nformatslen = 7; nvalLabelslen = 9; nvarLabelslen = 32; break; case 105: case 106:// unknown version (SE?) chlen = 9; ndlabel = 32; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; nvarLabelslen = 32; break; case 107: // unknown version (SE?) case 108: chlen = 9; nvarnameslen = 9; nformatslen = 12; nvalLabelslen = 9; case 110: case 111: case 112: case 113: nformatslen = 12; break; } writebin(version, dta, swapit); // format writebin(byteorder, dta, swapit); // LSF int8_t ft = 1; // filetype writebin(ft, dta, swapit); int8_t unused = 0; // unused writebin(unused, dta, swapit); writebin(k, dta, swapit); // nvars writebin(n, dta, swapit); // nobs /* write a datalabel */ if (datalabel.size() > ndlabel) Rcpp::warning("Datalabel too long. Resizing. Max size is %d.", ndlabel - 1); writestr(datalabel, ndlabel, dta); /* timestamp size is 17 */ if (version > 104) { if (timestamp.size() > 18) { Rcpp::warning("Timestamp too long. Dropping."); timestamp = ""; } writestr(timestamp, timestamp.size(), dta); } /* ... */ uint8_t nvartype; for (uint16_t i = 0; i < k; ++i) { nvartype = as(vartypes[i]); if(version<111 || version==112) { char c[2]; switch(nvartype) { case 255: strcpy(c, "d"); c[1] = '\0'; dta.write(c, 1); break; case 254: strcpy(c, "f"); c[1] = '\0'; dta.write(c, 1); break; case 253: strcpy(c, "l"); c[1] = '\0'; dta.write(c, 1); break; case 252: strcpy(c, "i"); c[1] = '\0'; dta.write(c, 1); break; case 251: strcpy(c,"b"); c[1] = '\0'; dta.write(c, 1); break; default: char d = char(nvartype+127); dta.write(&d, 1); break; } } else writebin(nvartype, dta, swapit); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nvarname = as(nvarnames[i]); if (nvarname.size() > nvarnameslen) Rcpp::warning("Varname too long. Resizing. Max size is %d", nvarnameslen - 1); writestr(nvarname, nvarnameslen, dta); } /* ... */ uint32_t big_k = k+1; for (uint32_t i = 0; i < big_k; ++i) { uint16_t nsortlist = 0; writebin(nsortlist, dta, swapit); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nformats = as(formats[i]); if (nformats.size() > nformatslen) Rcpp::warning("Formats too long. Resizing. Max size is %d", nformatslen - 1); writestr(nformats, nformatslen, dta); } /* ... */ for (uint16_t i = 0; i < k; ++i ) { string nvalLabels = as(valLabels[i]); if (nvalLabels.size() > nvalLabelslen) Rcpp::warning("Vallabel too long. Resizing. Max size is %d", nvalLabelslen - 1); writestr(nvalLabels, nvalLabelslen, dta); } /* ... */ for (uint16_t i = 0; i < k; ++i) { string nvarLabels = ""; if (!Rf_isNull(varLabels) && Rf_length(varLabels) > 1) { nvarLabels = as(varLabels[i]); if (nvarLabels.size() > nvarLabelslen) Rcpp::warning("Varlabel too long. Resizing. Max size is %d", nvarLabelslen - 1); } writestr(nvarLabels, nvarLabelslen, dta); } /* ... */ if (version > 104) { int8_t datatype = 0; uint32_t len = 0; if (chs.size()>0) { for (int32_t i = 0; i(chs[i]); string ch1 = as(ch[0]); ch1[ch1.size()] = '\0'; string ch2 = as(ch[1]); ch2[ch2.size()] = '\0'; string ch3 = as(ch[2]); ch3[ch3.size()] = '\0'; len = chlen + chlen + ch3.size()+1; datatype = 1; writebin(datatype, dta, swapit); if(version<=108) writebin((int16_t)len, dta, swapit); else writebin(len, dta, swapit); writestr(ch1, chlen, dta); writestr(ch2, chlen, dta); writestr(ch3, ch3.size()+1, dta); } } // five bytes of zero end characteristics datatype = 0; len = 0; writebin(datatype, dta, swapit); if (version<=108) writebin((int16_t)len, dta, swapit); else writebin(len, dta, swapit); } /* ... */ for(uint32_t j = 0; j < n; ++j) { for (uint16_t i = 0; i < k; ++i) { int const type = vartypes[i]; switch(type) { // store numeric as Stata double (double) case 255: { double val_d = 0; val_d = as(dat[i])[j]; if ( (val_d == NA_REAL) | R_IsNA(val_d) ) val_d = STATA_DOUBLE_NA; writebin(val_d, dta, swapit); break; } // float case 254: { double val_d = 0; float val_f = 0; val_d = as(dat[i])[j]; if ((val_d == NA_REAL) | (R_IsNA(val_d)) ) val_f = STATA_FLOAT_NA; else val_f = (float)(val_d); writebin(val_f, dta, swapit); break; } // store integer as Stata long (int32_t) case 253: { int32_t val_l = 0; val_l = as(dat[i])[j]; if ( (val_l == NA_INTEGER) | (R_IsNA(val_l)) ) { if(version>111) val_l = STATA_INT_NA; else val_l = STATA_INT_NA_108; } writebin(val_l, dta, swapit); break; } // int case 252: { int16_t val_i = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) val_i = STATA_SHORTINT_NA; else val_i = val_l; writebin(val_i, dta, swapit); break; } // byte case 251: { int8_t val_b = 0; int32_t val_l = 0; val_l = as(dat[i])[j]; if (val_l == NA_INTEGER) { if (version>104) val_b = STATA_BYTE_NA; else val_b = STATA_BYTE_NA_104; } else { val_b = val_l; } writebin(val_b, dta, swapit); break; } default: { int32_t len = vartypes[i]; string val_s = as(as(dat[i])[j]); if(val_s == "NA") val_s.clear(); // Stata 6-12 can only store 244 byte strings if(val_s.size()>maxstrsize) { Rcpp::warning("Character value too long. Resizing. Max size is %d.", maxstrsize); } writestr(val_s, len, dta); break; } } } } /* ... */ if ((labeltable.size()>0) & (version>105)) { CharacterVector labnames = labeltable.attr("names"); int8_t padding = 0; for (int32_t i=0; i < labnames.size(); ++i) { int32_t txtlen = 0; string labname = as(labnames[i]); IntegerVector labvalue = labeltable[labname]; int32_t N = labvalue.size(); CharacterVector labelText = labvalue.attr("names"); IntegerVector off; /* * Fill off with offset position and create txtlen */ for (int32_t i = 0; i < labelText.size(); ++i) { string label = as(labelText[i]); uint32_t labellen = label.size()+1; if (labellen > maxlabelsize+1) labellen = maxlabelsize+1; txtlen += labellen; off.push_back ( txtlen-labellen ); } int32_t offI, labvalueI; int32_t nlen = sizeof(N) + sizeof(txtlen) + sizeof(offI)*N + sizeof(labvalueI)*N + txtlen; writebin(nlen, dta, swapit); writestr(labname, nvarnameslen, dta); writestr((char*)&padding, 3, dta); writebin(N, dta, swapit); writebin(txtlen, dta, swapit); for (int32_t i = 0; i < N; ++i) { offI = off[i]; writebin(offI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { labvalueI = labvalue[i]; writebin(labvalueI, dta, swapit); } for (int32_t i = 0; i < N; ++i) { string labtext = as(labelText[i]); if (labtext.size() > maxlabelsize) { Rcpp::warning("Label too long. Resizing. Max size is %d", maxlabelsize); labtext.resize(maxlabelsize); // labtext[labtext.size()] = '\0'; } writestr(labtext, labtext.size()+1, dta); } } } dta.close(); return 0; } else { Rcpp::stop("Unable to open file."); return -1; } } readstata13/src/read_dta.cpp0000644000176200001440000004120013302234744015423 0ustar liggesusers/* * Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program. If not, see . */ #include "readstata.h" #include "read_data.h" using namespace Rcpp; using namespace std; List read_dta(FILE * file, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols, const bool strlexport, const CharacterVector strlpath) { // stata_dta>
test("stata_dta>
", file); test("", file); /* * version is a 4 byte character e.g. "117" */ int8_t fversion = 117L; //f = first int8_t lversion = 119L; //l = last std::string version(3, '\0'); readstring(version, file, version.size()); int8_t const release = atoi(version.c_str()); IntegerVector versionIV(1); versionIV(0) = release; // check the release version. if (releaselversion) { warning("File version is %d.\nVersion: Not a version 13/14 dta-file", release); return -1; } uint8_t nvarnameslen = 0; int8_t nformatslen = 0; uint8_t nvalLabelslen = 0; uint16_t nvarLabelslen = 0; int32_t chlen = 0; uint8_t lbllen = 0; switch(release) { case 117: nvarnameslen = 33; nformatslen = 49; nvalLabelslen = 33; nvarLabelslen = 81; chlen = 33; lbllen = 33; break; case 118: case 119: nvarnameslen = 129; nformatslen = 57; nvalLabelslen = 129; nvarLabelslen = 321; chlen = 129; lbllen = 129; break; } // test("", file); test("", file); /* * byteorder is a 4 byte character e.g. "LSF". MSF refers to big-endian. */ std::string byteorder(3, '\0'); readstring(byteorder,file, byteorder.size()); // test("", file); test("", file); bool swapit = 0; swapit = strcmp(byteorder.c_str(), sbyteorder); /* * Number of Variables */ uint32_t k = 0; if (release < 119) k = readbin((uint16_t)k, file, swapit); if (release == 119) k = readbin(k, file, swapit); // test("", file); test("", file); /* * Number of Observations */ uint64_t n = 0; if (release == 117) n = readbin((uint32_t)n, file, swapit); if ((release == 118) | (release == 119)) n = readbin(n, file, swapit); // test("", file); test(" test("", file); test("", file); /* * A dataset may have a timestamp. If it has a timestamp the length of the * timestamp (ntimestamp) is 17. Else it is zero. * ntimestamp: 0 or 17 * timestamp: empty or 17 byte string */ uint8_t ntimestamp = 0; ntimestamp = readbin(ntimestamp, file, swapit); std::string timestamp(17, '\0'); if (ntimestamp == 17) // ntimestap is 0 or 17 { readstring(timestamp, file, timestamp.size()); } else { timestamp = ""; } CharacterVector timestampCV = timestamp; //
test("
", file); test("", file); /* * Stata stores the byteposition of certain areas of the file here. Currently * this is of no use to us. * 1. * 2. * 3. * 4. * 5. * 6. * 7. * 8. * 9. * 10. * 11. * 12. * 13. * 14. end-of-file */ NumericVector map(14); for (int i=0; i <14; ++i) { uint64_t nmap = 0; nmap = readbin(nmap, file, swapit); map[i] = nmap; } // test("
", file); test("", file); /* * vartypes. * 0-2045: strf (String: Max length 2045) * 32768: strL (long String: Max length 2 billion) * 65526: double * 65527: float * 65528: long * 65529: int * 65530: byte */ IntegerVector vartype(k); for (uint32_t i=0; i test("", file); test("", file); /* * varnames. */ std::string nvarnames(nvarnameslen, '\0'); CharacterVector varnames(k); for (uint32_t i=0; i test("", file); test("", file); /* * sortlist. Stata stores the information which variable of a dataset was * sorted. Depending on byteorder sortlist is written differently. Currently we * do not use this information. * Vector size is k+1. */ uint64_t big_k = k+1; IntegerVector sortlist(big_k); for (uint64_t i=0; i test("", file); test("", file); /* * formats handle how Stata prints a variable. Currently we do not use this * information. */ std::string nformats(nformatslen, '\0'); CharacterVector formats(k); for (uint32_t i=0; i test("", file); test("",file); /* * value_label_names. Stata stores variable labels by names. * nvalLabels: length of the value_label_name * valLabels: */ std::string nvalLabels(nvalLabelslen, '\0'); CharacterVector valLabels(k); for (uint32_t i=0; i test("", file); test("", file); /* * variabel_labels */ std::string nvarLabels (nvarLabelslen, '\0'); CharacterVector varLabels(k); for (uint32_t i=0; i test("", file); test("", file); /* * characteristics. Stata can store additional information this way. It may * contain notes (for the dataset or a variable) or about label language sets. * Characteristics are not documented. We export them as attribute: * expansion.fields. Characteristics are separated by tags. Each has: * nocharacter: length of the characteristics * chvarname: varname (binary 0 terminated) * chcharact: characteristicsname (binary 0 terminated) * nnocharacter: contes (binary 0 terminated) */ std::string chtag = ""; std::string tago(4, '\0'); readstring(tago, file, tago.size()); List ch = List(); CharacterVector chs(3); while (chtag.compare(tago)==0) { uint32_t nocharacter = 0; nocharacter = readbin(nocharacter, file, swapit); std::string chvarname(chlen, '\0'); std::string chcharact(chlen, '\0'); std::string nnocharacter(nocharacter-chlen*2, '\0'); readstring(chvarname, file, chvarname.size()); readstring(chcharact, file, chcharact.size()); readstring(nnocharacter, file, nnocharacter.size()); // chs vector CharacterVector chs(3); chs[0] = chvarname; chs[1] = chcharact; chs[2] = nnocharacter; // add characteristics to the list ch.push_front( chs ); // test("", file); // read next tag readstring(tago, file, tago.size()); } //[ test("aracteristics>", file); test("", file); /* * data. First a list is created with vectors. The vector type is defined by * vartype. Stata stores data columnwise so we loop over it and store the * data in the list of the first step. Third variable- and row-names are * attached and the list type is changed to data.frame. */ uint64_t nmin = selectrows(0), nmax = selectrows(1); uint64_t nn = 0; // if selectrows is c(0,0) use full data if ((nmin == 0) && (nmax == 0)){ nmin = 1; nmax = n; } // make sure that n is not greater than nmax or nmin if (n < nmax) nmax = n; if (n < nmin) nmin = n; // sequences of column and row IntegerVector cvec = seq(0, (k-1)); IntegerVector rvec = seq(nmin, nmax); nn = rvec.size(); // use c indexing starting at 0 nmin = nmin -1; nmax = nmax -1; // calculate length of each variable stored in file. Calculate row length IntegerVector rlen = calc_rowlength(vartype); uint64_t rlength = sum(rlen); // check if vars are selected std::string selcols = as(selectcols(0)); bool selectvars = selcols != ""; // select vars: either select every var or only matched cases. This will // return index positions of the selected variables. If non are selected the // index position is cvec IntegerVector select = cvec, nselect; if (selectvars) select = choose(selectcols, varnames); // separate the selected from the not selected cases LogicalVector ll = is_na(select); nselect = cvec[ll == 1]; select = cvec[ll == 0]; uint32_t kk = select.size(); // shrink variables to selected size CharacterVector varnames_kk = varnames[select]; IntegerVector vartype_kk = vartype[select]; IntegerVector vartype_s = vartype; // replace not selected cases with their negative size values IntegerVector rlen2 = rlen[nselect]; rlen2 = -rlen2; vartype_s[nselect] = rlen2; // Use vartype_s to calculate jump IntegerVector vartype_sj = calc_jump(vartype_s); // 2. fill it with data // skip into the data part fseeko64(file, rlength * nmin, SEEK_CUR); List df = read_data(file, vartype_kk, missing, release, nn, kk, vartype_sj, byteorder, swapit); // skip to end of data part fseeko64(file, rlength * (n - nmax -1), SEEK_CUR); // 3. Create a data.frame df.attr("row.names") = rvec; df.attr("names") = varnames_kk; df.attr("class") = "data.frame"; // test("", file); test("", file); /* * strL. Stata 13 introduced long strings up to 2 billion characters. strLs are * separated by "GSO". * (v,o): Position in the data.frame. * t: 129/130 defines whether or not the strL is stored with a binary 0. * len: length of the strL. * strl: long string. */ std::string gso = "GSO"; std::string tags(3, '\0'); readstring(tags, file, tags.size()); //put strLs into a named vector CharacterVector strlvalues(0); CharacterVector strlnames(0); while (gso.compare(tags)==0) { CharacterVector strls(2); string ref; // FixMe: Strl in 118 switch (release) { case 117: { uint32_t v = 0, o = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); break; } case 118: case 119: { uint32_t v = 0; uint64_t o = 0; v = readbin(v, file, swapit); o = readbin(o, file, swapit); stringstream val_stream; val_stream << v << '_' << o; ref.assign(val_stream.str()); break; } } // (129 = binary) | (130 = ascii) Note: // if 130 full len contains the string. if 130 len includes trailing \0. // that does not affect us. we read the full len, and if \0 occurs R // will print only the string up to that position. we write 129 uint8_t t = 0; t = readbin(t, file, swapit); uint32_t len = 0; len = readbin(len, file, swapit); std::string strl(len, '\0'); readstring(strl, file, strl.size()); // write strl to file. Stata allows binary files in strls if (strlexport) { std::string path = Rcpp::as(strlpath); std::string outputpath = path + "/" + ref; ofstream file1(outputpath.c_str(), ios::out | ios::binary); if (file1.good()) { file1.write(strl.c_str(), strl.size()); file1.close(); } else { Rcpp::Rcout << "strl export failed" << std::endl; } } strlvalues.push_back( strl ); strlnames.push_back( ref ); readstring(tags, file, tags.size()); } // set identifier as name strlvalues.attr("names") = strlnames; // after strls //[ test("trls>", file); test("", file); /* * labels are separated by -tags. Labels may appear in any order e.g. * 2 "female" 1 "male 9 "missing". They are stored as tables. * nlen: length of label. * nlabname: label name. * labn: number of labels in this set (e.g. "male" "female" = 2) * txtlen: length of the label text. * off: offset defines where to read a new label in txtlen. */ std::string lbltag = ""; std::string tag(5, '\0'); readstring(tag, file, tag.size()); List labelList = List(); //put labels into this list while (lbltag.compare(tag)==0) { int32_t nlen = 0, labn = 0, txtlen = 0, noff = 0, val = 0; // length of value_label_table nlen = readbin(nlen, file, swapit); // name of this label set std::string nlabname(lbllen, '\0'); readstring(nlabname, file, nlabname.size()); //padding fseek(file, 3, SEEK_CUR); // value_label_table for actual label set labn = readbin(labn, file, swapit); txtlen = readbin(txtlen, file, swapit); // offset for each label // off0 : label 0 starts at off0 // off1 : label 1 starts at off1 ... IntegerVector off(labn); for (int i=0; i < labn; ++i) { noff = readbin(noff, file, swapit); off[i] = noff; } // needed for match IntegerVector laborder = clone(off); //laborder.erase(labn+1); IntegerVector labordersort = clone(off); //labordersort.erase(labn+1); std::sort(labordersort.begin(), labordersort.end()); // needs txtlen for loop off.push_back(txtlen); // sort offsets so we can read labels sequentially std::sort(off.begin(), off.end()); // create an index to sort lables along the code values // this is done while factor creation IntegerVector indx(labn); indx = match(laborder,labordersort); // code for each label IntegerVector code(labn); for (int i=0; i < labn; ++i) { val = readbin(val, file, swapit); code[i] = val; } // label text CharacterVector label(labn); for (int i=0; i < labn; ++i) { int lablen = off[i+1]-off[i]; std::string lab (lablen, '\0'); readstring(lab, file, lablen); label[i] = lab; } // sort labels according to indx CharacterVector labelo(labn); for (int i=0; i < labn; ++i) { labelo[i] = label[indx[i]-1]; } // create table for actual label set string const labset = nlabname; code.attr("names") = labelo; // add this set to output list labelList.push_front( code, labset); fseek(file, 6, SEEK_CUR); // readstring(tag, file, tag.size()); } /* * Final test if we reached the end of the file * close the file */ // [ test("ue_labels>", file); test("", file); /* * assign attributes to the resulting data.frame */ formats = formats[select]; valLabels = valLabels[select]; varLabels = varLabels[select]; df.attr("datalabel") = datalabelCV; df.attr("time.stamp") = timestampCV; df.attr("formats") = formats; df.attr("types") = vartype_kk; df.attr("val.labels") = valLabels; df.attr("var.labels") = varLabels; df.attr("version") = versionIV; df.attr("label.table") = labelList; df.attr("expansion.fields") = ch; df.attr("strl") = strlvalues; df.attr("byteorder") = wrap(byteorder); df.attr("orig.dim") = dim; return df; } readstata13/src/Makevars.win0000644000176200001440000000016113302234744015445 0ustar liggesusers## -*- mode: makefile; -*- PKG_CPPFLAGS = -I../inst/include -I. PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) readstata13/src/RcppExports.cpp0000644000176200001440000000523613302234744016162 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; // stata_read List stata_read(const char * filePath, const bool missing, const IntegerVector selectrows, const CharacterVector selectcols, const bool strlexport, const CharacterVector strlpath); RcppExport SEXP _readstata13_stata_read(SEXP filePathSEXP, SEXP missingSEXP, SEXP selectrowsSEXP, SEXP selectcolsSEXP, SEXP strlexportSEXP, SEXP strlpathSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< const bool >::type missing(missingSEXP); Rcpp::traits::input_parameter< const IntegerVector >::type selectrows(selectrowsSEXP); Rcpp::traits::input_parameter< const CharacterVector >::type selectcols(selectcolsSEXP); Rcpp::traits::input_parameter< const bool >::type strlexport(strlexportSEXP); Rcpp::traits::input_parameter< const CharacterVector >::type strlpath(strlpathSEXP); rcpp_result_gen = Rcpp::wrap(stata_read(filePath, missing, selectrows, selectcols, strlexport, strlpath)); return rcpp_result_gen; END_RCPP } // stata_save int stata_save(const char * filePath, Rcpp::DataFrame dat); RcppExport SEXP _readstata13_stata_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); rcpp_result_gen = Rcpp::wrap(stata_save(filePath, dat)); return rcpp_result_gen; END_RCPP } // stata_pre13_save int stata_pre13_save(const char * filePath, Rcpp::DataFrame dat); RcppExport SEXP _readstata13_stata_pre13_save(SEXP filePathSEXP, SEXP datSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); rcpp_result_gen = Rcpp::wrap(stata_pre13_save(filePath, dat)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_readstata13_stata_read", (DL_FUNC) &_readstata13_stata_read, 6}, {"_readstata13_stata_save", (DL_FUNC) &_readstata13_stata_save, 2}, {"_readstata13_stata_pre13_save", (DL_FUNC) &_readstata13_stata_pre13_save, 2}, {NULL, NULL, 0} }; RcppExport void R_init_readstata13(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } readstata13/NAMESPACE0000644000176200001440000000113113302177220013576 0ustar liggesusers# Generated by roxygen2: do not edit by hand export("varlabel<-") export(as.caldays) export(get.label) export(get.label.name) export(get.label.tables) export(get.lang) export(get.origin.codes) export(read.dta13) export(save.dta13) export(set.label) export(set.lang) export(stbcal) export(varlabel) import(Rcpp) importFrom(stats,complete.cases) importFrom(stats,na.omit) importFrom(stats,setNames) importFrom(utils,download.file) importFrom(utils,localeToCharset) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) useDynLib(readstata13) useDynLib(readstata13, .registration = TRUE) readstata13/NEWS0000644000176200001440000000606013302215634013066 0ustar liggesusers[0.9.2] - fix build on OSX [0.9.1] - allow reading only pre-selected variables - experimental support for format 119 - improve partial reading - export of binary data from dta-files - new function get.label.tables() to show all Stata label sets - fix check for duplicate labels - fixes in set.lang [0.9.0] - generate unique factor labels to prevent errors in factor definition - check interrupt for long read - fix storage size of character vectors in save.dta13 - fix saving characters containing missings - implement partial reading of dta-files - fix an integer bug with saving data.frames of length requiring uint64_t 0.8.5 - fix errors on big-endian systems 0.8.4 - fix valgrind errors. converting from dta.write to writestr - fix for empty data label - make replace.strl default 0.8.3 - restrict length of varnames to 32 chars for compatibility with Stata 14 - Stop compression of doubles as floats. Now test if compression of doubles as interger types is possible. - add many function tests 0.8.2 - save NA values in character vector as empty string - convert.underscore=T will convert all non-literal characters to underscores - fix saving of Dates - save with convert.factors by default - test for NaN and inf values while writing missing values and replace with NA - remove message about saving factors 0.8.1 - convert non-integer variables to factors (nonint.factors=T) - working with strL variables is now a lot faster (thank to Magnus Thor Torfason) - fix handling of large datasets - some code cleanups 0.8 - implement reading all version prior 13. - clean up code. - fix a crash when varlables do not match ncols. - update leap seconds R code with foreign. 0.7.1 - fix saving of files > 2GB 0.7 - read and write Stata 14 files (ver 118) - fix save for variables without non-missing values - read strings from different file encodings - code cleanups 0.6.1 - fix heap overflow 0.6 - various fixes - reading stbcal-files 0.5 - write dta-files - read/write LSF and MSF files - source testing and cleaning - support for multiple label languages (see http://www.stata.com/manuals13/dlabellanguage.pdf) - additional tools for label handling 0.4 - convert.dates from foreign::read.dta() - handle different NA values - convert strings to system encoding - some checks on label assignment 0.3 - reading file from url. Example: `read.dta13("http://www.stata-press.com/data/r13/auto.dta")` - convert.underscore from foreign::read.dta(): converts _ to . - missing.type parts from foreign::read.dta(). If TRUE return "missing" - replace.strl option to replace the reference to a STRL string in the data.frame with the actual value 0.2 - read stata characteristics and save them in extension.table attribute - more robust handling of factor labels - set file encoding for all strings and convert them to system encoding - fixed compiler warnings 0.1 - reading data files and create a data.frame - assign variable names - read the new strL strings and save them as attribute - convert stata label to factors and save them as attribute - read some meta data (timestamp, dataset label, formats,...) readstata13/R/0000755000176200001440000000000013302177220012564 5ustar liggesusersreadstata13/R/readstata13.R0000644000176200001440000000103713302177220015024 0ustar liggesusers#' Import Stata Data Files #' #' Function to read the Stata file format into a data.frame. #' #' #' @author Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' #' @name readstata13 #' @docType package #' @useDynLib readstata13, .registration = TRUE #' @import Rcpp #' @note If you catch a bug, please do not sue us, we do not have any money. #' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from #' Stata Versions < 13 NULL readstata13/R/RcppExports.R0000644000176200001440000000101413302177220015174 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 stata_read <- function(filePath, missing, selectrows, selectcols, strlexport, strlpath) { .Call(`_readstata13_stata_read`, filePath, missing, selectrows, selectcols, strlexport, strlpath) } stata_save <- function(filePath, dat) { .Call(`_readstata13_stata_save`, filePath, dat) } stata_pre13_save <- function(filePath, dat) { .Call(`_readstata13_stata_pre13_save`, filePath, dat) } readstata13/R/tools.R0000644000176200001440000004033013302177220014047 0ustar liggesusers# # Copyright (C) 2014-2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . # Wrapper Around iconv Calls for Code Readability # # @param x element to be converted # @param encoding encoding to be used. # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} read.encoding <- function(x, fromEncoding, encoding) { iconv(x, from=fromEncoding, to=encoding , sub="byte") } save.encoding <- function(x, encoding) { iconv(x, to=encoding, sub="byte") } # Function to check if directory exists # @param x file path dir.exists13 <-function(x) { path <- dirname(x) return(file.exists(path)) } # Construct File Path # # @param path path to dta file # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} get.filepath <- function(path="") { if (substring(path, 1, 1) == "~") { filepath <- path.expand(path) } else { filepath <- path } if (!file.exists(filepath)) { return("File does not exist.") } return(filepath) } #' Show Default Label Language #' #' Displays informations about the defined label languages. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param print \emph{logical.} If \code{TRUE}, print available languages and #' default language. #' @return Returns a list with two components: #' \describe{ #' \item{languages:}{Vector of label languages used in the dataset} #' \item{default:}{Name of the actual default label language, otherwise NA} #' } #' @details Stata allows to define multiple label sets in different languages. #' This functions reports the available languages and the selected default #' language. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.lang <- function(dat, print=T) { ex <- attr(dat, "expansion.fields") lang <- list() if (length(grep("_lang_list", ex)) > 0) { lang$languages <- strsplit(ex[[grep("_lang_list", ex)]][3], " ")[[1]] } else { lang$languages <- NA } lang$default <- ifelse(length(grep("_lang_c", ex)) > 0, ex[[grep("_lang_c", ex)]][3], NA) if (print) { cat("Available languages:\n ") cat(paste0(lang$languages, "\n")) cat("\nDefault language:\n") cat(paste0(" ",lang$default, "\n")) return(invisible(lang)) } return(lang) } #' Get Names of Stata Label Set #' #' Retrieves the Stata label set in the dataset for all or an vector of variable #' names. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character vector.} Variable names. If \code{NULL}, get #' names of all label sets. #' @param lang \emph{character.} Label language. Default language defined by #' \code{\link{get.lang}} is used if NA #' @return Returns an named vector of variable labels #' @details Stata stores factor labels in variable independent labels sets. This #' function retrieves the name of the label set for a variable. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.label.name <- function(dat, var.name=NULL, lang=NA) { vnames <- names(dat) if (is.na(lang) | lang == get.lang(dat, F)$default) { labelsets <- attr(dat, "val.labels") names(labelsets) <- vnames } else if (is.character(lang)) { ex <- attr(dat, "expansion.fields") has_no_label_lang <- identical( integer(0), unlist(lapply(ex, grep, pattern ="_lang_l_")) ) if (has_no_label_lang) { return("") } varname <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], function(x) x[1]) labelsets.tmp <- sapply(ex[grep(paste0("_lang_l_", lang), ex)], function(x) x[3]) names(labelsets.tmp) <- varname labelsets <- rep("", length(vnames)) names(labelsets) <- vnames labelsets[varname] <- labelsets.tmp[varname] } if (is.null(var.name)) { return(labelsets) } else { return(labelsets[var.name]) } } #' Get Origin Code Numbers for Factors #' #' Recreates the code numbers of a factor as stored in the Stata dataset. #' #' @param x \emph{factor.} Factor to obtain code for #' @param label.table \emph{table.} Table with factor levels obtained by #' \code{\link{get.label}}. #' @return Returns an integer with original codes #' @details While converting numeric variables into factors, the original code #' numbers are lost. This function reconstructs the codes from the attribute #' \code{label.table}. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' labname <- get.label.name(dat,"type") #' labtab <- get.label(dat, labname) #' #' # comparsion #' get.origin.codes(dat$type, labtab) #' as.integer(dat$type) #' @export get.origin.codes <- function(x, label.table) { if (is.factor(x)) { fac <- as.character(x) return(as.integer(label.table[fac])) } else { message("x is no factor.") } } #' Get Stata Label Table for a Label Set #' #' Retrieve the value labels for a specific Stata label set. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param label.name \emph{character.} Name of the Stata label set #' @return Returns a named vector of code numbers #' @details This function returns the table of factor levels which represent #' a Stata label set. The name of a label set for a variable can be obtained #' by \code{\link{get.label.name}}. #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' labname <- get.label.name(dat,"type") #' get.label(dat, labname) #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @export get.label <- function(dat, label.name) { return(attr(dat, "label.table")[label.name][[1]]) } #' Get all Stata Label Sets for a Data.frame #' #' Retrieve the value labels for all variables. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @return Returns a named list of label tables #' @details This function returns the factor levels which represent #' a Stata label set for all variables. #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' get.label.tables(dat) #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @importFrom stats setNames #' @export get.label.tables <- function(dat) { varnames <- setNames(names(dat), names(dat)) lapply(varnames, function(varname) get.label(dat, get.label.name(dat, varname))) } #' Assign Stata Labels to a Variable #' #' Assign value labels from a Stata label set to a variable. If duplicated #' labels are found, unique labels will be generated according the following #' scheme: "label_(integer code)". Levels without labels will become . #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character.} Name of the variable in the data.frame #' @param lang \emph{character.} Label language. Default language defined by #' \code{\link{get.lang}} is used if NA #' @return Returns a labeled factor #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), #' convert.factors=FALSE) #' #' # compare vectors #' set.label(dat, "type") #' dat$type #' #' # German label #' set.label(dat, "type", "de") #' @export set.label <- function(dat, var.name, lang=NA) { if (is.factor(dat[,var.name])) { tmp <- get.origin.codes(dat[,var.name], get.label(dat, get.label.name(dat, var.name))) } else { tmp <- dat[,var.name] } labtable <- get.label(dat, get.label.name(dat, var.name, lang)) #check for duplicated labels labcount <- table(names(labtable)) if (any(labcount > 1)) { warning(paste0("\n ",var.name, ":\n Duplicated factor levels detected -", "generating unique labels.\n")) labdups <- names(labtable) %in% names(labcount[labcount > 1]) # generate unique labels from assigned label and code number names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") } return(factor(tmp, levels=labtable, labels=names(labtable)) ) } #' Get and assign Stata Variable Labels #' #' Retrieve or set variable labels for a dataset. #' #' @name varlabel #' @rdname varlabel #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param var.name \emph{character vector.} Variable names. If NULL, get label #' for all variables. #' @param lang \emph{character.} Label language. Default language defined by #' \code{\link{get.lang}} is used if NA #' @param value \emph{character vector.} Vector of variable names. #' @return Returns an named vector of variable labels #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @aliases varlabel #' @aliases 'varlabel<-' NULL #' @rdname varlabel #' @export varlabel <- function(dat, var.name=NULL, lang=NA) { vnames <- names(dat) if (is.na(lang) | lang == get.lang(dat, F)$default) { varlabel <- attr(dat, "var.labels") names(varlabel) <- vnames } else if (is.character(lang)) { ex <- attr(dat, "expansion.fields") varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1]) varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3]) names(varlabel) <- varname } if (is.null(var.name)) { # order by data.frame columns and return return(varlabel[vnames]) } else { return(varlabel[var.name]) } } #' @rdname varlabel #' @export 'varlabel<-' <- function(dat, value) { nlabs <- length(attr(dat, "var.labels")) if (length(value)==nlabs) { attr(x, "var.labels") <- value } else { warning(paste("Vector of new labels must have",nlabs,"entries.")) } dat } #' Assign Stata Language Labels #' #' Changes default label language for a dataset. #' Variables with generated labels (option generate.labels=TRUE) are kept unchanged. #' #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}. #' @param lang \emph{character.} Label language. Default language defined by #' \code{\link{get.lang}} is used if NA #' @param generate.factors \emph{logical.} If \code{TRUE}, missing factor levels #' are generated. #' @return Returns a data.frame with value labels in language "lang". #' @examples #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' get.lang(dat) #' varlabel(dat) #' #' # set German label #' datDE <- set.lang(dat, "de") #' get.lang(datDE) #' varlabel(datDE) #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @importFrom stats na.omit #' @importFrom utils txtProgressBar setTxtProgressBar #' @export set.lang <- function(dat, lang=NA, generate.factors=FALSE) { if (is.na(lang) | lang == get.lang(dat, F)$default) { return(dat) } else if (is.character(lang)) { vnames <- names(dat) types <- attr(dat, "types") label <- attr(dat, "label.table") val.labels <- get.label.name(dat, NULL, lang) oldval.labels <- get.label.name(dat) oldval.labels <- oldval.labels[!is.na(oldval.labels)] oldval.labtab <- lapply(oldval.labels, function(x) get.label(dat, x)) oldlang <- get.lang(dat, F)$default cat("Replacing value labels. This might take some time...\n") pb <- txtProgressBar(min=1,max=length(val.labels)+1) for (i in which(val.labels != "")) { labname <- val.labels[i] vartype <- types[i] labtable <- label[[labname]] varname <- names(val.labels)[i] # get old codes if (is.factor(dat[, varname])) { oldlabname <- oldval.labels[names(oldval.labels) == varname] oldlabtab <- oldval.labtab[[names(oldlabname)]] codes <- get.origin.codes(dat[,varname], oldlabtab) varunique <- na.omit(unique(codes)) } else { varunique <- na.omit(unique(dat[,varname])) } if (labname %in% names(label) & is.factor(dat[,varname])) { # assign label if label set is complete if (all(varunique %in% labtable)) { dat[,varname] <- factor(codes, levels=labtable, labels=names(labtable)) } # else generate labels from codes } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) dat[,varname] <- factor(codes, levels=gen.lab, labels=names(gen.lab)) } else { warning(paste(vnames[i], "Missing factor labels - no labels assigned. Set option generate.factors=T to generate labels.")) } setTxtProgressBar(pb, i) } close(pb) # Save old default labels to expansion.fields. This is necessary to save # original labels for further use. vnames <- names(oldval.labels) names(oldval.labels) <- NULL tmp <- list() for (i in seq_along(val.labels)) { tmp[[i]] <- c(vnames[i],paste0("_lang_l_",oldlang), oldval.labels[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) # variable label old.varlabel <- attr(dat, "var.labels") tmp <- list() for (i in seq_along(old.varlabel)) { tmp[[i]] <- c(vnames[i],paste0("_lang_v_", oldlang), old.varlabel[i]) } attr(dat, "expansion.fields") <- c(attr(dat, "expansion.fields"),tmp) ex <- attr(dat, "expansion.fields") varname <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[1]) varlabel <- sapply(ex[grep(paste0("_lang_v_", lang), ex)], function(x) x[3]) names(varlabel) <- varname varlabel.out <- as.character(varlabel[vnames]) varlabel.out[is.na(varlabel.out)] <- "" attr(dat, "var.labels") <- varlabel.out # set new default lang and store string as default attributes names(val.labels) <- NULL attr(dat, "val.labels") <- val.labels attr(dat, "expansion.fields")[[ grep("_lang_c", attr(dat, "expansion.fields")) ]][3] <- lang return(dat) } } #' Check if numeric vector can be expressed as interger vector #' #' Compression can reduce numeric vectors as integers if the vector does only #' contain integer type data. #' #' @param x vector of data frame saveToExport <- function(x) { isTRUE(all.equal(x, as.integer(x))) } #' Check max char length of data.frame vectors #' #' Stata requires us to provide the maximum size of a charactervector as every #' row is stored in a bit region of this size. #' #' Ex: If the max chars size is four, _ is no character in this vector: #' 1. row: four #' 3. row: one_ #' 4. row: ____ #' #' If a character vector contains only missings or is empty, we will assign it a #' value of one, since Stata otherwise cannot handle what we write. #' #' @param x vector of data frame maxchar <- function(x) { z <- max(nchar(x, type="byte"), na.rm = TRUE) # Stata does not allow storing a string of size 0 if (is.infinite(z) | (z == 0)) z <- 1 z } readstata13/R/save.R0000644000176200001440000003210613302177220013647 0ustar liggesusers# # Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Write Stata Binary Files #' #' \code{save.dta13} writes a Stata dta-file bytewise and saves the data #' into a dta-file. #' #' @param file \emph{character.} Path to the dta file you want to export. #' @param data \emph{data.frame.} A data.frame Object. #' @param data.label \emph{character.} Name of the dta-file. #' @param time.stamp \emph{logical.} If \code{TRUE}, add a time.stamp to the #' dta-file. #' @param convert.factors \emph{logical.} If \code{TRUE}, factors will be #' converted to Stata variables with labels. #' Stata expects strings to be encoded as Windows-1252, so all levels will be #' recoded. Character which can not be mapped in Windows-1252 will be saved as #' hexcode. #' @param convert.dates \emph{logical.} If \code{TRUE}, dates will be converted #' to Stata date time format. Code from \code{foreign::write.dta} #' @param convert.underscore \emph{logical.} If \code{TRUE}, all non numerics or #' non alphabet characters will be converted to underscores. #' @param tz \emph{character.} The name of the timezone convert.dates will use. #' @param add.rownames \emph{logical.} If \code{TRUE}, a new variable rownames #' will be added to the dta-file. #' @param compress \emph{logical.} If \code{TRUE}, the resulting dta-file will #' use all of Statas numeric-vartypes. #' @param version \emph{numeric.} Stata format for the resulting dta-file either #' Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). #' Experimental support for large datasets: Use version="15mp" to save the dataset #' in the new Stata 15/MP file format. This feature is not thoroughly tested yet. #' @return The function writes a dta-file to disk. The following features of the #' dta file format are supported: #' \describe{ #' \item{datalabel:}{Dataset label} #' \item{time.stamp:}{Timestamp of file creation} #' \item{formats:}{Stata display formats. May be used with #' \code{\link[base]{sprintf}}} #' \item{type:}{Stata data type (see Stata Corp 2014)} #' \item{var.labels:}{Variable labels} #' \item{version:}{dta file format version} #' \item{strl:}{List of character vectors for the new strL string variable #' type. The first element is the identifier and the second element the #' string.} #' } #' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and #' \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in #' package \code{haven} for Stata version >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @useDynLib readstata13 #' @importFrom utils localeToCharset #' @export save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE, convert.factors=TRUE, convert.dates=TRUE, tz="GMT", add.rownames=FALSE, compress=FALSE, version=117, convert.underscore=FALSE){ if (!is.data.frame(data)) stop("The object \"data\" must have class data.frame") if (!dir.exists13(dirname(file))) stop("Path is invalid. Possibly a non-existing directory.") # Allow writing version as Stata version not Stata format if (version=="15mp") version <- 119 if (version==15L) version <- 118 if (version==14L) version <- 118 if (version==13L) version <- 117 if (version==12L) version <- 115 if (version==11L | version==10L) version <- 114 if (version==9L | version==8L) version <- 113 if (version==7) version <- 110 if (version==6) version <- 108 if (version == 119) message("Support for Stata 15/MP (119) format is experimental and not thoroughly tested.") if (version<102 | version == 109 | version == 116 | version>119) stop("Version mismatch abort execution. No Data was saved.") sstr <- 2045 sstrl <- 32768 sdouble <- 65526 sfloat <- 65527 slong <- 65528 sint <- 65529 sbyte <- 65530 if (version < 117) { sstr <- 244 sstrl <- 244 sdouble <- 255 sfloat <- 254 slong <- 253 sint <- 252 sbyte <- 251 } if (version<111 | version==112) sstrl <- 80 if(!is.data.frame(data)) { stop("Object is not of class data.frame.") } # Is recoding necessary? if (version<=117) { # Reencoding is always needed doRecode <- TRUE toEncoding <- "CP1252" } else if (toupper(localeToCharset()[1])!="UTF-8") { # If R runs in a non UTF-8 locale and Stata > 13 doRecode <- TRUE toEncoding <- "UTF-8" } else { # utf-8 and Stata > 13 doRecode <- FALSE } if (add.rownames) { if (doRecode) { rwn <- save.encoding(rownames(data), toEncoding) } else { rwn <-rownames(data) } data <- data.frame(rownames= rwn, data, stringsAsFactors = F) } rownames(data) <- NULL if (convert.underscore) { names(data) <- gsub("[^a-zA-Z0-9_]", "_", names(data)) names(data)[grepl("^[0-9]", names(data))] <- paste0( "_", names(data)[grepl("^[0-9]", names(data))]) } filepath <- path.expand(file) # For now we handle numeric and integers vartypen <- sapply(data, class) names(vartypen) <- names(data) # Convert logicals to integers for (v in names(vartypen[vartypen == "logical"])) data[[v]] <- as.integer(data[[v]]) vartypen <- vtyp <- sapply(data, class) if (convert.factors){ if (version < 106) { hasfactors <- sapply(data, is.factor) if (any(hasfactors)) warning(paste("dta-format < 106 can not handle factors.", "Labels are not saved!")) } # If our data.frame contains factors, we create a label.table factors <- which(sapply(data, is.factor)) f.names <- attr(factors,"names") label.table <- vector("list", length(f.names)) names(label.table) <- f.names valLabel <- sapply(data, class) valLabel[valLabel != "factor"] <- "" i <- 0 for (v in factors) { i <- i + 1 if (doRecode) { f.levels <- save.encoding(levels(data[[v]]), toEncoding) } else { f.levels <- levels(data[[v]]) } f.labels <- as.integer(labels(levels(data[[v]]))) attr(f.labels, "names") <- f.levels f.labels <- f.labels[names(f.labels) != ".."] label.table[[ (f.names[i]) ]] <- f.labels valLabel[v] <- f.names[i] } attr(data, "label.table") <- rev(label.table) if (doRecode) { valLabel <- save.encoding(valLabel, toEncoding) } attr(data, "vallabels") <- valLabel } else { attr(data, "label.table") <- NULL attr(data, "vallabels") <- rep("",length(data)) } if (convert.dates) { dates <- which(sapply(data, function(x) inherits(x, "Date")) ) for (v in dates) data[[v]] <- as.vector( julian(data[[v]],as.Date("1960-1-1", tz = "GMT")) ) dates <- which( sapply(data, function(x) inherits(x,"POSIXt")) ) for (v in dates) data[[v]] <- as.vector( round(julian(data[[v]], ISOdate(1960, 1, 1, tz = tz))) ) } # is.numeric is TRUE for integers ff <- sapply(data, is.numeric) ii <- sapply(data, is.integer) factors <- sapply(data, is.factor) empty <- sapply(data, function(x) all(is.na(x) & !is.character(x))) ddates <- vartypen == "Date" # default no compression: numeric as double; integer as long; date as date; # empty as byte if (!compress) { vartypen[ff] <- sdouble vartypen[ii] <- slong vartypen[factors] <- slong vartypen[ddates] <- -sdouble vartypen[empty] <- sbyte } else { varTmin <- sapply(data[(ff | ii) & !empty], function(x) min(x,na.rm=TRUE)) varTmax <- sapply(data[(ff | ii) & !empty], function(x) max(x,na.rm=TRUE)) # check if numerics can be stored as integers numToCompress <- sapply(data[ff], saveToExport) if (any(numToCompress)) { saveToConvert <- names(ff[numToCompress]) # replace numerics as intergers data[saveToConvert] <- sapply(data[saveToConvert], as.integer) # recheck after update ff <- sapply(data, is.numeric) ii <- sapply(data, is.integer) } vartypen[ff] <- sdouble bmin <- -127; bmax <- 100 imin <- -32767; imax <- 32740 # check if integer is byte, int or long for (k in names(which(ii & !empty))) { vartypen[k][varTmin[k] < imin | varTmax[k] > imax] <- slong vartypen[k][varTmin[k] > imin & varTmax[k] < imax] <- sint vartypen[k][varTmin[k] > bmin & varTmax[k] < bmax] <- sbyte } factorlength <- sapply(data[factors & !empty], nlevels) for (k in names(which(factors & !empty))) { vartypen[factors & factorlength[k] > 0x1.000000p127] <- slong vartypen[factors & factorlength[k] < 0x1.000000p127] <- sint vartypen[factors & factorlength[k] < 101] <- sbyte } # keep dates as is vartypen[ddates] <- -sdouble # cast empty variables as byte vartypen[empty] <- sbyte } # recode character variables. >118 wants utf-8, so encoding may be required if(doRecode) { #TODO: use seq_len ? for(v in (1:ncol(data))[vartypen == "character"]) { data[, v] <- save.encoding(data[, v], toEncoding) } } # str and strL are stored by maximum length of chars in a variable str.length <- sapply(data[vartypen == "character"], FUN=maxchar) str.length[str.length > sstr] <- sstrl for (v in names(vartypen[vartypen == "character"])) { # str.length[str.length > sstr] <- sstrl # no loop necessary! vartypen[[v]] <- str.length[[v]] } # save type bevor abs() formats <- vartypen vartypen <- abs(as.integer(vartypen)) attr(data, "types") <- vartypen # ToDo: Add propper check. # # value_label_names must be < 33 chars # if (sapply(valLabel,FUN=maxchar) >= 33) # message ("at least one variable name is to long.") # Resize varnames to 32. Stata requires this. It allows storing 32*4 bytes, # but can not work with longer variable names. Chars can be 1 - 4 bytes we # count the varnames in R. Get nchars and trim them. varnames <- names(data) lenvarnames <- sapply(varnames, nchar) if (any (lenvarnames > 32) & version >= 117) { message ("Varname to long. Resizing. Max size is 32.") names(data) <- sapply(varnames, strtrim, width = 32) } # Stata format "%9,0g" means european format formats <- vartypen formats[vtyp == "Date"] <- "%td" formats[formats == sdouble] <- "%9.0g" formats[formats == sfloat] <- "%9.0g" formats[formats == slong] <- "%9.0g" formats[formats == sint] <- "%9.0g" formats[formats == sbyte] <- "%9.0g" formats[vartypen >= 0 & vartypen <= sstr] <- paste0("%", formats[vartypen >= 0 & vartypen <= sstr], "s") attr(data, "formats") <- formats # Create a datalabel if (is.null(data.label)) { attr(data, "datalabel") <- "Written by R" } else { if (version == 102L) warning("Format 102 does not print a data label in Stata.") if (doRecode) { data.label <- save.encoding(data.label, toEncoding) } attr(data, "datalabel") <- data.label } # Create the 17 char long timestamp. It may contain 17 char long strings if (!time.stamp) { attr(data, "timestamp") <- "" } else { lct <- Sys.getlocale("LC_TIME"); Sys.setlocale("LC_TIME", "C") attr(data, "timestamp") <- format(Sys.time(), "%d %b %Y %H:%M") Sys.setlocale("LC_TIME",lct) } expfield <- attr(data, "expansion.fields") if (doRecode) { expfield <- lapply(expfield, function(x) iconv(x, to=toEncoding)) } attr(data, "expansion.fields") <- rev(expfield) attr(data, "version") <- as.character(version) if (version < 117) attr(data, "version") <- version # If length of varlabels differs from ncols drop varlabels. This can happen, # when the initial data.frame was read by read.dta13 and another variable was # attached. In this case the last variable label has a non existing variable # label which will crash our Rcpp code. Since varlabels do not respect the # ordering inside the data frame, we simply drop them. varlabels <- attr(data, "var.labels") if (!is.null(varlabels) & (length(varlabels)!=ncol(data))) { attr(data, "var.labels") <- NULL warning("Number of variable labels does not match number of variables. Variable labels dropped.") } if (version >= 117) invisible( stata_save(filePath = filepath, dat = data) ) else invisible( stata_pre13_save(filePath = filepath, dat = data) ) } readstata13/R/read.R0000644000176200001440000003760613302177220013636 0ustar liggesusers# Copyright (C) 2014-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki # Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Read Stata Binary Files #' #' \code{read.dta13} reads a Stata dta-file and imports the data into a #' data.frame. #' #' @param file \emph{character.} Path to the dta file you want to import. #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata #' value labels are created. #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is #' TRUE, missing factor labels are created from integers. If duplicated labels #' are found, unique labels will be generated according the following scheme: #' "label_(integer code)". #' @param encoding \emph{character.} Strings can be converted from Windows-1252 #' or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify #' target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain #' strings which can't be displayed in the current locale. #' Set encoding=NULL to stop reencoding. #' @param fromEncoding \emph{character.} We expect strings to be encoded as #' "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 #' or newer "UTF-8" is used. In some situation the used encoding can differ for #' Stata 14 files and must be manually set. #' @param convert.underscore \emph{logical.} If \code{TRUE}, "_" in variable #' names will be changed to "." #' @param missing.type \emph{logical.} Stata knows 27 different missing types: #' ., .a, .b, ..., .z. If \code{TRUE}, attribute \code{missing} will be #' created. #' @param replace.strl \emph{logical.} If \code{TRUE}, replace the reference to #' a strL string in the data.frame with the actual value. The strl attribute #' will be removed from the data.frame (see details). #' @param convert.dates \emph{logical.} If \code{TRUE}, Stata dates are #' converted. #' @param add.rownames \emph{logical.} If \code{TRUE}, the first column will be #' used as rownames. Variable will be dropped afterwards. #' @param nonint.factors \emph{logical.} If \code{TRUE}, factors labels #' will be assigned to variables of type float and double. #' @param select.rows \emph{integer.} Vector of one or two numbers. If single #' value rows from 1:val are selected. If two values of a range are selected #' the rows in range will be selected. #' @param select.cols \emph{character:} Vector of variables to select. #' @param strlexport \emph{logical:} Should strl content be exported as binary #' files? #' @param strlpath \emph{cahracter:} Path for strl export. #' #' @details If the filename is a url, the file will be downloaded as a temporary #' file and read afterwards. #' #' Stata files are encoded in ansinew. Depending on your system's default #' encoding certain characters may appear wrong. Using a correct encoding may #' fix these. #' #' Variable names stored in the dta-file will be used in the resulting #' data.frame. Stata types char, byte, and int will become integer; float and #' double will become numerics. R only knows a single missing type, while Stata #' knows 27, so all Stata missings will become NA in R. If you need to keep #' track of Statas original missing types, you may use #' \code{missing.type=TRUE}. #' #' Stata dates are converted to R's Date class the same way foreign handles #' dates. #' #' Stata 13 introduced a new character type called strL. strLs are able to store #' strings up to 2 billion characters. While R is able to store #' strings of this size in a character vector, the printed representation of #' such vectors looks rather cluttered, so it's possible to save only a #' reference in the data.frame with option \code{replace.strl=FALSE}. #' #' In R, you may use rownames to store characters (see for instance #' \code{data(swiss)}). In Stata, this is not possible and rownames have to be #' stored as a variable. If you want to use rownames, set add.rownames to TRUE. #' Then the first variable of the dta-file will hold the rownames of the #' resulting data.frame. #' #' Reading dta-files of older and newer versions than 13 was introduced #' with version 0.8. #' @return The function returns a data.frame with attributes. The attributes #' include #' \describe{ #' \item{datalabel:}{Dataset label} #' \item{time.stamp:}{Timestamp of file creation} #' \item{formats:}{Stata display formats. May be used with #' \code{\link{sprintf}}} #' \item{types:}{Stata data type (see Stata Corp 2014)} #' \item{val.labels:}{For each variable the name of the associated value #' labels in "label"} #' \item{var.labels:}{Variable labels} #' \item{version:}{dta file format version} #' \item{label.table:}{List of value labels.} #' \item{strl:}{Character vector with long strings for the new strl string #' variable type. The name of every element is the identifier.} #' \item{expansion.fields:}{list providing variable name, characteristic name #' and the contents of Stata characteristic field.} #' \item{missing:}{List of numeric vectors with Stata missing type for each #' variable.} #' \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} #' \item{orig.dim:}{Dimension recorded inside the dta-file.} #' } #' @note read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members #' from foreign::read.dta(). #' @seealso \code{\link[foreign]{read.dta}} in package \code{foreign} and #' \code{memisc} for dta files from Stata #' versions < 13 and \code{read_dta} in package \code{haven} for Stata version #' >= 13. #' @references Stata Corp (2014): Description of .dta file format #' \url{http://www.stata.com/help.cgi?dta} #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @useDynLib readstata13 #' @importFrom utils download.file #' @importFrom stats na.omit #' @export read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE, encoding = "UTF-8", fromEncoding=NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors=FALSE, select.rows = NULL, select.cols = NULL, strlexport = FALSE, strlpath = ".") { # Check if path is a url if (length(grep("^(http|ftp|https)://", file))) { tmp <- tempfile() download.file(file, tmp, quiet = TRUE, mode = "wb") filepath <- tmp on.exit(unlink(filepath)) } else { # construct filepath and read file filepath <- get.filepath(file) } if (!file.exists(filepath)) return(message("File not found.")) # some select.row checks if (!is.null(select.rows)) { # check that it is a numeric if (!is.numeric(select.rows)){ return(message("select.rows must be of type numeric")) } else { # guard against negative values if (any(select.rows < 0) ) select.rows <- abs(select.rows) # check that length is not > 2 if (length(select.rows) > 2) return(message("select.rows must be of length 1 or 2.")) # if length 1 start at row 1 if (length(select.rows) == 1) select.rows <- c(1, select.rows) } # reorder if 2 is bigger than 1 if (select.rows[2] < select.rows[1]) select.rows <- c(select.rows[2], select.rows[1]) # make sure to start at index position 1 if select.rows[2] > 0 if (select.rows[2] > 0 & select.rows[1] == 0) select.rows[1] <- 1 } else { # set a value select.rows <- c(0,0) } if (is.null(select.cols)){ select.cols <- "" } data <- stata_read(filepath, missing.type, select.rows, select.cols, strlexport, strlpath) version <- attr(data, "version") sstr <- 2045 sstrl <- 32768 sdouble <- 65526 sfloat <- 65527 slong <- 65528 sint <- 65529 sbyte <- 65530 if (version < 117) { sstr <- 244 sstrl <- 255 sdouble <- 255 sfloat <- 254 slong <- 253 sint <- 252 sbyte <- 251 } if (convert.underscore) names(data) <- gsub("_", ".", names(data)) types <- attr(data, "types") val.labels <- attr(data, "val.labels") label <- attr(data, "label.table") if (missing.type) { stata.na <- data.frame(type = sdouble:sbyte, min = c(101, 32741, 2147483621, 2 ^ 127, 2 ^ 1023), inc = c(1, 1, 1, 2 ^ 115, 2 ^ 1011) ) if (version >= 113L & version < 117L) { missings <- vector("list", length(data)) names(missings) <- names(data) for (v in which(types > 250L)) { this.type <- types[v] - 250L nas <- is.na(data[[v]]) | data[[v]] >= stata.na$min[this.type] natype <- (data[[v]][nas] - stata.na$min[this.type])/ stata.na$inc[this.type] natype[is.na(natype)] <- 0L missings[[v]] <- rep(NA, NROW(data)) missings[[v]][nas] <- natype data[[v]][nas] <- NA } attr(data, "missing") <- missings } else { if (version >= 117L) { missings <- vector("list", length(data)) names(missings) <- names(data) for (v in which(types > 65525L)) { this.type <- 65531L - types[v] nas <- is.na(data[[v]]) | data[[v]] >= stata.na$min[this.type] natype <- (data[[v]][nas] - stata.na$min[this.type]) / stata.na$inc[this.type] natype[is.na(natype)] <- 0L missings[[v]] <- rep(NA, NROW(data)) missings[[v]][nas] <- natype data[[v]][nas] <- NA } attr(data, "missing") <- missings } else warning("'missing.type' only applicable to version >= 8 files") } } var.labels <- attr(data, "var.labels") ## Encoding if(!is.null(encoding)) { # set from encoding by dta version if(is.null(fromEncoding)) { fromEncoding <- "CP1252" if(attr(data, "version") >= 118L) fromEncoding <- "UTF-8" } # varnames names(data) <- read.encoding(names(data), fromEncoding, encoding) # var.labels attr(data, "var.labels") <- read.encoding(var.labels, fromEncoding, encoding) # val.labels names(val.labels) <- read.encoding(val.labels, fromEncoding, encoding) attr(data, "val.labels") <- val.labels # label names(label) <- read.encoding(names(label), fromEncoding, encoding) if (length(label) > 0) { for (i in 1:length(label)) { names(label[[i]]) <- read.encoding(names(label[[i]]), fromEncoding, encoding) } attr(data, "label.table") <- label } # recode character variables for (v in (1:ncol(data))[types <= sstr]) { data[, v] <- iconv(data[, v], from=fromEncoding, to=encoding, sub="byte") } # expansion.field efi <- attr(data, "expansion.fields") if (length(efi) > 0) { efiChar <- unlist(lapply(efi, is.character)) for (i in (1:length(efi))[efiChar]) { efi[[i]] <- read.encoding(efi[[i]], fromEncoding, encoding) } attr(data, "expansion.fields") <- efi } if (version >= 117L) { #strl strl <- attr(data, "strl") if (length(strl) > 0) { for (i in 1:length(strl)) { strl[[i]] <- read.encoding(strl[[i]], fromEncoding, encoding) } attr(data, "strl") <- strl } } } var.labels <- attr(data, "var.labels") if (replace.strl & version >= 117L) { strl <- c("") names(strl) <- "00000000000000000000" strl <- c(strl, attr(data,"strl")) for (j in seq(ncol(data))[types == sstrl] ) { data[, j] <- strl[data[,j]] } # if strls are in data.frame remove attribute strl attr(data, "strl") <- NULL } if (convert.dates) { convert_dt_c <- function(x) as.POSIXct((x + 0.1) / 1000, origin = "1960-01-01") # avoid rounding down convert_dt_C <- function(x) { ls <- .leap.seconds + seq_along(.leap.seconds) + 315619200 z <- (x + 0.1) / 1000 # avoid rounding down z <- z - rowSums(outer(z, ls, ">=")) as.POSIXct(z, origin = "1960-01-01") } ff <- attr(data, "formats") ## dates <- grep("%-*d", ff) ## Stata 12 introduced 'business dates' ## 'Formats beginning with %t or %-t are Stata's date and time formats.' ## but it seems some are earlier. ## The dta_115 description suggests this is too inclusive: ## 'Stata has an old *%d* format notation and some datasets ## still have them. Format *%d*... is equivalent to modern ## format *%td*... and *%-d*... is equivalent to *%-td*...' dates <- grep("^%(-|)(d|td)", ff) ## avoid as.Date in case strptime is messed up base <- structure(-3653L, class = "Date") # Stata dates are integer vars for (v in dates) data[[v]] <- structure(base + data[[v]], class = "Date") for (v in grep("%tc", ff)) data[[v]] <- convert_dt_c(data[[v]]) for (v in grep("%tC", ff)) data[[v]] <- convert_dt_C(data[[v]]) } if (convert.factors) { vnames <- names(data) for (i in seq_along(val.labels)) { labname <- val.labels[i] vartype <- types[i] labtable <- label[[labname]] #don't convert columns of type double or float to factor if (labname %in% names(label)) { if((vartype == sdouble | vartype == sfloat)) { if(!nonint.factors) { warning(paste0("\n ",vnames[i], ":\n Factor codes of type double ", "or float detected - no labels assigned.\n Set ", "option nonint.factors to TRUE to assign labels ", "anyway.\n")) next } } # get unique values / omit NA varunique <- na.omit(unique(data[, i])) #check for duplicated labels labcount <- table(names(labtable)) if(any(labcount > 1)) { warning(paste0("\n ",vnames[i], ":\n Duplicated factor levels ", "detected - generating unique labels.\n")) labdups <- names(labtable) %in% names(labcount[labcount > 1]) # generate unique labels from assigned label and code number names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")") } # assign label if label set is complete if (all(varunique %in% labtable)) { data[, i] <- factor(data[, i], levels=labtable, labels=names(labtable)) # else generate labels from codes } else if (generate.factors) { names(varunique) <- as.character(varunique) gen.lab <- sort(c(varunique[!varunique %in% labtable], labtable)) data[, i] <- factor(data[, i], levels=gen.lab, labels=names(gen.lab)) # add generated labels to label.table gen.lab.name <- paste0("gen_",vnames[i]) attr(data, "label.table")[[gen.lab.name]] <- gen.lab attr(data, "val.labels")[i] <- gen.lab.name } else { warning(paste0("\n ",vnames[i], ":\n Missing factor labels - no ", "labels assigned.\n Set option generate.factors=T to ", "generate labels.")) } } } } if (add.rownames) { rownames(data) <- data[[1]] data[[1]] <- NULL } return(data) } readstata13/R/dbcal.R0000644000176200001440000001443313302177220013761 0ustar liggesusers# # Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . #' Parse Stata business calendar files #' #' Create conversion table for business calendar dates. #' #' @param stbcalfile \emph{stbcal-file} Stata buisness calendar file created by #' Stata. #' @return Returns a data.frame with two cols: #' \describe{ #' \item{range:}{The date matching the buisnesdate. Date format.} #' \item{buisdays:}{The Stata business calendar day. Integer format.} #' } #' @details Stata 12 introduced business calender format. Business dates are #' integer numbers in a certain range of days, weeks, months or years. In this #' range some days are omitted (e.g. weekends or holidays). If a business #' calendar was created, a stbcal file matching this calendar was created. This #' file is required to read the business calendar. This parser reads the stbcal- #' file and returns a data.frame with dates matching business calendar dates. #' #' A dta-file containing Stata business dates imported with read.stata13() shows #' in formats which stdcal file is required (e.g. "%tbsp500" requires #' sp500.stbcal). #' #' Stata allows adding a short description called purpose. This is added as an #' attribute of the resulting data.frame. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) #' @importFrom stats complete.cases #' @export stbcal <- function(stbcalfile) { # Otherwise localised dates will be used. lct <- Sys.getlocale("LC_TIME"); Sys.setlocale("LC_TIME", "C") # Parse full file stbcal <- file(stbcalfile, "rb") x <- readLines(stbcal, file.info(stbcalfile)$size) close(stbcal) # Dateformat can be ymd, ydm, myd, mdy, dym or dmy if(any(grepl("dateformat ymd", x))) dateformat <- "%Y%b%d" if(any(grepl("dateformat ydm", x))) dateformat <- "%Y%d%b" if(any(grepl("dateformat myd", x))) dateformat <- "%b%Y%d" if(any(grepl("dateformat mdy", x))) dateformat <- "%b%d%Y" if(any(grepl("dateformat dym", x))) dateformat <- "%b%Y%d" if(any(grepl("dateformat dmy", x))) dateformat <- "%d%b%Y" # Range of stbcal. Range is required, contains start and end. rangepos <- grep("range", x) range <- x[rangepos] range <- strsplit(range, " ") rangestart <- range[[1]][2] rangestop <- range[[1]][3] range <- seq(from= as.Date(rangestart, dateformat), to= as.Date(rangestop, dateformat), "days") # Centerdate of stbcal. Date that matches 0. centerpos <- grep("centerdate", x) centerdate <- x[centerpos] centerdate <- gsub("centerdate ","",centerdate) centerdate <- as.Date(centerdate, dateformat) # Omit Dayofweek omitdayofweekpos <- grep ("omit dayofweek", x) omitdayofweek <- x[omitdayofweekpos] # Mo, Tu, We, Th, Fr, Sa, Su daysofweek <- weekdays(as.Date(range)) stbcal <- data.frame(range = range, daysofweek=daysofweek) # Weekdays every week if (any(grepl("Mo", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Monday"] <- NA if (any(grepl("Tu", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Tuesday"] <- NA if (any(grepl("We", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Wednesday"] <- NA if (any(grepl("Th", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Thursday"] <- NA if (any(grepl("Fr", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Friday"] <- NA if (any(grepl("Sa", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Saturday"] <- NA if (any(grepl("Su", omitdayofweek))) stbcal$daysofweek[stbcal$daysofweek=="Sunday"] <- NA # Special days to be omitted if (any(grepl("omit date", x))) { dates <- grep("omit date", x) omitdates <- x[dates] omitdates <- gsub("omit date ", "", omitdates) dates <- as.Date(omitdates, dateformat) stbcal$daysofweek[which(stbcal$range%in%dates)] <- NA # Keep only wanted days stbcal$daysofweek behalten stbcal <- stbcal[complete.cases(stbcal$daysofweek),] } # In case centerdate is not rangestart: stbcal$buisdays <- NA stbcal$buisdays[stbcal$range==centerdate] <- 0 stbcal$buisdays[stbcal$rangecenterdate] <- seq( from=1, to=length(stbcal$range[stbcal$range>centerdate])) # Add purpose if (any(grepl("purpose", x))) { purposepos <- grep("purpose", x) purpose <- x[purposepos] attr(stbcal, "purpose") <- purpose } # restore locale Sys.setlocale("LC_TIME", lct) return(stbcal) } #' Convert Stata business calendar dates in readable dates. #' #' Convert Stata business calendar dates in readable dates. #' #' @param buisdays numeric Vector of business dates #' @param cal data.frame Conversion table for business calendar dates #' @param format character String with date format as in \code{\link{as.Date}} #' @return Returns a vector of readable dates. #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} #' @examples #' # read business calendar and data #' sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) #' dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) #' #' # convert dates and check #' dat$ldatescal2 <- as.caldays(dat$ldate, sp500) #' all(dat$ldatescal2==dat$ldatescal) #' @export as.caldays <- function(buisdays, cal, format="%Y-%m-%d") { rownames(cal) <- cal$buisdays dates <- cal[as.character(buisdays), "range"] if(!is.null(format)) as.Date(dates, format = format) return(dates) } readstata13/README.md0000644000176200001440000001535313302217043013647 0ustar liggesusers# readstata13 Package to read and write all Stata file formats (version 15 and older) into a R data.frame. The dta file format versions 102 to 118 are supported. The function ```read.dta``` from the foreign package imports only dta files from Stata versions <= 12. Due to the different structure and features of dta 117 files, we wrote a new file reader in Rcpp. Additionally the package supports many features of the Stata dta format like label sets in different languages (`?set.lang`) or business calendars (`?as.caldays`). ## Installation The package is now hosted on CRAN. ```R install.packages("readstata13") ``` ## Usage ```R library(readstata13) dat <- read.dta13("path to file.dta") save.dta13(dat, file="newfile.dta") ``` ## Development Version To install the current release from github you need the plattform specific build tools. On Windows a current installation of [Rtools](https://cran.r-project.org/bin/windows/Rtools/) is necessary, while OS X users need to install [Xcode](https://itunes.apple.com/us/app/xcode/id497799835). ```R # install.packages("devtools") devtools::install_github("sjewo/readstata13", ref="0.9.2") ``` Older Versions of devtools require a username option: ```R install_github("readstata13", username="sjewo", ref="0.9.2") ``` To install the current development version from github: ```R devtools::install_github("sjewo/readstata13", ref="testing") ``` ## Current Status [![Build Status](https://travis-ci.org/sjewo/readstata13.svg?branch=master)](https://travis-ci.org/sjewo/readstata13) [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13) ### Changelog and Features | Version | Changes | | ------ | ---------------------------------------------------- | | 0.9.2 | Fix Build on MacOS X | | | | | 0.9.1 | Allow reading only pre-selected variables | | 0.9.1 | Experimental support for format 119 | | 0.9.1 | Improvements to partial reading. Idea by Kevin Jin | | 0.9.1 | Export of binary data from dta-files | | 0.9.1 | new function get.label.tables() to show all Stata label sets | | 0.9.1 | Fix check for duplicate labels and in set.lang() | | | | 0.9.0 | Generate unique factor labels to prevent errors in factor definition | | 0.9.0 | check interrupt for long read. Patch by Giovanni Righi | | 0.9.0 | Updates to notes, roxygen and register | | 0.9.0 | Fixed size of character length. Bug reported by Yiming (Paul) Li | | 0.9.0 | Fix saving characters containing missings. Bug reported by Eivind H. Olsen | | 0.9.0 | Adjustments to convert.underscore. Patch by luke-m-olson | | 0.9.0 | Allow partial reading of selected rows | | | | 0.8.5 | Fix errors on big-endians systems | | | | 0.8.4 | Fix valgrind errors. converting from dta.write to writestr | | 0.8.4 | Fix for empty data label | | 0.8.4 | Make replace.strl default | | | | 0.8.3 | Restrict length of varnames to 32 chars for compatibility with Stata 14 | | 0.8.3 | Add many function tests | | 0.8.3 | Avoid converting of double to floats while writing compressed files | | | | 0.8.2 | Save NA values in character vector as empty string | | 0.8.2 | Convert.underscore=T will convert all non-literal characters to underscores | | 0.8.2 | Fix saving of Dates | | 0.8.2 | Save with convert.factors by default | | 0.8.2 | Test for NaN and inf values while writing missing values and replace with NA | | 0.8.2 | Remove message about saving factors | | | | 0.8.1 | Convert non-integer variables to factors (```nonint.factors=T```) | | 0.8.1 | Handle large datasets | | 0.8.1 | Working with strL variables is now a lot faster | | | | | <0.8.1 | Reading data files from disk or url and create a data.frame | | <0.8.1 | Saving dta files to disk - most features of the dta file format are supported | | <0.8.1 | Assign variable names | | <0.8.1 | Read the new strL strings and save them as attribute | | <0.8.1 | Convert stata label to factors and save them as attribute | | <0.8.1 | Read some meta data (timestamp, dataset label, formats,...) | | <0.8.1 | Convert strings to system encoding | | <0.8.1 | Handle different NA values | | <0.8.1 | Handle multiple label languages | | <0.8.1 | Convert dates | | <0.8.1 | Reading business calendar files | ### Test Since our attributes differ from foreign::read.dta all.equal and identical report false. If you check the values, everything is identical. ```R library("foreign") r12 <- read.dta("http://www.stata-press.com/data/r12/auto.dta") r13 <- read.dta13("http://www.stata-press.com/data/r13/auto.dta") Map(identical,r12,r13) att <- names(attributes(r12)) for (i in seq(att)) cat(att[i],":", all.equal(attr(r12,att[i]),attr(r13,att[i])),"\n") r12 <- read.dta("http://www.stata-press.com/data/r12/auto.dta",convert.factors=F) r13 <- read.dta13("http://www.stata-press.com/data/r13/auto.dta",convert.factors=F) Map(identical,r12,r13) ``` ## Authors [Marvin Garbuszus](mailto:jan.garbuszus@ruhr-uni-bochum.de) ([JanMarvin](https://github.com/JanMarvin)) and [Sebastian Jeworutzki](mailto:Sebastian.Jeworutzki@ruhr-uni-bochum.de) ([sjewo](https://github.com/sjewo)) ## Licence GPL2 readstata13/MD50000644000176200001440000000633013302354454012703 0ustar liggesusers937bcbd9a32c2dbb8ab22fde4102ddf6 *DESCRIPTION e8c1458438ead3c34974bc0be3a03ed6 *LICENSE a12456c41e1ef128bcbbf3a2177ed986 *NAMESPACE 07789d5571c706690651f1e17339f835 *NEWS 8aa1ee878b92e4641e471dc32ee3a0d2 *R/RcppExports.R bc600119e8b1dbcd7a8715aa40d959cf *R/dbcal.R 8f44ee6ee622eab57ad533ad22974cfe *R/read.R 29ce11e9849d698bed87f5fc9b1b4c72 *R/readstata13.R 3511e8b87b1bc0327b854f254f734b6a *R/save.R 3138b670a9496c66258c107d7c652e3a *R/tools.R c2fcfbe243a55982b4ad6e281574bc44 *README.md a885e4f610350825892c92d3ca858889 *inst/extdata/encode.do 1165031bfee6c9e6ce501baa24e3a7f1 *inst/extdata/encode.dta 23c478f4b7d45b7aabcc48a0f5795480 *inst/extdata/encodecp.dta b9463f13d2e57b2d0ee028368eefcd29 *inst/extdata/gen_fac.do 1530f9cdf1f80c39158ea8d249e19af0 *inst/extdata/gen_fac.dta d6127dcadbd1316ee9dafd18420f01b1 *inst/extdata/missings.do dcd880aca64cc264c0ba20ee9b8d1510 *inst/extdata/missings.dta d66c8a83373c17ab2098ca07b975a97e *inst/extdata/missings_lsf.dta 36d795506440d058f7506aa0a7b70989 *inst/extdata/missings_msf.dta 8204563fbdff2e7ee74951eb894c6154 *inst/extdata/nonint.do ed8842275b4ba33858fe0822ff3f178e *inst/extdata/nonint.dta 295396a1a55b4326d89d2c2a86e90441 *inst/extdata/sp500.stbcal 389e33d907d10ec8efe41250f99221ab *inst/extdata/statacar.do f899f302225e099f83de7ac42f0623f2 *inst/extdata/statacar.dta a4248360860c7223c04f2bda517994fd *inst/extdata/test.zip 1e29776eed16f780a9beee2d11ada4d4 *inst/extdata/underscore.do 18d63a094394dd93f3b4363fcd09f322 *inst/extdata/underscore.dta be3bdd7d0414f9b7b9770645b944320a *inst/include/read_data.h 0a650c8fbc76b901c624289b0676e825 *inst/include/read_dta.h a04dcc41e345cae0fa9351ce678c27e6 *inst/include/read_pre13_dta.h 3ffe5e453924a7c642a6bc5c1086ccc3 *inst/include/readstata.h 36c0ee1660a90fb2d8b961c558c3d145 *inst/include/statadefines.h fc806a4ead84a5b3c6bb4f00af91ebf3 *inst/include/swap_endian.h 3e936e81cffb62a119785e96d210b1e9 *man/as.caldays.Rd 0db337c2d06483d1cc9417c75903b4f5 *man/get.label.Rd 8dff90ecaf79055181b6666d45621b25 *man/get.label.name.Rd 1aeb1e5335f4e76bbe4b046a578a2b80 *man/get.label.tables.Rd 5a4700ab8b6e29b9ad1fd134a6c62977 *man/get.lang.Rd 3b2bb969adb3f8a26d5741cf467d470b *man/get.origin.codes.Rd f3c2ac88ad9ea19659f1d7c35f3d0ac9 *man/maxchar.Rd f78b325f1795fd27452a08b237029e04 *man/read.dta13.Rd 71f1e3ccae8375b9365488ba436eb934 *man/readstata13.Rd 628c37cd5fa01a2bd1c4b96de503fe0e *man/save.dta13.Rd bc27b06c1c1e566f8c3bcb49eadd61b6 *man/saveToExport.Rd 9dd790746cc83f755b65139c745e9c93 *man/set.label.Rd 67e025e2c70d6e96d54703a7b6654663 *man/set.lang.Rd ec29e8c38f6333f0f2ce706a95acff83 *man/stbcal.Rd 12fa4a2647f8413110876599ffee13dc *man/varlabel.Rd 5a37728c526310cfca2804ea6c29fb51 *src/Makevars 5a37728c526310cfca2804ea6c29fb51 *src/Makevars.win e18c84894d06a1255a18152473cb82f1 *src/RcppExports.cpp fd42fb972ec16877b4aa18510233c8db *src/read.cpp eb3d0bf081e4bcf4908d00bc75d61f51 *src/read_data.cpp fb7b806ab067a7ca7f2c51d36596a558 *src/read_dta.cpp 1f235376919f05ec7cb19f91597efd77 *src/read_pre13_dta.cpp fe0af5c98deb865283da1f241f6e0d8e *src/save_dta.cpp a9bcdd02d2270aaea3c54797bc5ce735 *src/save_pre13_dta.cpp 4dd91c288ce11a342d68442481e65e8b *tests/testthat.R ef487b80f9222b1adb3c3ee8dfd8ddde *tests/testthat/test_read.R b6696e529eead060c3c2475213a7c002 *tests/testthat/test_save.R readstata13/DESCRIPTION0000644000176200001440000000242613302354454014103 0ustar liggesusersPackage: readstata13 Type: Package Title: Import 'Stata' Data Files Version: 0.9.2 Authors@R: c( person("Jan Marvin", "Garbuszus", email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut")), person("Sebastian", "Jeworutzki", email="Sebastian.Jeworutzki@ruhr-uni-bochum.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2671-5253")), person("R Core Team", role="cph"), person("Magnus Thor", "Torfason", role="ctb"), person("Luke M.", "Olson", role="ctb"), person("Giovanni", "Righi", role="ctb"), person("Kevin", "Jin", role="ctb") ) Description: Function to read and write the 'Stata' file format. URL: https://github.com/sjewo/readstata13 BugReports: https://github.com/sjewo/readstata13/issues License: GPL-2 | file LICENSE Imports: Rcpp (>= 0.11.5) LinkingTo: Rcpp ByteCompile: yes Suggests: testthat RoxygenNote: 6.0.1 NeedsCompilation: yes Packaged: 2018-05-26 10:41:40 UTC; sj Author: Jan Marvin Garbuszus [aut], Sebastian Jeworutzki [aut, cre] (), R Core Team [cph], Magnus Thor Torfason [ctb], Luke M. Olson [ctb], Giovanni Righi [ctb], Kevin Jin [ctb] Maintainer: Sebastian Jeworutzki Repository: CRAN Date/Publication: 2018-05-26 22:01:16 UTC readstata13/man/0000755000176200001440000000000013302177220013136 5ustar liggesusersreadstata13/man/readstata13.Rd0000644000176200001440000000113113302177220015535 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/readstata13.R \docType{package} \name{readstata13} \alias{readstata13} \alias{readstata13-package} \title{Import Stata Data Files} \description{ Function to read the Stata file format into a data.frame. } \note{ If you catch a bug, please do not sue us, we do not have any money. } \seealso{ \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from Stata Versions < 13 } \author{ Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/saveToExport.Rd0000644000176200001440000000060013077075424016100 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{saveToExport} \alias{saveToExport} \title{Check if numeric vector can be expressed as interger vector} \usage{ saveToExport(x) } \arguments{ \item{x}{vector of data frame} } \description{ Compression can reduce numeric vectors as integers if the vector does only contain integer type data. } readstata13/man/get.label.name.Rd0000644000176200001440000000171113302177220016201 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.label.name} \alias{get.label.name} \title{Get Names of Stata Label Set} \usage{ get.label.name(dat, var.name = NULL, lang = NA) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character vector.} Variable names. If \code{NULL}, get names of all label sets.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} } \value{ Returns an named vector of variable labels } \description{ Retrieves the Stata label set in the dataset for all or an vector of variable names. } \details{ Stata stores factor labels in variable independent labels sets. This function retrieves the name of the label set for a variable. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/save.dta13.Rd0000644000176200001440000000565313302177220015307 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/save.R \name{save.dta13} \alias{save.dta13} \title{Write Stata Binary Files} \usage{ save.dta13(data, file, data.label = NULL, time.stamp = TRUE, convert.factors = TRUE, convert.dates = TRUE, tz = "GMT", add.rownames = FALSE, compress = FALSE, version = 117, convert.underscore = FALSE) } \arguments{ \item{data}{\emph{data.frame.} A data.frame Object.} \item{file}{\emph{character.} Path to the dta file you want to export.} \item{data.label}{\emph{character.} Name of the dta-file.} \item{time.stamp}{\emph{logical.} If \code{TRUE}, add a time.stamp to the dta-file.} \item{convert.factors}{\emph{logical.} If \code{TRUE}, factors will be converted to Stata variables with labels. Stata expects strings to be encoded as Windows-1252, so all levels will be recoded. Character which can not be mapped in Windows-1252 will be saved as hexcode.} \item{convert.dates}{\emph{logical.} If \code{TRUE}, dates will be converted to Stata date time format. Code from \code{foreign::write.dta}} \item{tz}{\emph{character.} The name of the timezone convert.dates will use.} \item{add.rownames}{\emph{logical.} If \code{TRUE}, a new variable rownames will be added to the dta-file.} \item{compress}{\emph{logical.} If \code{TRUE}, the resulting dta-file will use all of Statas numeric-vartypes.} \item{version}{\emph{numeric.} Stata format for the resulting dta-file either Stata version number (6 - 15) or the internal Stata dta-format (e.g. 117 for Stata 13). Experimental support for large datasets: Use version="15mp" to save the dataset in the new Stata 15/MP file format. This feature is not thoroughly tested yet.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, all non numerics or non alphabet characters will be converted to underscores.} } \value{ The function writes a dta-file to disk. The following features of the dta file format are supported: \describe{ \item{datalabel:}{Dataset label} \item{time.stamp:}{Timestamp of file creation} \item{formats:}{Stata display formats. May be used with \code{\link[base]{sprintf}}} \item{type:}{Stata data type (see Stata Corp 2014)} \item{var.labels:}{Variable labels} \item{version:}{dta file format version} \item{strl:}{List of character vectors for the new strL string variable type. The first element is the identifier and the second element the string.} } } \description{ \code{save.dta13} writes a Stata dta-file bytewise and saves the data into a dta-file. } \references{ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/as.caldays.Rd0000644000176200001440000000200713077075424015462 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbcal.R \name{as.caldays} \alias{as.caldays} \title{Convert Stata business calendar dates in readable dates.} \usage{ as.caldays(buisdays, cal, format = "\%Y-\%m-\%d") } \arguments{ \item{buisdays}{numeric Vector of business dates} \item{cal}{data.frame Conversion table for business calendar dates} \item{format}{character String with date format as in \code{\link{as.Date}}} } \value{ Returns a vector of readable dates. } \description{ Convert Stata business calendar dates in readable dates. } \examples{ # read business calendar and data sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) # convert dates and check dat$ldatescal2 <- as.caldays(dat$ldate, sp500) all(dat$ldatescal2==dat$ldatescal) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/maxchar.Rd0000644000176200001440000000117313103030636015050 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{maxchar} \alias{maxchar} \title{Check max char length of data.frame vectors} \usage{ maxchar(x) } \arguments{ \item{x}{vector of data frame} } \description{ Stata requires us to provide the maximum size of a charactervector as every row is stored in a bit region of this size. } \details{ Ex: If the max chars size is four, _ is no character in this vector: 1. row: four 3. row: one_ 4. row: ____ If a character vector contains only missings or is empty, we will assign it a value of one, since Stata otherwise cannot handle what we write. } readstata13/man/varlabel.Rd0000644000176200001440000000160113302177220015213 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{varlabel} \alias{varlabel} \alias{varlabel} \alias{varlabel<-} \title{Get and assign Stata Variable Labels} \usage{ varlabel(dat, var.name = NULL, lang = NA) varlabel(dat) <- value } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character vector.} Variable names. If NULL, get label for all variables.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} \item{value}{\emph{character vector.} Vector of variable names.} } \value{ Returns an named vector of variable labels } \description{ Retrieve or set variable labels for a dataset. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.origin.codes.Rd0000644000176200001440000000205713302177220016572 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.origin.codes} \alias{get.origin.codes} \title{Get Origin Code Numbers for Factors} \usage{ get.origin.codes(x, label.table) } \arguments{ \item{x}{\emph{factor.} Factor to obtain code for} \item{label.table}{\emph{table.} Table with factor levels obtained by \code{\link{get.label}}.} } \value{ Returns an integer with original codes } \description{ Recreates the code numbers of a factor as stored in the Stata dataset. } \details{ While converting numeric variables into factors, the original code numbers are lost. This function reconstructs the codes from the attribute \code{label.table}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) labname <- get.label.name(dat,"type") labtab <- get.label(dat, labname) # comparsion get.origin.codes(dat$type, labtab) as.integer(dat$type) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.lang.Rd0000644000176200001440000000166313302177220015132 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.lang} \alias{get.lang} \title{Show Default Label Language} \usage{ get.lang(dat, print = T) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{print}{\emph{logical.} If \code{TRUE}, print available languages and default language.} } \value{ Returns a list with two components: \describe{ \item{languages:}{Vector of label languages used in the dataset} \item{default:}{Name of the actual default label language, otherwise NA} } } \description{ Displays informations about the defined label languages. } \details{ Stata allows to define multiple label sets in different languages. This functions reports the available languages and the selected default language. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.label.tables.Rd0000644000176200001440000000140513302177220016533 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.label.tables} \alias{get.label.tables} \title{Get all Stata Label Sets for a Data.frame} \usage{ get.label.tables(dat) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} } \value{ Returns a named list of label tables } \description{ Retrieve the value labels for all variables. } \details{ This function returns the factor levels which represent a Stata label set for all variables. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) get.label.tables(dat) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/get.label.Rd0000644000176200001440000000170113302177220015261 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{get.label} \alias{get.label} \title{Get Stata Label Table for a Label Set} \usage{ get.label(dat, label.name) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{label.name}{\emph{character.} Name of the Stata label set} } \value{ Returns a named vector of code numbers } \description{ Retrieve the value labels for a specific Stata label set. } \details{ This function returns the table of factor levels which represent a Stata label set. The name of a label set for a variable can be obtained by \code{\link{get.label.name}}. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) labname <- get.label.name(dat,"type") get.label(dat, labname) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/set.label.Rd0000644000176200001440000000172613302177220015304 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{set.label} \alias{set.label} \title{Assign Stata Labels to a Variable} \usage{ set.label(dat, var.name, lang = NA) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{var.name}{\emph{character.} Name of the variable in the data.frame} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} } \value{ Returns a labeled factor } \description{ Assign value labels from a Stata label set to a variable. If duplicated labels are found, unique labels will be generated according the following scheme: "label_(integer code)". Levels without labels will become . } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13"), convert.factors=FALSE) # compare vectors set.label(dat, "type") dat$type # German label set.label(dat, "type", "de") } readstata13/man/stbcal.Rd0000644000176200001440000000274413077075424014720 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/dbcal.R \name{stbcal} \alias{stbcal} \title{Parse Stata business calendar files} \usage{ stbcal(stbcalfile) } \arguments{ \item{stbcalfile}{\emph{stbcal-file} Stata buisness calendar file created by Stata.} } \value{ Returns a data.frame with two cols: \describe{ \item{range:}{The date matching the buisnesdate. Date format.} \item{buisdays:}{The Stata business calendar day. Integer format.} } } \description{ Create conversion table for business calendar dates. } \details{ Stata 12 introduced business calender format. Business dates are integer numbers in a certain range of days, weeks, months or years. In this range some days are omitted (e.g. weekends or holidays). If a business calendar was created, a stbcal file matching this calendar was created. This file is required to read the business calendar. This parser reads the stbcal- file and returns a data.frame with dates matching business calendar dates. A dta-file containing Stata business dates imported with read.stata13() shows in formats which stdcal file is required (e.g. "%tbsp500" requires sp500.stbcal). Stata allows adding a short description called purpose. This is added as an attribute of the resulting data.frame. } \examples{ sp500 <- stbcal(system.file("extdata/sp500.stbcal", package="readstata13")) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/set.lang.Rd0000644000176200001440000000205413302177220015141 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tools.R \name{set.lang} \alias{set.lang} \title{Assign Stata Language Labels} \usage{ set.lang(dat, lang = NA, generate.factors = FALSE) } \arguments{ \item{dat}{\emph{data.frame.} Data.frame created by \code{read.dta13}.} \item{lang}{\emph{character.} Label language. Default language defined by \code{\link{get.lang}} is used if NA} \item{generate.factors}{\emph{logical.} If \code{TRUE}, missing factor levels are generated.} } \value{ Returns a data.frame with value labels in language "lang". } \description{ Changes default label language for a dataset. Variables with generated labels (option generate.labels=TRUE) are kept unchanged. } \examples{ dat <- read.dta13(system.file("extdata/statacar.dta", package="readstata13")) get.lang(dat) varlabel(dat) # set German label datDE <- set.lang(dat, "de") get.lang(datDE) varlabel(datDE) } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/man/read.dta13.Rd0000644000176200001440000001341613302177220015260 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read.R \name{read.dta13} \alias{read.dta13} \title{Read Stata Binary Files} \usage{ read.dta13(file, convert.factors = TRUE, generate.factors = FALSE, encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors = FALSE, select.rows = NULL, select.cols = NULL, strlexport = FALSE, strlpath = ".") } \arguments{ \item{file}{\emph{character.} Path to the dta file you want to import.} \item{convert.factors}{\emph{logical.} If \code{TRUE}, factors from Stata value labels are created.} \item{generate.factors}{\emph{logical.} If \code{TRUE} and convert.factors is TRUE, missing factor labels are created from integers. If duplicated labels are found, unique labels will be generated according the following scheme: "label_(integer code)".} \item{encoding}{\emph{character.} Strings can be converted from Windows-1252 or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify target encoding explicitly. Stata 14 and 15 files are UTF-8 encoded and may contain strings which can't be displayed in the current locale. Set encoding=NULL to stop reencoding.} \item{fromEncoding}{\emph{character.} We expect strings to be encoded as "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14 or newer "UTF-8" is used. In some situation the used encoding can differ for Stata 14 files and must be manually set.} \item{convert.underscore}{\emph{logical.} If \code{TRUE}, "_" in variable names will be changed to "."} \item{missing.type}{\emph{logical.} Stata knows 27 different missing types: ., .a, .b, ..., .z. If \code{TRUE}, attribute \code{missing} will be created.} \item{convert.dates}{\emph{logical.} If \code{TRUE}, Stata dates are converted.} \item{replace.strl}{\emph{logical.} If \code{TRUE}, replace the reference to a strL string in the data.frame with the actual value. The strl attribute will be removed from the data.frame (see details).} \item{add.rownames}{\emph{logical.} If \code{TRUE}, the first column will be used as rownames. Variable will be dropped afterwards.} \item{nonint.factors}{\emph{logical.} If \code{TRUE}, factors labels will be assigned to variables of type float and double.} \item{select.rows}{\emph{integer.} Vector of one or two numbers. If single value rows from 1:val are selected. If two values of a range are selected the rows in range will be selected.} \item{select.cols}{\emph{character:} Vector of variables to select.} \item{strlexport}{\emph{logical:} Should strl content be exported as binary files?} \item{strlpath}{\emph{cahracter:} Path for strl export.} } \value{ The function returns a data.frame with attributes. The attributes include \describe{ \item{datalabel:}{Dataset label} \item{time.stamp:}{Timestamp of file creation} \item{formats:}{Stata display formats. May be used with \code{\link{sprintf}}} \item{types:}{Stata data type (see Stata Corp 2014)} \item{val.labels:}{For each variable the name of the associated value labels in "label"} \item{var.labels:}{Variable labels} \item{version:}{dta file format version} \item{label.table:}{List of value labels.} \item{strl:}{Character vector with long strings for the new strl string variable type. The name of every element is the identifier.} \item{expansion.fields:}{list providing variable name, characteristic name and the contents of Stata characteristic field.} \item{missing:}{List of numeric vectors with Stata missing type for each variable.} \item{byteorder:}{Byteorder of the dta-file. LSF or MSF.} \item{orig.dim:}{Dimension recorded inside the dta-file.} } } \description{ \code{read.dta13} reads a Stata dta-file and imports the data into a data.frame. } \details{ If the filename is a url, the file will be downloaded as a temporary file and read afterwards. Stata files are encoded in ansinew. Depending on your system's default encoding certain characters may appear wrong. Using a correct encoding may fix these. Variable names stored in the dta-file will be used in the resulting data.frame. Stata types char, byte, and int will become integer; float and double will become numerics. R only knows a single missing type, while Stata knows 27, so all Stata missings will become NA in R. If you need to keep track of Statas original missing types, you may use \code{missing.type=TRUE}. Stata dates are converted to R's Date class the same way foreign handles dates. Stata 13 introduced a new character type called strL. strLs are able to store strings up to 2 billion characters. While R is able to store strings of this size in a character vector, the printed representation of such vectors looks rather cluttered, so it's possible to save only a reference in the data.frame with option \code{replace.strl=FALSE}. In R, you may use rownames to store characters (see for instance \code{data(swiss)}). In Stata, this is not possible and rownames have to be stored as a variable. If you want to use rownames, set add.rownames to TRUE. Then the first variable of the dta-file will hold the rownames of the resulting data.frame. Reading dta-files of older and newer versions than 13 was introduced with version 0.8. } \note{ read.dta13 uses GPL 2 licensed code by Thomas Lumley and R-core members from foreign::read.dta(). } \references{ Stata Corp (2014): Description of .dta file format \url{http://www.stata.com/help.cgi?dta} } \seealso{ \code{\link[foreign]{read.dta}} in package \code{foreign} and \code{memisc} for dta files from Stata versions < 13 and \code{read_dta} in package \code{haven} for Stata version >= 13. } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} Sebastian Jeworutzki \email{sebastian.jeworutzki@ruhr-uni-bochum.de} } readstata13/LICENSE0000644000176200001440000004315213077075424013411 0ustar liggesusersGNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. {description} Copyright (C) {year} {fullname} This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. {signature of Ty Coon}, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License.