data.table/0000755000175100001440000000000013174561362012277 5ustar hornikusersdata.table/inst/0000755000175100001440000000000013172212367013250 5ustar hornikusersdata.table/inst/tests/0000755000175100001440000000000013172210047014403 5ustar hornikusersdata.table/inst/tests/fread_line_error.csv0000644000175100001440000004052613172210047020430 0ustar hornikusers3,2-0-6 4:2:7.4 HAV,2,7,0,2,4,RI Y#4,T/U,2,1,1,Q,,OX~JA LI~BC,,OZ~4~FB,,,,,5,,.Q8_2_0W_8_1_7_L-4-U-5_1YSV-S-3-5.X 4,2-1-3 4:6:2.1 MIC,4,,7,0,1,LN V#4,S/BK [LR QT],3,3,7,V RX IF: KU XB,VX~IE TW~LA,FP~NC IK~KJ,HV~4~FP,AW~5~SP,,,,,4,,.A5_0_2U_6_6_6_M-0-Z-2_0KJL-U-3-8.L 5,2-5-4 6:2:8.4 RUF,1,3,4,6,5,UO Y#2,O/VQ I K K ,8,5,3,P TN UP: YC PD,HT~AI OL~SL,LL~UX US~QO,OF~0~CU,IQ~3~PZ,,,,,1,,.M8_4_3Q_5_2_4_V-8-J-1_7SXB-O-6-7.J 8,4-5-8 2:6:0.2 NWF,3,,3,3,8,ZC S#2,P/HF [UU],5,1,3,P JZ: XT,RE~VL QW~IO,PF~ST KJ~JY,CG~8~EI,HF~3~RE,,,,,3,VS [CP] Z L A,.K3_8_0N_1_4_8_Q-0-C-6_3MOQ-K-2-7.C 4,4-5-8 6:0:2.0 GQP,5,0,6,4,2,DY Y#5,L/AH QE Q NL NY,5,8,4,R FZ: HO,,"WC ZA (5-8-6) 7 DHKABA ZA, SADGLRQ, WE 4~BWIR VD~BI",,"ZT ZJ (6-3-2) 5 LHXYPW NY, IWGFUFH, OF 1~6~DC",,,,,4,,.J0_0_4W_5_8_7_R-8-Q-3_7NXQ-E-8-2.S 6,4-5-5 1:8:3.8 PVV,2,6,2,7,5,YU G#8,H/BV O 'AC JB',7,0,3,L ZW DR,,EI 7~NDPR NX JV~WA,,IM 4~5~GR,,,,,1,,.C2_4_0V_5_0_8_V-5-O-8_0WVY-D-0-2.S 0,1-1-0 1:4:4.2 XIP,4,3,2,2,0,JA D#1,Q/MR T 'FV',1,6,8,Q HW,EY 5~ZLIQ IX AF~VM,HI 0~BWJJ ZF TU~TV,TI 2~7~KW,AT 7~8~JQK=8,,,,,2,,.R1_5_4B_4_0_8_J-0-I-5_4NFW-P-3-3.Q 1,4-4-2 2:0:6.6 YXJ,5,1,1,3,7,IP N#4,A/FI [IQ U],2,3,7,F IG U,,OG 8~JSJV KT MR~EF___R/G/H-O-R.R,,TW 0~1~~POWB SV ML~SQ___A/E/Y-I-H.K,BE 4~MNSG NT MN~LZ___O/M/L-R-S.W,ZG 4~5~SWZ=1___J/F/K-H-S.F,UP 2~4~ZKH=7___W/R/Y-H-D.H,,,,,7,,.Y5_1_7M_1_3_8_A-2-D-5_7FCM-A-5-4.R 3,4-0-5 2:1:0.6 YAP,,,,,,SE Y#6,A/GU [CT >],7,7,7,,,,,,,,,,6,,.R7_1_8Y_5_5_2_D-6-X-0_6UUM-J-6-3.W 32,0-0-4 6:6:3.5 XJZ,,,,,,BZ T#3,W/UN [AT],4,2,7,,,,,,,,,,8,,.U5_5_8H_7_6_0_U-5-J-7_2GNY-J-3-5.X 31,3-0-7 4:1:7.5 HVV,,,,,,NK K#6,TT A 
R,4,4,7,,,,,,,,,,5,,#O,M,B.Y,Q.B,N.O,U.M.V,G.P.J,L,J,G.L,C.G,P,O.U,L.Y.U,F.W.D,H.N.R,S.R.N,N.R,A.N,G.N,H.EHE,R,S,V 42,2-3-8 0:4:4.7 YOB,4,5,4,3,4,YM D#2,J/G,0,6,7,G,,GF~PY OE~IW,,NM~7~MV,,,,,3,,.X2_7_2H_1_8_6_S-1-Y-8_2SDZ-T-6-0.X 43,8-2-5 0:6:7.5 DKY,3,,8,5,7,HD K#6,X/FL [RW SK],6,7,1,J CO EC: LU VO,YC~PR XU~GV,QX~VK GC~LR,RP~7~OI,YB~0~UR,,,,,3,,.Z1_3_2G_8_5_1_E-1-Z-2_1XHH-Y-3-1.H 73,8-1-6 7:5:2.5 CZB,8,7,3,4,5,ZD A#7,A/JK O C L ,3,0,1,N DT WY: WC UU,GR~XX AG~DG,JR~VN VX~AF,ZK~5~GG,GY~0~WC,,,,,5,,.D4_2_4T_0_7_8_R-6-H-3_4XSR-C-3-4.F 00,1-7-6 6:2:5.1 TAX,0,,1,1,8,NR P#5,O/IF [JL],2,8,1,O FD: GT,XA~DD QO~OP,ZQ~ES GO~XR,PX~8~LR,UG~0~CQ,,,,,5,LV [IZ] R S U,.Z3_5_3K_1_1_2_M-7-P-1_2WUR-T-8-7.F 60,1-3-3 1:3:2.8 VSC,1,7,5,6,4,DU C#8,V/DD MO B WU JL,3,2,8,E WR: WB,,"JU MW (7-5-5) 4 ZGKIRMRLJ LJ, QVQPMBPTZUVP, KK 7~JBMJ JD~WF",,"ZQ CP (5-1-2) 2 WEUCFZTKY FL, NPSTRXNZNLEB, YW 8~7~UI",,,,,0,,.X1_7_4M_3_6_2_B-3-L-8_7WWD-F-0-8.Q 04,4-8-5 4:0:8.8 DQU,0,0,0,5,7,LF X#7,D/WD X 'MU EV',7,4,5,T JU MB,,FY 2~PPCQ QC LR~AG,,EV 1~4~PS,,,,,2,,.F7_5_2J_3_3_3_D-5-Z-1_5KWT-L-4-6.R 05,6-1-2 0:3:6.1 CQP,3,3,0,3,5,GB G#3,A/OH G 'VB',3,4,0,O VD,JT 6~WSQK GM XF~AN,LL 6~EXTZ CX VZ~LL,CM 7~2~VU,ZU 4~7~WBQ=0,,,,,7,,.X8_3_2S_6_5_7_J-1-B-8_5LDR-I-6-6.R 66,8-5-6 5:7:0.8 QQX,0,8,3,7,4,RR K#1,V/TO [IR Q],7,8,4,O QH Q,,IK 5~XHED HG WB~KZ___W/G/P-C-J.K,,BH 5~8~RTY=5___X/G/O-B-I.M,,,,,1,,.A8_5_5O_3_5_8_B-5-H-3_8XGC-O-4-5.P 37,3-6-3 1:7:3.1 UII,1,,5,2,0,VJ F#4,D/FN P K J ,7,4,5,"T #2 - OH O-U X D#3 - TO V X-J G#8 - TJ K N#2 - EA R#0 - ZU Q-K#4 - KR I-F#4 - WP O#2 - LM A#1 - ON D M#5 - XN O#3 - PH A F S#5 - WC R#7 - EE E#4 - QJ P#2 - JW M M#0 - JA U Z#8 - PO W#0 - GO M E#6 - VQ D C#8 - IG I O#6 - XS J K#6 - E & R Y#3 - MB D W J#6 - QJ M#1 - XW V F#3 - YE P K Q#2 - XO L T#6 - XS L F#4 - WM B/Z#1 - PH P#4 - DM Y N#6 - OM B L#4 - MM E Q#8 - XO N W O A#2 - RP J B#3 - JP X S#7 - AA S S R#3 - KD O#5 - EQ G#4 - WM J Q#8 - GW A G R#0 - OW X#6 - HQ U#0 - XJ N#7 - WR/M I#3 - ND P#7 - BV W D I#2 - TT W#4 - VP Y O#7 - RM 
P#7 - UZ K Q X#1 - PM Y#6 - DF J N#4 - JX O Z#7 - KW R W#3 - ZL O U#4 - TD X B#4 - XM#2 - RH#4 - GT#8 - BU E A#2 - BO#1 - JV#3 - DU G K#7 - DU C#2 - RP W L#8 - RF#8 - DC N#7 - WD R#5 - QX X H W#8 - YN X S G#6 - DE H J#6 - ZY O N#7 - DA-T K Y#8 - RD D Q#6 - US S S#2 - LJ R I#0 - IP V K#4 - XD J#7 - MS O C L#8 - JA W V#7 - IK P#0 - LK C#5 - YA P K J Z#3 - FB G N#5 - JT G#6 - CJ L D#1 - YL Y I S#0 - ZS Y W I#2 - YP V R#7 - RE Q T#5 - RA A#8 - RR H K#8 - UO J#5 - KJ Z#3 - MO N H#3 - DX 0/0 K N#5 - ST F Q#2 - JT F Y#4 - WP B S#3 - NN V#4 - DV/M D#1 - LR#3 - MB V#8 - JM U#6 - LX J W#2 - OX#5 - VH P#3 - JD B L#4 - UM E Z#2 - VT M#7 - YA N#3 - PX L#1 - CA A#0 - NJ N#5 - ES/M B Y#6 - KT/O V J#4 - OU/U E W#7 - SJ N O T#0 - PW A V G#7 - JP T#7 - LV Z-L O#0 - JD J#1 - NT A#2 - QV P H#4 - DC M G#,EE 5~ZTLG CK BJ~FR___K/T/N-E-D.T,FF 1~HZKW YA RI~KR___U/B/K-C-Q.G,YP 8~7~HBN=1___Y/C/A-F-Q.L,RR 3~8~OIZ=7___S/O/P-H-A.O,,,,,7,,.A5_0_1E_8_5_3_I-5-N-4_7RSS-U-3-5.E 8,6-4-8 8:3:1.6 TAB,,,,,,CT U#3,S/QB [OW >],4,3,4,,,,,,,,,,6,,.E8_7_8W_8_4_3_M-1-H-7_4NAB-Q-2-2.K 4,3-8-4 5:4:2.5 TZQ,,,,,,RT A#5,L/XQ [FW],8,0,5,,,,,,,,,,2,,.X8_6_5J_7_1_3_H-2-P-8_3AJN-T-8-1.G 8,3-4-7 6:3:4.3 UTS,,,,,,VM U#1,AI L R,3,3,1,,,,,,,,,,8,,#U,C,Z.Y,I.Q,D.U,I.H.F,B.Q.R,C,M,A.R,I.S,H,Y.W,L.A.B,J.N.E,Z.D.Q,K.U.S,T.C,C.I,G.P,Z.UVV,E,I,D 1,0-6-3 3:2:2.2 JRF,8,8,3,8,4,CW E#0,Y/S,4,3,1,Y,,CZ~HC PA~CZ,,YE~4~JD,,,,,0,,.J3_4_5Y_8_0_7_B-4-W-4_0BVK-C-4-8.C 3,6-0-8 3:4:8.4 RBA,8,,4,3,1,SU K#5,N/DB [FI YM],3,5,1,D BA NR: TR KS,PS~BN EI~IS,EH~XD OX~TE,YP~5~GN,FL~7~QD,,,,,1,,.I1_0_4F_5_4_8_Q-0-S-8_4RFJ-Z-1-7.L 4,0-3-8 6:8:5.7 SWO,2,2,6,5,4,DO B#3,K/UQ X X U ,0,0,4,X ZM HP: WE LE,GQ~UD VR~CT,JY~CX ZJ~BU,FP~1~XG,CL~0~AE,,,,,2,,.P2_6_8H_7_5_7_X-5-H-2_2BBT-G-0-0.Z 3,6-5-5 0:8:5.1 BVF,,,,,,IC B#4,Z/WN [FX],1,7,5,,,,,,,,,,6,YV [IJ] E Y R,.V8_1_1C_2_0_5_H-7-H-7_1RHX-A-5-0.G 6,5-3-6 8:6:7.5 ERK,3,8,2,3,5,JV T#1,S/ET WR F UX ID,3,6,4,R HH: XV,,"CW BE (7-7-3) 4 KIWFPXJXN NZ, SOTNRINKIO, VY 8~JSGW BI~XX",,"XF OT (1-1-3) 5 WNPAQEUDE TA, 
ILWAPWMVVQ, BS 3~1~FU",,,,,6,,.Z7_4_0A_2_8_8_E-8-S-5_8UVU-F-3-2.I 1,5-6-4 3:6:3.1 WWT,2,5,6,6,4,TY L#4,G/WT M 'VP TZ',4,1,1,P WM CX,,WT 1~ZOVQ NF MF~KV,,MA 7~0~DG,,,,,0,,.F6_7_7U_2_2_2_X-6-Q-0_1NQD-T-0-3.R 0,8-8-5 1:3:1.8 AMA,5,8,7,4,6,TN P#2,Y/ID E 'XH',6,6,2,W FB,EC 7~HAEK VZ FW~EJ,EI 5~LAKE QM OQ~IB,EL 8~4~PA,WH 3~0~HJG=3,,,,,5,,.G2_3_7Q_2_5_1_Z-6-Q-3_6YVL-L-2-3.R 0,3-1-8 0:6:1.4 QPY,7,1,7,2,1,EC D#6,I/GA [MV O],0,3,2,Y,,YL 8~GLOC VN SG~KI___H/B/V-S-I.R,,HF 7~5~~PENK UI SV~HR___Y/M/G-U-M.L,EL 2~ABUU LO FH~JM___G/Y/Q-L-P.U,JB 6~4~UXG=0___Y/O/M-D-Z.U,UQ 2~2~RES=1___Y/I/E-J-V.H,,,,,2,,.Y5_8_2D_3_8_6_Z-0-K-4_3VRF-H-5-7.R 8,0-7-2 3:0:6.8 DFS,,,,,,JA L#0,I/KT [YM >],7,1,6,,,,,,,,,,2,,.X4_0_3Y_6_4_1_M-7-K-1_7JYJ-A-0-2.A 7,7-0-8 2:2:0.0 DZE,,,,,,RW F#6,K/SA [NJ],4,7,0,,,,,,,,,,0,,.P3_5_8X_4_0_0_H-8-L-8_0UJI-F-0-3.I 7,5-7-1 5:6:0.8 UVP,,,,,,GC T#1,PP W V,5,2,1,,,,,,,,,,3,,#G,N,M.J,V.Z,Y.C,I.A.Y,T.H.Y,B,R,U.C,D.H,N,K.I,T.Q.R,H.F.T,A.Y.V,G.O.G,T.R,F.Q,Z.U,V.ACI,N,S,R 8,4-3-1 3:2:1.2 XYP,1,6,5,2,1,CT R#3,R/R,5,4,0,S,,QS~CX FS~QR,,KR~6~GD,,,,,8,,.M6_8_5P_3_1_4_W-7-U-5_4IST-X-5-0.A 6,5-6-8 1:1:1.1 UGK,3,,5,5,5,WO Y#8,V/OP [AM RF],5,0,8,O LG DH: ID JZ,IT~SP DI~CB,DG~IZ EA~OG,JR~3~FO,XQ~8~AR,,,,,2,,.T7_3_2H_1_5_2_X-0-Y-3_3HQX-Y-2-1.J 2,2-6-7 6:7:8.0 ZCI,8,1,2,8,7,EJ S#6,L/EQ Q Q K ,7,3,6,G US LD: YN EA,LG~QE DZ~AR,IV~BS SQ~CY,QR~2~XJ,BJ~8~EW,,,,,7,,.F1_3_1C_3_8_0_Q-6-Z-4_1WLV-G-5-5.U 0,6-6-0 5:6:1.4 LNI,,,,,,ZG U#5,V/FH [EN],3,8,1,,,,,,,,,,1,MM [JE] O A H,.W0_6_0Y_1_5_7_B-0-S-5_7VSF-W-8-7.L 1,6-3-8 8:7:5.4 WVP,2,6,2,4,0,NL Z#7,O/CH AZ Z UM FP,8,8,0,P HU: MF,,"RD CT (4-6-0) 7 TFDTX MI DF JAN PA 7, KHIBTPIN, ST 6~YDXA DA~YJ",,"SB GQ (6-5-4) 5 CCEAP LQ IB LGX HL 2, FPPJAYEO, KE 1~0~GG",,,,,5,,.I4_2_6L_6_0_7_F-0-C-0_6BGG-S-0-0.R 4,2-5-5 3:4:4.1 VMF,1,1,4,3,5,XT H#1,V/JL A 'MQ VN',5,1,4,M NM FA,,AD 4~YTDX DC AB~FX,,BQ 8~5~EF,,,,,1,,.R4_3_7U_6_5_1_O-0-R-0_2JGX-P-8-1.F 7,7-0-3 1:3:8.4 WQO,6,6,3,8,1,FP K#1,M/RG O 'UI',2,6,3,R TS,XJ 8~AXFD XB KB~OT,YB 0~TPVY RY SL~WZ,YC 8~6~IP,FI 
8~6~LVY=2,,,,,5,,.M5_8_0A_4_1_4_C-2-H-1_0QUX-V-3-1.G 1,4-1-2 5:2:6.1 HHF,5,4,6,6,0,NY Q#8,W/BS [SN B],3,4,0,D ZW Y,,WK 3~UMJT PZ XD~LJ___G/D/R-U-J.H,,LO 7~5~~OSKA GM YJ~FT___I/G/X-Q-B.J,QR 2~AGXF XM BT~JY___M/E/H-V-M.F,SN 1~1~BUA=5___U/U/O-G-A.I,XL 0~7~GKZ=3___V/F/P-M-N.A,,,,,6,,.D0_5_2O_4_7_2_E-6-C-2_3PJV-A-8-3.H 8,3-6-5 3:1:6.5 CXO,,,,,,BG Y#2,X/DO [ZM >],6,8,1,,,,,,,,,,2,,.W5_2_1A_7_0_8_T-8-S-8_8QMR-V-3-7.S 1,8-0-1 1:1:0.4 YYA,,,,,,RK V#5,Z/PP [PG],1,0,0,,,,,,,,,,2,,.U3_0_6Z_6_1_6_N-4-J-4_7OCG-L-8-2.N 7,7-8-4 3:8:4.4 UNN,,,,,,ED G#3,TH V S,8,1,1,,,,,,,,,,3,,#S,B,K.X,W.B,Z.T,Z.I.G,J.D.E,V,R,K.D,Y.Y,M,X.R,I.Z.U,R.X.O,C.J.F,T.S.G,D.U,Q.Y,N.Z,W.LEU,J,W,J 0,7-0-1 8:1:6.0 WNL,3,8,8,0,5,IO Z#4,G/C,3,4,1,C,,GG~YA OO~DY,,WH~3~XU,,,,,0,,.Q1_6_8P_3_3_7_B-3-L-4_1IVB-T-8-7.Q 1,5-2-3 2:0:6.8 OZK,0,,8,2,0,LO M#5,J/TZ [JB WX],0,7,8,H HD GG: TZ KO,MV~UI EB~KI,WQ~CV RX~IY,MA~3~EY,LH~0~JJ,,,,,3,,.M4_5_3S_2_3_1_H-0-Z-1_4IXO-G-6-1.F 0,8-6-8 8:6:7.4 QMK,1,6,2,0,7,XU S#3,H/FJ P O U ,1,3,1,H KY XN: FK DQ,WD~DC QY~PC,VI~UA FC~QL,ZJ~1~HF,XM~1~SI,,,,,1,,.W5_4_8L_4_7_7_P-6-D-0_7KGP-K-7-5.Q 6,0-3-0 4:6:8.8 VWA,8,,7,0,6,EA Y#8,P/CJ [IC],2,0,2,Y DL: TU,MP~CA JO~NW,ZK~NC EH~PE,AN~1~JD,HT~3~RI,,,,,8,LV [EO] M C R,.J0_8_1V_5_6_5_Q-2-M-6_8LIG-V-4-2.G 6,8-4-3 3:4:2.6 JMA,2,6,1,2,2,FP W#4,H/ZB AR H WJ GS,0,8,2,R PD: AQ,,"DG DC (8-4-0) 7 BEDE ZASLK TA, MCQABRT, BO 8~DQKU NV~FE",,"YY ZT (7-7-3) 3 XJRK SZYWB DB, IFXPFAV, YT 1~4~TQ",,,,,6,,.F6_6_4Y_5_0_6_U-8-I-5_4BXI-Q-8-5.V 7,6-1-8 2:5:4.3 SIM,7,0,8,7,4,UV M#0,J/RM Z 'SM SX',1,6,5,A RF ZX,,AF 5~MRCV QY KI~VG,,LG 3~1~CK,,,,,4,,.Q5_1_5P_3_1_3_A-6-F-0_7QBH-N-5-8.J 2,4-8-7 7:4:5.0 MOE,3,7,8,2,1,AI B#6,G/TI S 'NB',5,7,4,N IY,DD 5~QLTA VK YV~RP,TE 6~EGFG AH BX~OT,GP 2~6~PH,MO 8~3~ALM=1,,,,,4,,.E1_7_7F_5_7_3_W-0-R-7_2CXX-Z-7-2.Z 2,0-2-3 8:2:1.7 WUW,8,1,5,3,0,FU L#3,C/ON [GA T],5,6,7,A HK E,,AI 7~FVYO IQ BE~JK___X/X/K-M-V.M,,IP 1~5~~YYFK LM IX~CF___U/I/C-P-L.U,BV 0~IQZK BD XZ~YC___D/R/V-D-X.K,AO 8~8~DGE=7___Q/A/P-U-K.H,PA 
1~8~OZU=8___N/Y/B-A-E.H,,,,,2,,.Z5_5_6F_1_2_5_H-6-X-0_6HUX-Q-4-2.E 0,8-3-5 8:3:1.7 HOA,,,,,,FJ V#7,Z/KQ [AA >],6,2,1,,,,,,,,,,5,,.R7_1_7O_6_1_7_S-7-H-8_7YYL-Y-2-1.Q 8,2-1-5 3:7:4.6 ZNW,,,,,,IV J#7,M/XA [OO],0,1,2,,,,,,,,,,5,,.P5_6_3S_5_3_0_P-0-N-3_2URR-O-1-7.C 8,2-5-0 5:0:3.0 FXN,,,,,,AT F#1,WX I I,6,8,3,,,,,,,,,,8,,#V,I,N.K,L.Y,C.V,D.B.T,K.E.M,L,Q,R.Y,J.L,L,V.Z,E.Y.R,F.F.G,O.W.R,N.U.Z,E.P,B.T,Q.Y,Z.UOD,S,Y,J 8,3-3-3 1:8:2.3 TOO,3,8,2,1,3,YA I#5,S/I,2,5,4,H,,KT~IW UG~LN,,LE~4~IY,,,,,4,,.B5_3_2M_5_7_4_L-1-D-2_7JLS-R-6-5.I 8,7-1-7 3:7:8.2 UNV,7,,6,7,0,LF C#6,B/RM [LV DA],1,0,8,K OM HO: FF OM,QV~VO PL~EB,AR~NU FL~JR,LS~4~TQ,QD~5~BX,,,,,7,,.H0_0_7O_3_6_3_X-4-J-0_0JQM-D-6-8.R 8,8-4-0 5:5:1.0 APJ,3,7,4,2,5,MM H#1,N/II G J V ,3,1,6,I EQ AB: VG SF,KE~KA XO~HU,SA~RA RM~KE,LK~3~GN,QC~4~RD,,,,,5,,.W7_4_7E_5_3_5_W-8-Y-5_5HNH-S-6-4.I 7,7-8-4 2:6:2.5 QJJ,0,,6,1,4,VD P#5,K/HJ [PE],7,6,2,W NU: TF,SX~ET CT~AF,SN~IM RY~KL,JT~6~SY,PN~4~UA,,,,,1,YC [CH] M H D,.H4_2_1K_8_1_5_C-0-C-4_6DDT-U-7-7.B 2,3-5-7 2:0:6.4 PJM,6,5,8,7,1,QV V#8,H/MX JC A AW KL,8,1,4,F QN: GH,XJ~SV ZM~WF,VR BU~QE WI~PE,OW~7~DR,ME PO~2~UCE=5,,,,,8,AU L A Z,.M7_1_7Z_1_5_2_H-7-B-7_1HTC-V-2-4.W 6,8-6-3 7:3:0.2 YJX,5,8,4,6,7,SD Z#4,J/QJ M 'PC ZU',8,7,4,R DJ: EI,BQ~BS IV~UC,YU SV~BE HL~XR,XZ~2~ML,IQ FE~6~TUE=7,,,,,3,YD [OA TT] U V K,.V2_7_7C_8_4_1_S-7-A-7_1RXT-N-1-2.Z 2,2-2-5 7:6:6.8 KLO,0,8,6,4,2,QP U#0,O/ZU D 'QY',6,8,2,I DG: QJ,NW~DN BR~GY,SD TJ~NA YZ~BZ,MY~4~QF,CV MU~3~PXJ=5,,,,,3,ZA [KM] H T E,.Q8_6_4E_4_7_0_K-8-N-7_7YTY-M-3-8.Y 6,8-4-3 1:4:7.1 AQO,1,3,4,0,1,PW R#0,V/YX E F,6,7,4,J VI: DH,FC~FA NG~CS,QX CZ~UU IV~JM,LI~4~HQ,EA DF~7~QVI=7,,,,,1,KE M H[F_K] L B B,.A8_2_2P_7_3_3_H-6-S-5_8HFI-N-3-2.L 6,3-7-6 8:4:2.1 UCA,7,0,3,4,7,EC G#2,B/NE T N,5,4,3,D SR: BO,VT~UI LY~BJ,NL GZ~TL UV~DW,LG~3~WZ,OK AQ~7~BHS=8,,,,,8,"P",.H4_1_7V_0_0_2_K-3-E-8_1HSG-W-6-7.W data.table/inst/tests/tests.Rraw0000644000175100001440000203652613172210047016420 0ustar hornikusers if (!exists("test.data.table",.GlobalEnv,inherits=FALSE)) { 
require(data.table) # in dev the package should not be loaded options(warn=0) # use require() so it warns but doesn't halt if not available inst_pkgs = rownames(installed.packages()) sugg_pkgs = c("bit64", "knitr", "nanotime", "chron", "ggplot2", "plyr", "reshape", "reshape2", "testthat", "hexbin", "fastmatch", "nlme", "GenomicRanges", "xts", "gdata", "caret", "curl", "zoo", "plm", "rmarkdown", "parallel") lapply(setNames(sugg_pkgs, nm = sugg_pkgs), function(pkg) if(pkg %in% inst_pkgs) require(pkg, character.only=TRUE)) # reshape2 ahead of reshape ... try(detach(package:reshape2),silent=TRUE) try(detach(package:reshape),silent=TRUE) if("reshape2" %in% inst_pkgs) library(reshape2, pos="package:base", logical.return=TRUE) if("reshape" %in% inst_pkgs) library(reshape, pos="package:base", logical.return=TRUE) .devtesting=FALSE } else { # Matt has suppressMessages(require(bit64)) in .Rprofile if ("package:reshape2" %in% search()) { detach(package:reshape2) library(reshape2, pos="package:base", logical.return=TRUE) } if ("package:reshape" %in% search()) { detach(package:reshape) library(reshape, pos="package:base", logical.return=TRUE) } .devtesting=TRUE } options(warn=2) setDTthreads(2) # Tests are small and quick so should themselves switch to 1 thread, but explicity limit to 2 # so as not to breach CRAN policy, just in case. nfail = ntest = lastnum = 0 whichfail = NULL .timingtests = FALSE started.at = Sys.time() # Test default values in case user set global option. These are restored # at the end of this file. 
oldalloccol = options(datatable.alloccol=1024L) if (!.devtesting) { test = data.table:::test INT = data.table:::INT compactprint = data.table:::compactprint is.sorted = data.table:::is.sorted forderv = data.table:::forderv forder = data.table:::forder null.data.table = data.table:::null.data.table uniqlist = data.table:::uniqlist uniqlengths = data.table:::uniqlengths setrev = data.table:::setrev setreordervec = data.table:::setreordervec selfrefok = data.table:::selfrefok setattr = data.table::setattr # so as not to use bit::setattr .R.listCopiesNamed = data.table:::.R.listCopiesNamed .R.assignNamesCopiesAll = data.table:::.R.assignNamesCopiesAll .R.subassignCopiesOthers = data.table:::.R.subassignCopiesOthers .R.subassignCopiesVecsxp = data.table:::.R.subassignCopiesVecsxp setdiff_ = data.table:::setdiff_ frankv = data.table::frankv is_na = data.table:::is_na shallow = data.table:::shallow # until exported chmatch2 = data.table:::chmatch2 which_ = data.table:::which_ shift = data.table::shift any_na = data.table:::any_na replace_dot_alias = data.table:::replace_dot_alias isReallyReal = data.table:::isReallyReal between = data.table::between which.first = data.table:::which.first which.last = data.table:::which.last trim = data.table:::trim `%+%.default` = data.table:::`%+%.default` .shallow = data.table:::.shallow getdots = data.table:::getdots second = data.table::second # avoid S4Vectors::second binary = data.table:::binary } # test for covering tables.R 100%, we need to run tables() before creating any data.tables to return null data.table test(0, tables(), null.data.table(), output = "No objects of class") TESTDT = data.table(a=as.integer(c(1,3,4,4,4,4,7)), b=as.integer(c(5,5,6,6,9,9,2)), v=1:7) setkey(TESTDT,a,b) # i.e. 
a b v # [1,] 1 5 1 # [2,] 3 5 2 # [3,] 4 6 3 # [4,] 4 6 4 # [5,] 4 9 5 # [6,] 4 9 6 # [7,] 7 2 7 ########################## test(1, TESTDT[SJ(4,6),v,mult="first"], 3L) test(2, TESTDT[SJ(4,6),v,mult="last"], 4L) test(3, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE], INT(4,4,4)) test(4, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE], INT(6,6,6)) test(5, TESTDT[SJ(c(4,4,4),c(6,6,7)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4)) test(6, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="last",roll=TRUE,rollends=FALSE], INT(6,6,NA)) test(7, TESTDT[SJ(c(4,4,4),c(9,9,10)),v,mult="first",roll=TRUE,rollends=FALSE], INT(5,5,NA)) test(8, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v], INT(NA,NA,NA,NA,NA)) test(9, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE], INT(NA,NA,NA,6,NA)) test(10, TESTDT[SJ(c(-9,1,4,4,8),c(1,4,4,10,1)),v,roll=TRUE,rollends=FALSE], INT(NA,NA,NA,NA,NA)) test(11, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first"], INT(NA,NA,3,3,NA,7,NA)) test(12, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="first",roll=TRUE], INT(NA,1,3,3,6,7,7)) test(13, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last"], INT(NA,NA,6,6,NA,7,NA)) test(14, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",roll=TRUE], INT(NA,1,6,6,6,7,7)) test(15, TESTDT[SJ(c(-3,2,4,4,5,7,8)),v,mult="last",nomatch=0], INT(6,6,7)) test(16, TESTDT[SJ(c(4)),v], INT(3,4,5,6)) #test(17, suppressWarnings(TESTDT[SJ(c(4,4)),v,mult="all",incbycols=FALSE][[1]]), INT(3:6,3:6)) test(18, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=0,by=.EACHI][[2]], INT(3:6)) test(185, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",nomatch=NA], INT(NA,NA,3:6,NA)) test(19, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:6,7)) test(186, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,nomatch=NA], INT(NA,1,3:6,7)) test(20, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:6)) test(187, TESTDT[SJ(c(-3,2,4,8)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA], INT(NA,1,3:6,NA)) test(21, 
TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0], INT(1,3:4)) test(188, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA, allow.cartesian=TRUE], INT(NA,1,NA,3:4,NA,NA,NA)) test(22, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,4,6)) test(189, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA, allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,6,NA)) test(23, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:4,4)) test(190, TESTDT[SJ(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,NA,NA)) test(24, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,5:6,6)) test(191, TESTDT[SJ(c(1,NA,4,NA,NA,4,4),c(5,5,6,6,7,9,10)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,NA,NA,1,3:4,5:6,6)) # Note that the NAs get sorted to the beginning by the SJ(). # i.e. 
a b v (same test matrix, repeating here for easier reading of the test cases below) # [1,] 1 5 1 # [2,] 3 5 2 # [3,] 4 6 3 # [4,] 4 6 4 # [5,] 4 9 5 # [6,] 4 9 6 # [7,] 7 2 7 test(25, TESTDT[SJ(4,6),v,mult="first"], 3L) test(26, TESTDT[SJ(4,6),v,mult="last"], 4L) test(27, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE], INT(4,4,4)) test(28, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE], INT(6,6,6)) test(29, TESTDT[J(c(4,4,4),c(7,6,6)),v,mult="last",roll=TRUE,rollends=FALSE], INT(4,4,4)) test(30, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="last",roll=TRUE,rollends=FALSE], INT(NA,6,6)) test(31, TESTDT[J(c(4,4,4),c(10,9,9)),v,mult="first",roll=TRUE,rollends=FALSE], INT(NA,5,5)) test(32, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v], INT(NA,NA,NA,NA,NA)) test(33, TESTDT[J(c(8,1,4,4,-9),c(1,4,4,10,1)),v,roll=TRUE], INT(NA,NA,NA,6,NA)) test(34, TESTDT[J(c(8,1,4,4,-9),c(1,4,7,10,1)),v,roll=TRUE,rollends=FALSE], INT(NA,NA,4,NA,NA)) test(35, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first"], INT(NA,3,NA,NA,3,7,NA)) test(36, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="first",roll=TRUE], INT(6,3,NA,7,3,7,1)) test(37, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last"], INT(NA,6,NA,NA,6,7,NA)) test(38, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",roll=TRUE], INT(6,6,NA,7,6,7,1)) test(39, TESTDT[J(c(5,4,-3,8,4,7,2)),v,mult="last",nomatch=0], INT(6,6,7)) test(40, TESTDT[J(c(4)),v,mult="all"], INT(3,4,5,6)) test(41, TESTDT[J(c(4,4)),v,mult="all", allow.cartesian=TRUE], INT(3:6,3:6)) test(42, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=0], INT(3:6)) test(192, TESTDT[J(c(8,2,4,-3)),v,mult="all",nomatch=NA], INT(NA,NA,3:6,NA)) test(43, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=0], INT(7,1,3:6)) test(193, TESTDT[J(c(8,2,4,-3)),v,mult="all",roll=TRUE,nomatch=NA], INT(7,1,3:6,NA)) #test(44, suppressWarnings(TESTDT[J(c(8,4,2,-3)),v,mult="all",roll=TRUE,rollends=FALSE,incbycols=FALSE]), INT(3:6,1)) test(45, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=0], INT(1,3:4)) test(194, 
TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,NA,NA,NA)) test(46, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=0], INT(1,3:4,4,6)) test(195, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,6,NA)) test(47, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=0], INT(1,3:4,4)) test(196, TESTDT[J(c(-9,1,4,4,4,4,8),c(1,5,5,6,7,10,3)),v,mult="all",roll=TRUE,rollends=FALSE,nomatch=NA,allow.cartesian=TRUE], INT(NA,1,NA,3:4,4,NA,NA)) test(48, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=0], INT(5:6,1,5:6,6)) # this time the NAs stay where they are. Compare to test 24 above. test(197, TESTDT[J(c(-9,NA,4,NA,1,4,4),c(1,5,9,6,5,9,10)),v,mult="all",roll=TRUE,nomatch=NA,allow.cartesian=TRUE], INT(NA,NA,5:6,NA,1,5:6,6)) test(49, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=0], INT(3,4,1,2,7,3,4)) test(198, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,nomatch=NA,allow.cartesian=TRUE], INT(3,4,1,NA,NA,2,7,NA,3,4,NA)) test(50, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=0], INT(4,1,2,7,4)) test(199, TESTDT[J(c(4,1,0,5,3,7,NA,4,1),c(6,5,1,10,5,2,1,6,NA)),v,mult="last",nomatch=NA], INT(4,1,NA,NA,2,7,NA,4,NA)) TESTDT[, a:=letters[a]] setkey(TESTDT,a,b) # i.e. 
a b v # [1,] a 5 1 # [2,] c 5 2 # [3,] d 6 3 # [4,] d 6 4 # [5,] d 9 5 # [6,] d 9 6 # [7,] g 2 7 test(51, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=0], INT(3:4,4,7)) test(200, TESTDT[SJ(c("d","d","e","g"),c(6,7,1,2)),v,mult="all",roll=TRUE,nomatch=NA], INT(3:4,4,NA,7)) test(52, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=0], INT(7,3:4)) test(201, TESTDT[J(c("g","d","e","d"),c(6,6,1,2)),v,mult="all",roll=TRUE,nomatch=NA], INT(7,3:4,NA,NA)) TESTDT[, b:=letters[b]] setkey(TESTDT,a,b) # i.e. # a b v # [1,] a e 1 # [2,] c e 2 # [3,] d f 3 # [4,] d f 4 # [5,] d i 5 # [6,] d i 6 # [7,] g b 7 test(53, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="last"], INT(4,NA,NA,7)) test(54, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="last"], INT(7,NA,NA,4)) # this tests (d,g) ok even though there is an NA in last match in the roll. test(55, TESTDT[SJ(c("d","d","e","g"),c("f","g","a","b")),v,mult="first"], INT(3,NA,NA,7)) test(56, TESTDT[J(c("g","d","e","d"),c("b","g","a","f")),v,mult="first"], INT(7,NA,NA,3)) test(57, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE], INT(7,4,6,NA,NA,3,4)) # test 58 removed. Tested this failed (rolling join on factors) pre character columns, now works. test(59, TESTDT[J(c("g","d","d","d","e","d"),c("b","g","k","b","a","f")),v,roll=TRUE,rollends=FALSE], INT(7,4,NA,NA,NA,3,4)) # test 60 removed. Tested this failed (rolling join on factors) pre character columns, now works. # Tests 61-66 were testing sortedmatch which is now replaced by chmatch for characters, and removed # for integers until needed. # Test 67 removed. No longer use factors so debate/problem avoided. 
# [.factor and c.factor are no longer present in data.table, not even hidden away # X = factor(letters[1:10]) # test(67, levels(X[4:6]), letters[4:6]) test(68, "TESTDT" %in% tables(silent=TRUE)[,NAME]) # NAME is returned as a column in which we look for the string test(69, "TESTDT" %in% tables(silent=TRUE)[,as.character(NAME)]) # an old test (from when NAME was factor) but no harm in keeping it test(69.1, names(tables(silent=TRUE)), c("NAME","NROW","NCOL","MB","COLS","KEY")) test(69.2, names(tables(silent=TRUE,mb=FALSE)), c("NAME","NROW","NCOL","COLS","KEY")) a = "d" # Variable Twister. a in this scope has same name as a inside DT scope. # Aug 2010 : As a result of bug 1005, and consistency with 'j' and 'by' we now allow self joins (test 183) in 'i'. test(70, TESTDT[eval(J(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a")) # the eval() enabled you to use the 'a' in the calling scope, not 'a' in the TESTDT. TO DO: document this. test(71, TESTDT[eval(SJ(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a")) test(72, TESTDT[eval(CJ(a)),v,by=.EACHI], data.table(a="d",v=3:6,key="a")) test(73, TESTDT[,v], 1:7) # still old behaviour for 1 year. WhenJsymbol option was set to FALSE at the top of this file test(74, TESTDT[,3], data.table(v=1:7)) test(74.1, TESTDT[,4], error="outside the column number range.*1,ncol=3") test(74.2, TESTDT[,3L], data.table(v=1:7)) test(74.3, TESTDT[,0], null.data.table()) test(75, TESTDT[,"v"], data.table(v=1:7)) test(76, TESTDT[,2:3], TESTDT[,2:3,with=FALSE]) test(77, TESTDT[,2:3,with=FALSE], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) test(78, TESTDT[,c("b","v")], data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) colsVar = c("b","v") test(79.1, TESTDT[,colsVar], error="column name 'colsVar' is not found") test(79.2, TESTDT[,colsVar,with=FALSE], ans<-data.table(b=c("e","e","f","f","i","i","b"),v=1:7)) test(79.3, TESTDT[, ..colsVar], ans) # works in test.data.table, but not eval(body(test.data.table)) when in R CMD check ... 
test(81, TESTDT[1:2,c(a,b)], factor(c("a","c","e","e"))) # It is expected the above to be common source of confusion. c(a,b) is evaluated within # the frame of TESTDT, and c() creates one vector, not 2 column subset as in data.frame's. # If 2 columns were required use list(a,b). c() can be useful too, but is different. test(82, TESTDT[,c("a","b")], data.table(a=TESTDT[[1]], b=TESTDT[[2]], key=c("a","b"))) test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) # test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default test(87, TESTDT[,list(MySum=sum(v)),by="b"], data.table(b=c("e","f","i","b"),MySum=INT(3,7,11,7))) test(88, TESTDT[,list(MySum=sum(v),Sq=v*v),by="b"][1:3], data.table(b=c("e","e","f"),MySum=INT(3,3,7),Sq=INT(1,4,9))) # silent repetition of MySum to match the v*v vector # Test 89 dropped. Simplify argument no longer exists. by is now fast and always returns a data.table ... 
test(89, TESTDT[,sum(v),by="b",simplify=FALSE], list(7L,3L,7L,11L)) # Test 88.5 contributed by Johann Hibschman (for bug fix #1294) : test(88.5, TESTDT[a=="d",list(MySum=sum(v)),by=list(b)], data.table(b=c("f","i"), MySum=INT(7,11), key="b")) setkey(TESTDT,b) test(90, TESTDT[J(c("f","i")),sum(v),by=.EACHI], data.table(b=c("f","i"),V1=c(7L,11L),key="b")) test(90.5, TESTDT[J(c("i","f")),sum(v),by=.EACHI], data.table(b=c("i","f"),V1=c(11L,7L))) # test not keyed test(91, TESTDT[SJ(c("f","i")),sum(v),by=.EACHI], data.table(b=c("f","i"),V1=c(7L,11L),key="b")) # Test 92 dropped same reason as 89 ... test(TESTDT[92, J(c("f","i")),sum(v),mult="all",simplify=FALSE], list(7L,11L)) test(93, TESTDT[c("f","i"), which=TRUE], 4:7) test(94, TESTDT[c("i","f"), mult="last", which=TRUE], INT(7,5)) test(95, TESTDT["f",v], 3:4) test(96, TESTDT["f",v,by=.EACHI], data.table(b="f",v=3:4,key="b")) test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("f","i","b"), GroupSum=c(7L,11L,7L))) # that line above doesn't create a key on the result so that the order fib is preserved. test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b")) # line above is the way to group, sort by group and setkey on the result by group. 
dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B")) # test [<- for column assignment dt1 <- dt2 <- dt test(100, {dt1[,"A"] <- 3L; dt1}, {dt2$A <- 3L; dt2}) # test transform and within test(101, within(dt, {D <- B^2}), transform(dt, D = B^2)) test(102, within(dt, {A <- B^2}), transform(dt, A = B^2)) # test .SD object test(103, dt[, sum(.SD$B), by = "A"], dt[, sum(B), by = "A"]) test(104, dt[, transform(.SD, D = min(B)), by = "A"], dt[, list(B,C,D=min(B)), by = "A"]) # test numeric and comparison operations on a data table test(105, all(dt + dt > dt)) test(106, all(dt + dt > 1)) test(107, dt + dt, dt * 2L) # test a few other generics: test(108, dt, data.table(t(t(dt)),key="A,B")) test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key test(110, sum(is.na(dt2)), 1L) test(111, {setkey(dt,NULL);dt}, na.omit(dt)) test(112, dt2[2:nrow(dt2),A], na.omit(dt2)$A) # test [<- assignment: dt2[is.na(dt2)] <- 1L test(113, {setkey(dt,NULL);dt}, dt2) # key should be dropped because we assigned to a key column # want to discourage this going forward (inefficient to create RHS like this) # dt2[, c("A", "B")] <- dt1[, c("A", "B"), with = FALSE] # test(114, dt1, dt2) ## doesn't work, yet: ## dt2[rep(TRUE, nrow(dt)), c("A", "B")] <- dt1[, c("A", "B"), with = FALSE] ## dt2[rep(TRUE, nrow(dt)), c("A")] <- dt1[, c("A"), with = FALSE] ## test(dt, dt2)) stop("Test 112 failed") # test the alternate form of setkey: dt1 = copy(dt) dt2 = copy(dt) setkeyv(dt1, "A") setkey(dt2, A) test(115, dt1, dt2) # Test dogroups works correctly for character/factor columns test(116, TESTDT[,a[1],by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), key="b")) test(117, TESTDT[,list(a[1],v[1]),by="b"], data.table(b=c("b","e","f","i"), V1=c("g","a","d","d"), V2=INT(7,1,3,5), key="b")) # We no longer check i for out of bounds, for consistency with data.frame and 
e.g. cbind(DT[w],DT[w+1]). NA rows should be returned for i>nrow test(118, TESTDT[8], data.table(a=as.character(NA), b=as.character(NA), v=as.integer(NA), key="b")) test(119, TESTDT[6:9], data.table(a=c("d","d",NA,NA), b=c("i","i",NA,NA), v=c(5L,6L,NA,NA))) n=10000 grp1=sample(1:50,n,replace=TRUE) grp2=sample(1:50,n,replace=TRUE) dt=data.table(x=rnorm(n),y=rnorm(n),grp1=grp1,grp2=grp2) tt = system.time(ans <- dt[,list(.Internal(mean(x)),.Internal(mean(y))),by="grp1,grp2"]) # test(120, tt[1] < 0.5) # actually takes more like 0.068 << 0.5, but the micro EC2 instance can be slow sometimes. # TO DO: incorporate performance testing into R CMD check (using testthat?), that somehow copes with running on slow machines. i = sample(nrow(ans),1) test(121, all.equal(ans[i,c(V1,V2)], dt[grp1==ans[i,grp1] & grp2==ans[i,grp2], c(mean(x),mean(y))])) # To DO: add a data.frame aggregate method here and check data.table is faster # Tests of 0 and 1 row tables TESTDT = data.table(NULL) test(122, TESTDT[1], TESTDT) test(123, TESTDT[0], TESTDT) test(124, TESTDT[1:10], TESTDT) test(125, TESTDT["k"], error="the columns to join by must be specified either using") # test 126 no longer needed now that test() has 'error' argument TESTDT = data.table(a=3L,v=2L,key="a") # testing 1-row table test(127, TESTDT[J(3)], TESTDT) test(128, TESTDT[J(4)], data.table(a=4L,v=NA_integer_,key="a")) # see tests 206-207 too re the [NA] test(129, TESTDT[J(4),roll=TRUE], data.table(a=4L,v=2L,key="a")) # the i values are in the result now (which make more sense for rolling joins, the x.a can still be accessed if need be) test(130, TESTDT[J(4),roll=TRUE,rollends=FALSE], data.table(a=4L,v=NA_integer_,key="a")) test(131, TESTDT[J(-4),roll=TRUE], data.table(a=-4L,v=NA_integer_,key="a")) test(132, ncol(TESTDT[0]), 2L) test(133, TESTDT[0][J(3)], data.table(a=3L,v=NA_integer_,key="a")) # These need to retain key for consistency (edge cases of larger sorted i) # tests on data table names, make.names is now FALSE by 
default from v1.8.0 x = 2L; `1x` = 4L dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x) test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names=TRUE) test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names = FALSE) test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) # the last two terms differ from data.frame() test(136, dt[,b_1, by="a.1"], data.table(a.1=1L,"b_1"=2L)) test(137, dt[,`a 1`, by="a.1"], data.table(a.1=1L,"a 1"=4L, check.names=FALSE)) test(138, dt[,a.1, by="`a 1`"], data.table(`a 1`=4L,a.1=1L, check.names=FALSE)) # tests with NA's in factors dt = data.table(a = c(NA, letters[1:5]), b = 1:6) test(139, dt[,sum(b), by="a"], data.table(a = c(NA, letters[1:5]), V1 = 1:6)) # tests to make sure rbind and grouping keep classes dt = data.table(a = rep(as.Date("2010-01-01"), 4), b = rep("a",4)) test(140, rbind(dt,dt), data.table(a = rep(as.Date("2010-01-01"), 8), b = rep("a",8))) test(141, dt[,list(a=a), by="b"], dt[,2:1, with = FALSE]) dt$a <- structure(as.integer(dt$a), class = "Date") test(142, dt[,list(b=b), by="a"], dt) dt = data.table(x=1:5,y=6:10) test(143, tail(dt), dt) # tail was failing if a column name was called x. dt <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], b2 = LETTERS[1:4]) test(144, dt[, .SD[3,], by=b], data.table(b=LETTERS[1:4],a=3L,b2=LETTERS[1:4])) DT = data.table(x=rep(c("a","b"),c(2,3)),y=1:5) xx = capture.output(ans <- DT[,{print(x);sum(y)},by=x,verbose=FALSE]) test(145, xx, c("[1] \"a\"","[1] \"b\"")) test(146, ans, data.table(x=c("a","b"),V1=c(3L,12L))) test(147, DT[,MySum=sum(v)], error="unused argument") # user meant DT[,list(MySum=sum(v))]. FR#204 done. 
dt = data.table(a=c(1L,4L,5L), b=1:3, key="a") test(148, dt[CJ(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L),key="a")) test(149, dt[J(2:3),roll=TRUE], data.table(a=c(2L,3L),b=c(1L,1L))) # in future this will detect the subset is ordered and retain the key # 150:158 test out of order factor levels in key columns (now allowed from v1.8.0) dt = data.table(x=factor(c("c","b","a"),levels=c("b","a","c")),y=1:3) setkey(dt,x) test(150.1, dt["b",y,verbose=TRUE], output="Coercing character column i.'V1' to factor") # changed i.V1 to i.x as per FR #2693 test(150.2, dt["b",y], 2L) # from Tom's post : a = data.table(a=rep(1:5, 2), b=factor(letters[rep(1:5, each =2)], levels=letters[5:1]), key="b") test(151.1, a[J("b"),a,verbose=TRUE], output="Coercing character column i.'V1' to factor") # message back to `i.V1` now. 'b' still accessible to satisfy FR #2693, checked on next line test(151.2, a[J("b"),a], 3:4) # stretch tests further, two out of order levels, one gets key'd the other not : a = data.table(x=factor(letters[rep(1:5, each =2)], levels=letters[5:1]), y=factor(letters[rep(c(6,9,7,10,8), each =2)], levels=letters[10:6]), z=1:10) test(152, is.sorted(levels(a$x)), FALSE) test(153, is.sorted(levels(a$y)), FALSE) test(154, a[,sum(z),by=x][1,paste(x,V1)], "a 3") # ad hoc by doesn't sort the groups so 'a' (5th level) should be first setkey(a,x) # 'e' (level 1) should come first now. 
test(155, is.sorted(levels(a$x)), FALSE) test(156, is.sorted(levels(a$y)), FALSE) test(157, a[,sum(z),by=x][1,paste(x,V1)], "e 19") # 1st level is now first test(158, a[,sum(z),by=y][1,paste(y,V1)], "h 19") # not 'f' test(158.5, a[,sum(z),keyby=y][1,paste(y,V1)], "j 15") # not 'f' either # tests of by expression variables DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 ) f = quote( list(d) ) test(159, DT[,mean(b),by=eval(f)], DT[,mean(b),by=list(d)]) # column f doesn't get in the way of expression f foo = function( grp ) { DT[,mean(b),by=eval(grp)] } test(160, foo(quote(list(d))), DT[,mean(b),by=list(d)]) test(161, foo(quote(list(d,a))), DT[,mean(b),by=list(d,a)]) test(162, foo(quote(list(f))), DT[,mean(b),by=list(f)]) test(163, foo(quote(list(grp))), DT[,mean(b),by=list(grp)]) # grp local variable in foo doesn't conflict with column grp test(164, foo(f), DT[,mean(b),by=d]) # checks that data.table inherits methods from data.frame in base ok test(165, subset(DT,a>2), DT[a>2]) test(166, suppressWarnings(split(DT,DT$grp)[[2]]), DT[grp==2]) if ("package:ggplot2" %in% search()) { test(167, names(print(ggplot(DT,aes(b,f))+geom_point()))[c(1,3)], c("data","plot")) # The names() is a stronger test that it has actually plotted, but also because test() sees the invisible result test(167.1,DT[,print(ggplot(.SD,aes(b,f))+geom_point()),by=list(grp%%2L)],data.table(grp=integer())) # %%2 because there are 5 groups in DT data at this stage, just need 2 to test # New test reported by C Neff on 11 Oct 2011 if ("package:hexbin" %in% search()) test(167.2, names(print(ggplot(DT) + geom_hex(aes(b, f)) + facet_wrap(~grp)))[c(1,3)], c("data","plot")) else cat("Test 167.2 not run. 
If required call library(hexbin) first.\n") # Test plotting ITime with ggplot2 which seems to require an as.data.frame method for ITime, #1713 datetimes = c("2011 NOV18 09:29:16", "2011 NOV18 10:42:40", "2011 NOV18 23:47:12", "2011 NOV19 01:06:01", "2011 NOV19 11:35:34", "2011 NOV19 11:51:09") DT = IDateTime(strptime(datetimes,"%Y %b%d %H:%M:%S")) test(168, print(DT[,qplot(idate,itime)])$ranges, print(qplot(DT$idate,DT$itime))$ranges) test(168.1, print(DT[,qplot(idate,as.POSIXct(itime,tzone=""))])$ranges, print(qplot(idate,as.POSIXct(itime,tzone=""),data=DT))$ranges) try(graphics.off(),silent=TRUE) #try(graphics.off(),silent=TRUE) # R CMD check doesn't like graphics it seems, even when inside try() } else { cat("Tests 167-168 not run. If required call library(ggplot2) first.\n") # ggplot takes a long time so we don't include these by default # From examples, the library(ggplot2) is done first, so that 'R CMD check' does include tests 167-168 } # test of . in formula, using inheritance DT = data.table(y=1:100,x=101:200,y=201:300,grp=1:5) test(169,DT[,as.list(lm(y~0+.,.SD)$coef),by=grp][2,x]-2<1e-10, TRUE) DT <- data.table( a=1:4, d=c("A","B","C","D") ) g <- quote( list( d ) ) test(170, DT[,list(d)], DT[,eval(g)]) DT = data.table(A=c(25L,85L,25L,25L,85L), B=c("a","a","b","c","c"), C=c(2,65,9,82,823)) test(171.1, DT[B=="b"][A==85], output="Empty data.table (0 rows) of 3 cols: A,B,C") test(171.2, DT[B=="b"][A==85,C], numeric()) test(171.3, DT[ , data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ], by=B ], data.table(B=c("a","c"),V1=c(67,905))) test(172, DT[ , list(3,data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ]), by=B ], data.table(B=c("a","b","c"),V1=3,V2=c(67,NA,905))) # Test growing result in memory. Usually the guess is good though. 
# This example returns no rows for first group so guess for up-front allocate needs a reallocate DT = data.table(A=c(1L,1L,2L,2L,3L,3L), B=1:6) test(173, DT[,B[B>3],by=A][,V1], c(4L,5L,6L)) # Example taken from Harish post to datatable-help on 11 July DT <- data.table( A=c("a","a","b","b","d","c","a","d"), B=c("x1","x2","x2","x1","x2","x1","x1","x2"), C=c(5,2,3,4,9,5,1,9) ) test(174, DT[,C[C-min(C)<3],by=list(A,B)][,V1], c(1,2,3,4,9,9,5)) test(175, DT[,C[C-min(C)<5],by=list(A,B)][,V1], c(5,1,2,3,4,9,9,5)) # Tests of data.table sub-assignments: $<-.data.table & [<-.data.table DT = data.table(a = c("A", "Z"), b = 1:10, key = "a") DT[J("A"),2] <- 100L # without L generates nice warning :-) DT[J("A"),"b"] <- 1:5 DT[1:3,"b"] <- 33L test(176, DT, data.table(a = rep(c("A", "Z"), each = 5), b = as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))), key = "a")) DT[J("A"),"a"] <- "Z" test(177, DT, data.table(a="Z", b=as.integer(c(rep(33, 3), 4:5, seq(2, 10, by = 2))))) # i.e. key dropped and column a still factor DT <- data.table(a = c("A", "Z"), b = 1:10, key = "a") DT$b[1:5] <- 1:5 DT$b[1:3] <- 33 test(178, DT, data.table(a = rep(c("A", "Z"), each = 5), b = c(rep(33, 3), 4:5, seq(2, 10, by = 2)), key = "a")) DT$a <- 10:1 test(179, key(DT), NULL ) # Test logical in a key DT = data.table(a=rep(1:3,each=2),b=c(TRUE,FALSE),v=1:6) setkey(DT,a,b) test(180, DT[J(2,FALSE),v], 4L) test(181, DT[,sum(v),by=b][,V1], c(12L,9L)) # Test fix for bug 1026 reported by Harish V # this test needed a unique var name to generate error 'object 'b' not found'. # Otherwise it finds 'b' in local scope. setnames(DT,2,"buniquename314") bar = function( data, fcn ) { q = substitute( fcn ) xx = data[,eval(q),by=a] yy = data[,eval(substitute(fcn)),by=a] identical(xx,yy) } test(182, bar( DT, sum(buniquename314) ), TRUE) # Test bug 1005 reported by Branson Owen DT = data.table(A = c("o", "x"), B = 1:10, key = "A") test(183, DT[J(unique(A)), B], DT$B) # Test bug 709 which returned an error here. 
And return type now empty table, #1945 in 1.8.1. xx = data.table(a=1:5,b=6:10) test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer())) # Tests of bug 1015 highlight by Harish # See thread "'by without by' now heeds nomatch=NA" # Tests 185-201 were added in above next to originals x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b") y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A")) test(202, x[y], x[y,mult="all"]) test(203, x[y,d], c(1,2,NA,NA)) test(204, x[y,list(d)]$d, x[y,d]) test(205, x[y,list(d),mult="all"][,d], c(1,2,NA,NA)) # Test [NA] returns one NA row. NA is type *logical* so prior to # change in v1.5, NA would get silently recycled and the whole table would # be returned all NA (rarely useful and often confusing, but consistent # with data.frame). TESTDT = data.table(a=1:3,v=1:3,key="a") test(206, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_,key="a")) # NA are now allowed in keys, so retains key # TESTDT[NA] is expected to return a row of NA since nobody remembers that NA is different to NA_integer_ # Then user tries TESTDT[c(1,NA,2)] and it feels consistent to them since they see that row of NA in the middle # But only the NA symbol is caught and replaced with NA_integer_, for this convenience. # Otherwise logical expressions returning a single NA logical will still return empty, for consistency, #1252. setkey(TESTDT,NULL) test(207, TESTDT[NA], data.table(a=NA_integer_,v=NA_integer_)) # With inheritance, NROW and NCOL in base work nicely. No need for them in data.table. test(208, NROW(TESTDT), 3L) test(209, nrow(TESTDT), 3L) test(210, NCOL(TESTDT), 2L) test(211, ncol(TESTDT), 2L) # Test infinite recursion error is trapped when a pre-1.5 data.table # is used with 1.5 (bug #1008) DT = data.table(a=1:6,key="a") test(212, DT[J(3)]$a, 3L) # correct class c("data.table","data.frame") class(DT) = "data.table" # incorrect class, but as from 1.8.1 it works. 
By accident when moving from colnames() to names(), it was dimnames() doing the check, but rather than add a check that identical(class(DT),c("data.frame","data.table")) at the top of [.data.table, we'll leave it flexible to user (user might not want to inherit from data.frame for some reason). test(213, DT[J(3)]$a, 3L) # setkey now auto coerces double and character for convenience, and # to solve bug #953 DF = data.frame(a=LETTERS[1:10], b=1:10, stringsAsFactors=FALSE) DT = data.table(DF) setkey(DT,a) # used to complain about character test(215, DT["C",b], 3L) DT = data.table(DF,key="a") test(216, DT["C",b], 3L) DT = data.table(a=c(1,2,3),v=1:3,key="a") test(217, DT[J(2),v], 2L) DT = data.table(a=c(1,2.1,3),v=1:3,key="a") test(218, DT[J(2.1),v], 2L) # tests of quote()-ed expressions in i. Bug #1058 DT = data.table(a=1:5,b=6:10,key="a") q = quote(a>3) test(220, DT[eval(q),b], 9:10) test(221, DT[eval(parse(text="a>4")),b], 10L) test(222, DT[eval(parse(text="J(2)")),b], 7L) # lists in calling scope should be ok as single names passed to by, bug #1060 DT = data.table(a=1:2,b=rnorm(10)) byfact = DT[,a] # vector, ok before fix but check anyway test(223, DT[,mean(b),by=byfact], DT[,mean(b),by=list(byfact)]) byfact = DT[,list(a)] # this caused next line to fail before fix test(224, DT[,mean(b),by=byfact], DT[,mean(b),by=as.list(byfact)]) test(225, DT[,mean(b),by=byfact], DT[,mean(b),by={byfact}]) # tests for building expressions via parse, bug #1243 dt1key<-data.table(A1=1:100,onekey=rep(1:2,each=50)) setkey(dt1key,onekey) ASumExpr<-parse(text="quote(sum(A1))") # no need for quote but we test it anyway because that was work around when test 227 failed ASumExprNoQ<-parse(text="sum(A1)") ans = dt1key[,sum(A1),by=onekey] test(226,ans,dt1key[,eval(eval(ASumExpr)),by=onekey]) test(227,ans,dt1key[,eval(ASumExprNoQ),by=onekey]) # test for uncommon grouping pattern on 1-row data.table, bug #1245 DT = data.table(a=1L,b=2L) 
test(228,DT[,list(1:2),by=a],data.table(a=c(1L,1L),V1=1:2)) # special case j=.SD, bug #1247 DT = data.table(a=rep(1:2,each=2),b=1:4) test(229,DT[,.SD,by=a],DT) setkey(DT,a) test(229.1,DT[,.SD,by=key(DT)],DT) # merge bug with column 'x', bug #1229 d1 <- data.table(x=c(1,3,8),y1=rnorm(3), key="x") d2 <- data.table(x=c(3,8,10),y2=rnorm(3), key="x") ans1=merge(d1, d2, by="x") ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,x) test(230, ans1, ans2) # one column merge, bug #1241 DT = data.table(a=rep(1:2,each=3),b=1:6,key="a") y = data.table(a=c(0,1),bb=c(10,11),key="a") test(231,merge(y,DT),data.table(a=1L,bb=11,b=1:3,key="a")) test(232,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),bb=rep(c(10,11,NA_real_),c(1,3,3)),b=c(NA_integer_,1:6),key="a")) y = data.table(a=c(0,1),key="a") # y with only a key column test(233,merge(y,DT),data.table(a=1L,b=1:3,key="a")) test(234,merge(y,DT,all=TRUE),data.table(a=rep(c(0L,1L,2L),c(1,3,3)),b=c(NA_integer_,1:6),key="a")) # 'by' when DT contains list columns DT = data.table(a=c(1,1,2,3,3),key="a") DT$b=list(1:2,1:3,1:4,1:5,1:6) test(235,DT[,mean(unlist(b)),by=a],data.table(a=c(1,2,3),V1=c(1.8,2.5,mean(c(1:5,1:6))),key="a")) test(236,DT[,sapply(b,mean),by=a],data.table(a=c(1,1,2,3,3),V1=c(1.5,2.0,2.5,3.0,3.5),key="a")) # when i is a single name, it no longer evaluates within data.table scope DT = data.table(a=1:5,b=rnorm(5),key="a") a = list(4) test(237,DT[a],DT[J(4)]) # repeat earlier test with xkey instead of x. xkey is internal to merge; the bigger problem Tom mentioned. 
d1 <- data.table(xkey=c(1,3,8),y1=rnorm(3), key="xkey") d2 <- data.table(xkey=c(3,8,10),y2=rnorm(3), key="xkey") ans2=cbind(d1[2:3],y2=d2[1:2]$y2);setkey(ans2,xkey) test(238, merge(d1, d2, by="xkey"), ans2) # Join Inherited Scope, and X[Y] including Y's non-join columns X=data.table(a=rep(1:3,c(3,3,2)),foo=1:8,key="a") Y=data.table(a=2:3,bar=6:7) test(239, X[Y,sum(foo),by=.EACHI], data.table(a=2:3,V1=c(15L,15L),key="a")) test(240, X[Y,sum(foo*bar),by=.EACHI], data.table(a=2:3,V1=c(90L,105L),key="a")) test(241, X[Y], data.table(a=rep(2:3,3:2),foo=4:8,bar=rep(6:7,3:2),key="a")) test(242, X[Y,list(foo,bar),by=.EACHI][,sum(foo*bar)], 195L) test(243, X[Y][,sum(foo*bar)], 195L) # not sure about these yet : # test(244, X[Y,sum(foo*bar),mult="first"], data.table(a=2:3,V1=c(24L,49L))) # test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L))) # joining to less than all X's key colums (in examples but can't see formal test) X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b") test(246.1, X["A"], X[1:2]) # checks that X[1:2] retains key, too test(246.2, key(X["A"]), c("a","b")) test(247, X["C"]$v, NA_integer_) test(248, nrow(X["C",nomatch=0]), 0L) x=data.table( a=c("a","b","c"), b=1:3, key="a" ) y=data.table( a=c("b","d","e"), d=c(8,9,10) ) test(249, x[y], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10))) # keeps i join cols test(250, x[y,mult="first"], data.table(a=c("b","d","e"),b=c(2L,NA,NA),d=c(8,9,10))) # same x=data.table( a=c("a","b","b","c"), b=1:4, key="a" ) y=data.table(a=c("b","d","b"), d=c(8,9,10)) test(251, x[y, allow.cartesian=TRUE], data.table(a=c("b","b","d","b","b"),b=c(2:3,NA,2:3),d=c(8,8,9,10,10))) # auto coerce float to int in ad hoc by (just like setkey), FR#1051 DT = data.table(a=INT(1,1,1,2,2),v=1:5) test(252, DT[,sum(v),by=a], data.table(a=1:2,V1=c(6L,9L))) # check that by retains factor columns, since character is now default DT = data.table(a=factor(c("A","A","A","B","B")),v=1:5) test(253, DT[,sum(v),by=a], 
data.table(a=factor(c("A","B")),V1=c(6L,9L))) # fix for bug #1298 with by=key(DT) and divisibility error. DT=data.table(a=c(1,1,1,2,2),b=1:5,key="a") test(254, DT[,sum(b),by=key(DT)]$V1, c(6L,9L)) # for for bug #1294 (combining scanning i and by) # also see test 88.5 contributed by Johann Hibschman above. DT = data.table(a=1:12,b=1:2,c=1:4) test(255, DT[a>5,sum(c),by=b]$V1, c(12L, 7L)) # fix for bug #1301 (all.vars() doesn't appear to find fn in fns[[fn]] usage) DT = data.table(a=1:6,b=1:2,c=letters[1:2],d=1:6) fns = list(a=max,b=min) test(256, DT[,fns[[b[1]]](d),by=c]$V1, c(5L,2L)) test(257, DT[,fns[[c[1]]](d),by=c]$V1, c(5L,2L)) fns=c(max,min) DT = data.table(ID=1:10, SCORE_1=1:10, SCORE_2=11:20, SCORE_3=30:21, fn=c(rep(1, 5), rep(2, 5))) test(258, DT[,fns[[fn]](SCORE_1,SCORE_2,SCORE_3),by=ID]$V1, c(30:26,6:10)) test(259, DT[,as.list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10)) test(260, DT[,list(fns[[fn]](SCORE_1,SCORE_2,SCORE_3)),by=ID]$V1, c(30:26,6:10)) # fix for bug #1340 - Duplicate column names in self-joins (but print ok) DT <- data.table(id=1:4, x1=c("a","a","b","c"), x2=c(1L,2L,3L,3L), key="x1") test(261, DT[DT, allow.cartesian=TRUE][id < i.id]$i.x2, 2L) # "<-" within j now assigns in the same environment for 1st group, as the rest # Thanks to Andeas Borg for highlighting on 11 May dt <- data.table(x=c(0,0,1,0,1,1), y=c(0,1,0,1,0,1), z=1:6) groupInd = 0 test(262, dt[,list(z,groupInd<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4)) test(263, groupInd, 0) test(264, dt[,list(z,groupInd<<-groupInd+1),by=list(x,y)]$V2, c(1,2,2,3,3,4)) test(265, groupInd, 4) # Tests for passing 'by' expressions that evaluate to character column # names in the edge case of 1 row; the character 'by' vector could # feasibly be intended to be grouping values. Bug 1404; thanks to Andreas Borg # for the detailed report, suggested fix and tests. 
DT = data.frame(x=1,y="a",stringsAsFactors=FALSE) DT = as.data.table(DT) test(266,class(DT$y),"character") # just to check we setup the test correctly test(267,DT[,sum(x),by=y]$V1,1) test(268,DT[,sum(x),by="y"]$V1,1) colvars="y" test(269,DT[,sum(x),by=colvars]$V1,1) setkey(DT,y) test(270,DT[,sum(x),by=key(DT)]$V1,1) DT = data.table(x=1,y=2) setkeyv(DT,names(DT)) test(271, DT[,length(x),by=key(DT)]$V1, 1L) DT = data.table(x=c(1,2,1), y=c(2,3,2), z=1:3) setkeyv(DT,names(DT)) test(272, DT[,sum(z),by=key(DT)]$V1, c(1L,3L,2L)) # Tests for .BY and implicit .BY # .BY is a single row, and by variables are now, too. FAQ 2.10 has been changed accordingly. DT = data.table(a=1:6,b=1:2) test(273, DT[,sum(a)*b,by=b]$V1, c(9L,24L)) test(274, DT[,sum(a)*.BY[[1]],by=b], data.table(b=1:2,V1=c(9L,24L))) test(275, DT[,sum(a)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L))) test(276, DT[,sapply(.SD,sum)*b,by=b], data.table(b=1:2,V1=c(9L,24L))) # .SD should no longer include b, unlike v1.6 and before test(277, DT[,sapply(.SD,sum)*bcalc,by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(18L,36L))) # cols used in by expressions are excluded from .SD, but can still be used in j (by name only and may vary within the group e.g. DT[,max(diff(date)),by=month(date)] test(278, DT[,sum(a*b),by=list(bcalc=b+1L)], data.table(bcalc=2:3,V1=c(9L,24L))) # Test x==y where either column contain NA. 
DT = data.table(x=c(1,2,NA,3,4),y=c(0,2,3,NA,4),z=1:5) test(279, DT[x==y,sum(z)], 7L) # In data.frame the equivalent is : # > DF = as.data.frame(DT) # > DF[DF$x==DF$y,] # x y z # 2 2 2 2 # NA NA NA NA # NA.1 NA NA NA # 5 4 4 5 # > DF[!is.na(DF$x) & !is.na(DF$y) & DF$x==DF$y,] # x y z # 2 2 2 2 # 5 4 4 5 # Test that 0 length columns are expanded with NA to match non-0 length columns, bug fix #1431 DT = data.table(pool = c(1L, 1L, 2L), bal = c(10, 20, 30)) test(280, DT[, list(bal[0], bal[1]), by=pool], data.table(pool=1:2, V1=NA_real_, V2=c(10,30))) test(281, DT[, list(bal[1], bal[0]), by=pool], data.table(pool=1:2, V1=c(10,30), V2=NA_real_)) # Test 2nd group too (the 1st is special) ... test(282, DT[, list(bal[ifelse(pool==1,1,0)], bal[1]), by=pool], data.table(pool=1:2, V1=c(10,NA), V2=c(10,30))) # More tests based on Andreas Borg's post of 11 May 2011. DT = data.table(x=INT(0,0,1,0,1,1), y=INT(1,1,0,1,1,1), z=1:6) ans = data.table(x=c(0L,1L,1L),y=c(1L,0L,1L),V1=c(1L,1L,2L),V2=c(7L,3L,11L)) test(283, DT[,list(sum(x[1], y[1]),sum(z)), by=list(x,y)], ans) test(284, DT[,list(sum(unlist(.BY)),sum(z)),by=list(x,y)], ans) groupCols = c("x", "y") test(285, DT[,list(sum(unlist(.BY)),sum(z)),by=groupCols], ans) groupExpr = quote(list(x,y)) test(286, DT[,list(sum(unlist(.BY)),sum(z)),by=groupExpr], ans) # Bug fix from Damian B on 25 June 2011 : DT = data.table(X=c(NA,1,2,3), Y=c(NA,2,1,3)) setkeyv(DT,c("X","Y")) test(287, unique(DT, by=key(DT)), DT) # Bug fix #1421: using vars in calling scope in j when i is logical or integer. 
DT = data.table(A=c("a","b","b"),B=c(4,5,NA)) myvar = 6 test(288, DT[A=="b",B*myvar], c(30,NA)) # Test new feature in 1.6.1 that i can be plain list (such as .BY) DT = data.table(grp=c("a","a","a","a","b","b","b"),v=1:7) mysinglelookup = data.table(grp=c("a","b"),s=c(42,84),grpname=c("California","New York"),key="grp") setkey(mysinglelookup,grp) test(289, DT[,sum(v*mysinglelookup[.BY]$s),by=grp], data.table(grp=c("a","b"),V1=c(420,1512))) # In v1.6.2 we will change so that single name j returns a vector, regardless of grouping test(290, DT[,list(mysinglelookup[.BY]$grpname,sum(v)),by=grp], data.table(grp=c("a","b"),V1=c("California","New York"),V2=c(10L,18L))) # Test user defined attributes are retained, see comment in FR#1006 DT = data.table(a=as.numeric(1:2),b=3:4) setattr(DT,"myuserattr",42) setkey(DT,a) # a is numeric so a change of type to integer occurs, too, via := which checks selfref is ok test(291, attr(DT,"myuserattr"), 42) # Test new .N symbol DT = data.table(a=INT(1,1,1,1,2,2,2),b=INT(3,3,3,4,4,4,4)) test(292, DT[,.N,by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),N=c(3L,1L,3L))) test(293, DT[,list(a+b,.N),by=list(a,b)], data.table(a=c(1L,1L,2L),b=c(3L,4L,4L),V1=4:6,N=c(3L,1L,3L))) # Test that setkey and := syntax really are by reference, even within functions. You # really do need to take a copy first to a new name; force(x) isn't enough. DT = data.table(a=1:3,b=4:6) f = function(x){ force(x) setkey(x) } f(DT) test(294,key(DT),c("a","b")) # The setkey didn't copy to a local variable. Need to copy first to local variable (with a new name) if required. 
f = function(x){ force(x) x[,a:=42L] } f(DT) test(295,DT,data.table(a=42L,b=4:6)) # := was by reference (fast) and dropped the key, too, because assigned to key column DT = data.table(a=1:3,b=4:6) f = function(x){ x = copy(x) setkey(x) } f(DT) test(295.1,key(DT),NULL) setkey(DT,a) f = function(x){ x = copy(x) x[,b:=10:12][J(2),b] } # test copy retains key test(295.2,f(DT),11L) test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy # new feature added 1.6.3, that key can be vector. test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b"))) # test .SDcols (not speed, just operation) DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp") test(297,DT[,list(A1=sum(A1),A2=sum(A2),A3=sum(A3)),by=grp], DT[,lapply(.SD,sum),by=grp,.SDcols=2:4]) DT = data.table(a=1:3,b=4:6) test(298, {DT$b<-NULL;DT}, data.table(a=1:3)) # delete column test(299.01, {DT$c<-as.character(DT$c);DT}, data.table(a=1:3, c=NA_character_)) # Column c is missing, so DT$c is NULL. test(299.02, DT[,c:=""], data.table(a=1:3,c="")) test(299.03, truelength(DT)>length(DT)) # the := over-allocated, by 100 by default, but user may have changed default so just check '>' # FR #2551 - old 299.3 and 299.5 are changed to include length(RHS) > 1 to issue the warning DT[,c:=rep(42L,.N)] # plonk test(299.04, DT, data.table(a=1:3, c=42L)) test(299.05, DT[2:3,c:=c(42, 42)], data.table(a=1:3,c=42L), warning="Coerced 'double' RHS to 'integer' to match the column's type.*length 3 (nrows of entire table)") # FR #2551 - length(RHS) = 1 - no warning for type conversion test(299.06, DT[2,c:=42], data.table(a=1:3,c=42L)) # also see tests 302 and 303. (Ok, new test file for fast assign would be tidier). 
test(299.07, DT[,c:=rep(FALSE,nrow(DT))], data.table(a=1:3,c=FALSE)) # replace c column with logical test(299.08, DT[2:3,c:=c(42,0)], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced 'double' RHS to 'logical' to match the column's type.*length 3 (nrows of entire table)") # FR #2551 is now changed to fit in / fix bug #5442. Stricter warnings are in place now. Check tests 1294.1-34 below. test(299.09, DT[2,c:=42], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced 'double' RHS to 'logical' to match") test(299.11, DT[2,c:=42L], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced 'integer' RHS to 'logical' to match") test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=FALSE), warning="Coerced 'integer' RHS to 'logical' to match the column's type.*length 3 (nrows of entire table)") # Test bug fix #1468, combining i and by. DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L),key="b")) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) # Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix) # This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which # this tests. But it's well under 10 seconds. 
DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep=""))) test(301, nrow(DT[,sum(B),by=C])==100010) DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test fast assign DT = data.table(a=c(1L,2L,2L,3L),b=4:7,key="a") DT[2,b:=42L] # needs to be on its own line to test DT symbol is changed by reference test(302, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,42L,6L,7L),key="a")) DT[J(2),b:=84L] test(303, DT, data.table(a=c(1L,2L,2L,3L),b=c(4L,84L,84L,7L),key="a")) # Test key is dropped when non-dt-aware packages reorder rows of data.table (for example) if ("package:plyr" %in% search()) { DT = data.table(a=1:10,b=1:2,key="a") test(304, arrange(DT,b), data.table(a=INT(1,3,5,7,9,2,4,6,8,10),b=INT(1,1,1,1,1,2,2,2,2,2))) # testing no key here, too } else { cat("Test 304 not run. If required call library(plyr) first.\n") } # Test that changing colnames keep key in sync. # TO DO: will have to do this for secondary keys, too, when implemented. DT = data.table(x=1:10,y=1:10,key="x") setnames(DT,c("a","b")) test(305, key(DT), "a") setnames(DT,"a","R") test(306, key(DT), "R") setnames(DT,"b","S") test(307, key(DT), "R") setnames(DT,c("a","b")) test(308, key(DT), "a") setnames(DT,1,"R") test(309, key(DT), "R") # Test :=NULL DT = data.table(x=1:5,y=6:10,z=11:15,key="y") test(310, DT[,x:=NULL], data.table(y=6:10,z=11:15,key="y")) # delete first test(311, DT[,y:=NULL], data.table(z=11:15)) # deleting key column also removes key test(312, DT[,z:=NULL], data.table(NULL)) # deleting all test(313, DT[,a:=1:3], error="") # cannot := a new column to NULL data.table, currently. 
Must use data.table() DT = data.table(a=20:22) test(314, {DT[,b:=23:25];DT[,c:=26:28]}, data.table(a=20:22,b=23:25,c=26:28)) # add in series test(315, DT[,c:=NULL], data.table(a=20:22,b=23:25)) # delete last test(316, DT[,c:=NULL], data.table(a=20:22,b=23:25), warning="Adding new column 'c' then assigning NULL") # Test adding, removing and updating columns via [<- in one step DT = data.table(a=1:6,b=1:6,c=1:6) DT[,c("a","c","d","e")] <- list(NULL,11:16,42L,21:26) test(317, DT, data.table(b=1:6,c=11:16,d=42L,e=21:26)) # Other assignments (covers DT[x==2, y:=5] too, #1502) DT[e<24,"b"] <- 99L test(318, DT, data.table(b=c(99L,99L,99L,4L,5L,6L),c=11:16,d=42L,e=21:26)) test(319, DT[b!=99L,b:=99L], data.table(b=99L,c=11:16,d=42L,e=21:26)) # previous within functionality restored, #1498 DT = data.table(a=1:10) test(320, within(DT, {b <- 1:10; c <- a + b})[,list(a,b,c)], data.table(a=1:10,b=1:10,c=as.integer(seq(2,20,length=10)))) # not sure why within makes columns in order a,c,b, but it seems to be a data.frame thing, too. 
# Tests 321-377: transform()/within(), cbind()/rbind() dispatch, column removal, := assignment
# (LHS evaluation, recycling, factor levels), NULL j results by group, list columns,
# sub-assignment through $<- and unique() on unsorted tables.
# Many of these tests modify DT by reference, so the statement order matters.
test(321, transform(DT,b=42L,e=a), data.table(a=1:10,b=42L,e=1:10))
DT = data.table(a=1:5, b=1:5)
test(322, within(DT, rm(b)), data.table(a=1:5))

# check that cbind dispatches on first argument as expected
test(323, cbind(DT,DT), data.table(a=1:5,b=1:5,a=1:5,b=1:5))  # no check.names as from v1.8.0 (now we have :=, cbind is used far less anyway)
test(324, cbind(DT,data.frame(c=1:5)), data.table(a=1:5,b=1:5,c=1:5))
test(325, rbind(DT,DT), data.table(a=c(1:5,1:5),b=1:5))
test(326, rbind(DT,data.frame(a=6:10,b=6:10)), data.table(a=1:10,b=1:10))

# test removing multiple columns, and non-existing ones, #1510
DT = data.table(a=1:5, b=6:10, c=11:15)
test(327, within(DT,rm(a,b)), data.table(c=11:15))
test(328, within(DT,rm(b,c)), data.table(a=1:5))
test(329, within(DT,rm(b,a)), data.table(c=11:15))
test(330, within(DT,rm(b,c,d)), data.table(a=1:5), warning="object 'd' not found")
DT[,c("b","a")]=NULL
test(332, DT, data.table(c=11:15))
test(333, within(DT,rm(c)), data.table(NULL))
DT = data.table(a=1:5, b=6:10, c=11:15)
DT[,2:1]=NULL
test(334, DT, data.table(c=11:15))
test(335, DT[,2:1]<-NULL, error="Attempt to assign to column")

DT = data.table(a=1:2, b=1:6)
test(336, DT[,z:=a/b], data.table(a=1:2,b=1:6,z=(1:2)/(1:6)))
test(337, DT[3:4,z:=a*b], data.table(a=1:2,b=1:6,z=c(1,1,3,8,1/5,2/6)), warning="Coerced 'integer' RHS to 'double' to match the colum")

# test eval of LHS of := (using with=FALSE gives a warning here from v1.9.3)
DT = data.table(a=1:3, b=4:6)
test(338, DT[,2:=42L], data.table(a=1:3,b=42L))
test(339, DT[,2:1:=list(10:12,3L)], data.table(a=3L,b=10:12))
test(340, DT[,"a":=7:9], data.table(a=7:9,b=10:12))
test(341, DT[,c("a","b"):=1:3], data.table(a=1:3,b=1:3))
mycols = "a"
test(342, DT[,(mycols):=NULL], data.table(b=1:3))
mynewcol = "newname"
test(343, DT[,(mynewcol):=21L], data.table(b=1:3,newname=21L))
mycols = 1:2
test(344, DT[,(mycols):=NULL], data.table(NULL))

# It seems that the .Internal rbind of two data.frame coerces IDate to numeric. Tried defining
# "[<-.IDate" as per Tom's suggestion, and c.IDate to no avail (maybe because the .Internal code
# in bind.c doesn't look up package methods?). Anyway, as from 1.8.1, double are allowed in keys, so
# these still work but for a different reason than before 1.8.1: the results are IDate stored as double,
# rather than before when it worked because by and setkey coerced double to integer.
DF = data.frame(x=as.IDate(c("2010-01-01","2010-01-02")), y=1:6)
DT = as.data.table(rbind(DF,DF))
test(345, DT[,sum(y),by=x], {.x=as.IDate(c("2010-01-01","2010-01-02"));mode(.x)="double";data.table(x=.x,V1=c(18L,24L))})
test(346, setkey(DT,x)[J(as.IDate("2010-01-02"))], {.x=as.IDate(rep("2010-01-02",6L));mode(.x)="double";data.table(x=.x,y=rep(c(2L,4L,6L),2),key="x")})

# Test that invalid keys are reset, without user needing to remove key using key(DT)=NULL first
DT = data.table(a=letters[1:3],b=letters[6:4],key="a")
attr(DT,"sorted")="b"  # user can go under the hood
test(347, setkey(DT,b), data.table(a=letters[3:1],b=letters[4:6],key="b"), warning="Already keyed by this key but had invalid row order, key rebuilt")

# Test .N==0 with nomatch=NA|0, # tests for #963 added as well
DT = data.table(a=1:2,b=1:6,key="a")
test(349, DT[J(2:3),.N,nomatch=NA,by=.EACHI]$N, c(3L,0L))
test(350, DT[J(2:3),.N,nomatch=0], c(3L))
# Test first .N==0 with nomatch=NA|0
test(350.1, DT[J(2:3),.N], c(4L))
test(350.2, DT[J(4),.N], 1L)
test(350.3, DT[J(4),.N,nomatch=0L], 0L)
test(350.4, DT[J(4:5),.N,nomatch=0L], 0L)
test(350.5, DT[J(0:4),.N,by=.EACHI]$N, c(0L,3L,3L,0L,0L))
test(350.6, DT[c(0,0,0), .N], 0L)

# Test recycling list() on RHS of :=
DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12)
test(351, DT[,c("a","b"):=list(13:15)], data.table(a=13:15,b=13:15,c=7:9,d=10:12))
test(352, DT[,letters[1:4]:=list(1L,NULL)], data.table(a=c(1L,1L,1L),c=c(1L,1L,1L)))

# Test assigning new levels into factor columns
DT = data.table(f=factor(c("a","b")),x=1:4)
test(353, DT[2,f:="c"], data.table(f=factor(c("a","c","a","b")),x=1:4))
test(354, DT[3,f:=factor("foo")], data.table(f=factor(c("a","c","foo","b")),x=1:4))

# Test growVector logic when adding levels (don't need to grow levels for character cols)
newlevels = as.character(as.hexmode(1:2000))
DT = data.table(f=factor("000"),x=1:2010)
test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevels)),x=1:2010))

DT = data.table(f=c("a","b"),x=1:4)  # Test coercing factor to character column
test(355.5, DT[3,f:=factor("foo")], data.table(f=c("a","b","foo","b"),x=1:4))
test(355.6, DT[4,f:=factor("bar"),verbose=TRUE], data.table(f=c("a","b","foo","bar"),x=1:4), output="Coerced factor to character to match the column")

# See datatable-help post and NEWS item for 1.6.7
DT = data.table(X=factor(letters[1:10]), Y=1:10)
DT$X = "Something Different"
test(356, DT, data.table(X=factor("Something Different",levels=c(letters[1:10],"Something Different")), Y=1:10))
DT = data.table(X=letters[1:10], Y=1:10)
DT$X = "Something Different"
test(356.5, DT, data.table(X="Something Different", Y=1:10))

# Bug fix 1570
DT = data.table(x=1:5,y=1:5)
test(357, DT[x==0, y:=5L], data.table(x=1:5,y=1:5))
test(358, DT[FALSE, y:=5L], data.table(x=1:5,y=1:5))

# Bug fix 1599
DT = data.table(a=1:2,b=1:6)
test(359, DT[,sum(b),by=NULL], data.table(V1=21L))
test(360, DT[,sum(b),by=character(0)], data.table(V1=21L))

# Bug fix 1576 : NULL j results in 'inconsistent types' error
# Each variant returns an empty result for group a==2; that group is expected to be absent from ans.
DT = data.table(a=1:3,b=1:9)
ans = data.table(a=c(1L,3L),V1=c(12L,18L))
test(361, DT[,if (a==2) NULL else sum(b),by=a], ans)
test(362, DT[,if (a==2) data.table(NULL) else sum(b),by=a], ans)
test(363, DT[,if (a==2) as.list(NULL) else sum(b),by=a], ans)
test(364, DT[,if (a==2) integer(0) else sum(b),by=a], ans)

# Test that data.table() can create list() columns directly
# NB: test 235 above ('by' when DT contains list columns) created the list column in two steps, no longer necessary
DT = data.table(a=1:2,b=list("h",7:8))
test(365, DT[1,b], list("h"))  # should it be a special case for 1-item results to unlist? Don't think so: in keeping with no drop=TRUE principle
test(366, DT[2,b], list(7:8))
DT = data.table(a=1:4,b=list("h",7:8),c=list(matrix(1:12,3),data.table(a=letters[1:3],b=list(1:2,3.4,"k"),key="a")))
test(367, DT[3,b], list("h"))
test(368, DT[4,b], list(7:8))
test(369, DT[3,c[[1]][2,3]], 8L)
test(370, DT[4,c[[1]]["b",b]][[1]], 3.4)

# Test returning a list() column via grouping
DT = data.table(x=INT(1,1,2,2,2),y=1:5)
test(371, DT[,list(list(unique(y))),by=x], data.table(x=1:2,V1=list(1:2,3:5)))

# Test matrix i is an error
test(372, DT[matrix(1:2,ncol=2)], error="i is invalid type (matrix)")

# Tests from bug fix #1593
DT = data.table(x=letters[1:2], y=1:4)
DT[x == "a", ]$y <- 0L
test(373, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L)))
DT = data.table(x=letters[1:2], y=1:4, key="x")
DT["a", ]$y <- 0L
test(374, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L), key="x"))
DT = data.table(x=letters[1:2], y=1:4)
DT[c(1,3), ]$y <- 0L
test(375, DT, data.table(x=letters[1:2], y=c(0L,2L,0L,4L)))

# Test unique on unsorted tables (and tolerance on numeric columns, too)
DT = data.table(a=c(2,1,2),b=c(1,2,1))
test(376, unique(DT), data.table(a=c(2,1),b=c(1,2)))
# From the SO thread :
# (random input: the test only checks that unique() gives the same rows for data.frame and data.table)
M = matrix(sample(2, 120, replace = TRUE), ncol = 3)
DF = as.data.frame(M)
DT = as.data.table(M)
test(377, as.data.table(unique(DF)), unique(DT))

# Test compatibility with sqldf. sqldf() does a do.call("rbind" with empty input,
# so this tests ..1 when NULL (which was insufficiently list(...)[[1]] in 1.6.6).
# We now test this directly rather than using sqldf, because we couldn't get 'R CMD check'
# past "(converted from warning) closing unused connection 3 (/tmp/RtmpYllyW2/file55822c52)"
test(378, cbind(), NULL)
test(379, rbind(), NULL)

# Tests 380-384: assigning into .SD inside j is expected to fail with "locked binding",
# while := on a copy() of .SD is expected to succeed.
DT = data.table(a=rep(1:3,1:3),b=1:6)
test(380, DT[,{.SD$b[1]=10L;.SD}, by=a], error="locked binding")  # .SD locked for 1st group
test(381, DT[,{if (a==2) {.SD$b[1]=10L;.SD} else .SD}, by=a], error="locked binding")  # .SD locked in 2nd group onwards too
# test that direct := is trapped, but := within a copy of .SD is allowed (FAQ 4.5). See also tests 556-557.
test(382, DT[,b:=.N*2L,by=a], data.table(a=rep(1:3,1:3),b=rep(2L*(1:3),1:3)))
test(383, DT[,{z=10L;b:=z},by=a], error=":= and `:=`(...) are defined for use in j, once only and in particular ways")
test(384, DT[,{mySD=copy(.SD);mySD[1,b:=99L];mySD},by=a], data.table(a=rep(1:3,1:3),b=c(99L,99L,4L,99L,6L,6L)))

# somehow missed testing := on logical subset with mixed TRUE/FALSE, reported by Muhammad Waliji
DT = data.table(x=1:2, y=1:6)
test(385, DT[x==1, y := x], data.table(x=1:2,y=c(1L,2L,1L,4L,1L,6L)))
test(386.1, DT[c(FALSE,TRUE)], error="i evaluates to.*Recycling of logical i is no longer allowed.*use rep.*[.]N")
test(386.2, DT[rep(c(FALSE,TRUE),length=.N),y:=99L], data.table(x=1:2,y=c(1L,99L,1L,99L,1L,99L)))

# test that column names have the appearance of being local in j (can assign to them ok), bug #1624
DT = data.table(name=c(rep('a', 3), rep('b', 2), rep('c', 5)), flag=FALSE)
test(387, DT[,{flag[1]<-TRUE;list(flag=flag)}, by=name], DT[c(1,4,6),flag:=TRUE])
DT = data.table(score=1:10, name=c(rep('a', 4), rep('b',2), rep('c', 3), 'd'))
# j saves score[1] in ans, then assigns to score[1], then returns ans; the expected V1 is
# each group's first score. The three statements in { } are separated by newlines, not ';'.
test(388, DT[,{ans = score[1]
               score[1] <- -score[1]
               ans
              },by=name], data.table(name=letters[1:4],V1=c(1L,5L,7L,10L)))

# Tests 389-394 (character grouping and sorting) now at the start of this file, so that any
# errors elsewhere show up in the last 13 lines displayed by CRAN checks.
# Tests 395-431: unique()/duplicated() on numeric columns, NULL items in j by group,
# null data.table, := by reference, keys after cbind(), column-pointer over-allocation
# (truelength) and sub-assignment into factor columns.

# Test unique.data.table for numeric columns within tolerance, for consistency with
# unique.data.frame which does this using paste.
old_rounding = getNumericRounding()  # saved so the rounding setting can be restored after tests 397.2/398.2
DT = data.table(a=tan(pi*(1/4 + 1:10)),b=42L)  # tan(...) from example in ?all.equal.
test(395, all.equal(DT$a, rep(1,10)))
test(396, length(unique(DT$a))>1)  # 10 unique values on all CRAN machines (as of Nov 2011) other than mac (5 unique)
# commenting these two as they give different results on os x and linux.
# test(397.1, unique(DT), DT[duplicated(DT)]) # default, no rounding
# test(398.1, duplicated(DT), c(FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE))
setNumericRounding(2L)
test(397.2, unique(DT), DT[1])  # before v1.7.2 unique would return all 10 rows. For stability within tolerance, data.table has its own modified numeric sort.
test(398.2, duplicated(DT), c(FALSE,rep(TRUE,9)))
setNumericRounding(old_rounding)

DT = data.table(a=c(3.142, 4.2, 4.2, 3.142, 1.223, 1.223), b=rep(1,6))
test(399, unique(DT), DT[c(1,2,5)])
test(400, duplicated(DT), c(FALSE,FALSE,TRUE,TRUE,FALSE,TRUE))
DT[c(2,4,5),a:=NA]
test(401, unique(DT), DT[c(1,2,3,6)])
test(402, duplicated(DT), c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE))

# Test NULL columns next to non-NULL, #1633
DT = data.table(a=1:3,b=4:6)
test(403, DT[,list(3,if(a==2)NULL else b),by=a], data.table(a=1:3,V1=3,V2=c(4L,NA_integer_,6L)))
test(404, DT[,list(3,if(a==1)NULL else b),by=a], error="Please use a typed empty vector instead.*such as integer.*or numeric")
test(405, DT[,list(3,if(a==1)numeric() else b),by=a], error="Column 2 of result for group.*integer.*double.*types must be consistent for each group")
test(406, DT[,list(3,if(a==1)integer() else b),by=a], data.table(a=1:3,V1=3,V2=c(NA_integer_,5:6)))

# Test that first column can be list, #1640
test(407, data.table(list(1:2,3:5)), as.data.table(list(list(1:2,3:5))))

# With over-allocation, null data.table has truelength 100. Replaced the calls to structure() in the
# code to new null.data.table(), so test internal function. User may have changed default, so this
# doesn't test "100" explicitly.
test(408, null.data.table(), data.table(NULL))
test(408.5, data.table(), data.table(NULL))

# Test that adding a column using := is fully by reference rather than a shallow copy, #1646
DT = data.table(1:2,3:4)  # list vector truelength 100
DT2 = DT
DT2[,y:=10L]
test(409, DT, DT2)
test(410, DT, data.table(1:2,3:4,y=10L))
DT2[1,V1:=99L]
test(411, DT, DT2)
test(412, DT, data.table(c(99L,2L),3:4,y=10L))

# Test that cbind dispatched to data.table() and retains keys
DT = data.table(x=c("a","b"),y=1:4,key="x")
test(413.1, key(cbind(DT,DT)), NULL)  # key dropped because name "x" ambiguous
DT1 = data.table(z = c(1,2), w = 1:4, key = "z")
test(413.2, key(cbind(DT,DT1)), c("x", "z"))
test(413.3, key(cbind(colA=10:13, DT)), "x")  # data.table() dispatched even though 1st argument isn't data.table
test(413.4, key(cbind(colA=10:17, DT)), NULL)  # DT recycled so key is dropped
test(413.5, key(cbind(colA=1, DT)), "x")  # DT not recycled so key retained
test(414.1, key(cbind(DT,as.data.frame(DT1))), "x")
test(414.2, cbind(as.data.frame(DT),DT1), data.frame(DT,DT1))  # cbind(DF,...) should return a data.frame for consistency with base. Package treemap (at least) depends
# on this in the return() in treepalette().
# Use data.table(DF,DT) if a data.table result is required.

# Test friendly error when := is used in wrong place
test(415, x:=1, error="defined for use in j, once only and in particular ways")

# Somehow never tested that X[Y] is error if X is unkeyed.
DT = data.table(a=1:3,b=4:6)
test(416, DT[J(2)], error="the columns to join by must be specified either using")

# Test shallow copy verbose message from := adding a column, and (TO DO) only when X is NAMED.
DT = data.table(a=1:3,b=4:6)
test(417, alloc.col(DT,3,verbose=TRUE), DT, output="Attempt to reduce allocation from.*to 5 ignored. Can only increase allocation via shallow copy")
# Create DT with datatable.alloccol temporarily set to 1L, then restore the previous options.
old = options(datatable.alloccol=1L)
DT = data.table(a=1:3,b=4:6)
options(old)
DT2 = DT
test(418, length(DT)==2 && truelength(DT)==3)
DT[,c:=7L]  # uses final slot
test(419, DT, DT2)
test(420, length(DT)==3 && truelength(DT)==3 && length(DT2)==3 && truelength(DT2)==3)
test(421, DT[,d:=8L,verbose=TRUE], output="Growing vector of column pointers from")
test(422, length(DT)==4)
test(423, truelength(DT), 1028L)

# Test crash bug fixed, #1656, introduced with the 1.7.0 feature
DT <- data.table(a = factor(c("A", "Z")), b = 1:4)
DT[1,1] <- "Z"
test(424, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
test(425, DT[1,1] <- 1, 1, warning="Coerced 'double' RHS to 'integer'")
test(426, DT, data.table(a=factor(c("A","Z")),b=1:4))
DT[1,1] <- 2L
test(427, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
DT[1,a:="A"]
test(428, DT, data.table(a=factor(c("A","Z","A","Z")),b=1:4))
DT[1,a:=2L]
test(429, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4))
test(430, DT[1,1]<- 3L, NA_integer_, warning="RHS contains 3 which is outside the levels range.*1,2.*of column 1, NAs generated")
test(431, DT[1,1:=4L], data.table(a=factor(c(NA,"Z","A","Z")),b=1:4), warning="RHS contains 4 which is outside the levels range.*1,2.*of column 1, NAs generated")

old = getOption("datatable.alloccol")
options(datatable.alloccol=NULL)  # In this =NULL case, R 3.0.0 returns TRUE rather than the old value.
                                  # Hence split out into separate getOption() first.
                                  # This was an R bug fixed in R 3.1.1.
# Tests 432.1-444: behaviour with the datatable.alloccol option unset, alloc.col()/truelength(),
# and rbind() matching by column name.
# Test 432.1 runs while the option is still NULL (set immediately before this point in the file);
# the saved value 'old' is restored straight afterwards.
test(432.1, data.table(a=1:3,b=4:6), error="Has getOption('datatable.alloccol') somehow become unset?")
options(datatable.alloccol=old)

# simple realloc test
DT = data.table(a=1:3,b=4:6)
test(432.2, truelength(DT), 1026L)
alloc.col(DT,200)  # should have no effect since 200<1024
test(433, truelength(DT), 1026L)
DT = alloc.col(DT,2000)  # test the superfluous DT =
test(434, truelength(DT), 2002L)
DT2 = alloc.col(DT,3000)  # DT changed then DT2 pointed to it
test(435, truelength(DT), 3002L)
test(436, truelength(DT2), 3002L)

# test that alloc.col assigns from within functions too (i.e. to wherever that object is)
DT = data.table(a=1:3,b=4:6)  # tl 1024 now by default
test(437.1, truelength(DT), 1026L)
# Helper: calls alloc.col() on the DT defined outside the function and returns invisibly.
f = function() {
  alloc.col(DT,2042)  # DT isn't local so (via inherits=TRUE) it finds in frame above.
  invisible()
}
f()
test(437.2, truelength(DT), 2044L)

# quick test that [<- over allocates (again) after the copy of length via *tmp*
DT = data.table(a=1:3,b=4:6)
tl = truelength(DT)
DT$foo = 7L
test(438, truelength(DT), tl+1L)  # the (not recommended) $<- calls a new alloc.col, hence tl becomes +1
DT[,"bar"] = 8L
test(439, truelength(DT), tl+2L)
test(440, DT, data.table(a=1:3,b=4:6,foo=7L,bar=8L))

# Test rbind works by colname now, for consistency with base, FR#1634
DT = data.table(a=1:3,b=4:6)
test(441, rbind(DT,list(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(442, rbind(DT,data.frame(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(443, rbind(DT,data.table(a=4L,b=7L)), data.table(a=1:4,b=4:7))
test(444, rbind(DT,list(b=7L,a=4L)), data.table(a=1:4,b=4:7))  # rbind should by default check row names. Don't warn here. Add clear documentation instead.
# Tests 445-488: rbind() by column name (continues with the DT = data.table(a=1:3,b=4:6) defined
# for test 441), merge/join regressions, CJ(), list-column assignment, := recycling and
# error recovery, joins where i's key is longer than x's, and reshape().
test(445, rbind(DT,data.frame(b=7L,a=4L)), data.table(a=1:4,b=4:7))
test(446, rbind(DT,data.table(b=7L,a=4L)), data.table(a=1:4,b=4:7))
test(450, rbind(DT,list(c=4L,a=7L)), error="This could be because the items in the list may not ")
test(451, rbind(DT,data.frame(c=4L,a=7L)), error="This could be because the items in the list may not ")
test(452, rbind(DT,data.table(c=4L,a=7L)), error="This could be because the items in the list may not ")
test(453, rbind(DT,list(4L,7L)), data.table(a=1:4,b=4:7))
# Test new use.names argument in 1.8.0
test(453.1, rbind(DT,list(FOO=4L,BAR=7L),use.names=FALSE), data.table(a=1:4,b=4:7))
test(453.2, rbind(DT,data.table(b=4:5,a=7:8), use.names=FALSE), data.table(a=1:5,b=4:8))

# Test the linked reported bug, #1645
A1 = data.table(b='hello', a='foo', key='a')
A2 = data.table(a=c('foo', 'bar'), key='a')
test(454, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a"))
A1 = data.table(a='foo', b='hello', key='a')
test(455, merge(A1, A2, all.y=TRUE, by='a'), data.table(a=c("bar","foo"),b=c(NA,"hello"),key="a"))

# Test mixing nomatch=0 and mult="last", bug #1661
DT = data.table(id=c(1L, 2L, 2L, 3L), val=1:4, key="id")
test(456, DT[J(c(1,2,4)), mult="last", nomatch=0], data.table(id=1:2,val=c(1L,3L),key="id"))

# Test join inherited scope respects nomatch=0, #1663
DT2 = data.table(id=c(1L,2L,4L), val2=c(11,12,14),key="id")
test(457, DT[DT2, list(val, val2), nomatch=0, by=.EACHI], data.table(id=c(1L,2L,2L),val=1:3,val2=c(11,12,12),key="id"))

# Test bysameorder edge cases, #1631
DT = data.table(a=1:3,v=4:9,key="a")
test(458, DT[,sum(v),by=list(a%%2L)], data.table(a=c(1L,0L),V1=c(26L,13L)))
test(459, DT[, list(sum(v)), list(ifelse(a == 2, NA, 1L))], data.table(ifelse=c(1L,NA_integer_),V1=c(26L,13L)))
test(460, DT[, list(sum(v)), list(ifelse(a == 2, 1, NA))], data.table(ifelse=c(NA_real_,1),V1=c(26L,13L)))
test(461, DT[,sum(v),by=a], data.table(a=1:3,V1=c(11L,13L,15L),key="a"))

# Test loading from file (which resets tl to 0 in R 2.14.0+, and uninitialized random number in 2.13.2-)
# DT is saved to a temporary file and loaded back before := is used; the file is removed afterwards.
f = tempfile()
save(list="DT",file=f)
load(f)
test(462, DT[,foo:=10L], data.table(a=1:3,v=4:9,foo=10L,key="a"))
unlink(f)

# Test CJ problems with v1.7.4, #1689
test(463, all(sapply(CJ(1:2,1:3),length)==6L))
DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y")
test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA))
test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4))
f1 = factor(c("READING","MATHEMATICS"))
f2 = factor(c("2010_2011","2009_2010","2008_2009"), levels=paste(2006:2010,2007:2011,sep="_"))
test(466, all(sapply(CJ(f1, f2),length)==6L))

# Test list(.SD,newcol=..) gives error with guidance
DT = data.table(a=1:2,v=3:6)
test(467, DT[,list(newcol=7L,.SD),by=a], error="Error.*use := by group instead")

# Test empty list column
DT = data.table(a=1:3,b=4:6)
test(468, DT[,foo:=list()], data.table(a=1:3,b=4:6,foo=list()))
# Test plonk list
test(469, DT[,bar:=list(1,"a",3.14)], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14)))
# Test plonk list variable (to catch deparse treating j=list() specially)
x = list(2,"b",2.718)
test(470, DT[,baz:=x], data.table(a=1:3,b=4:6,foo=list(),bar=list(1,"a",3.14),baz=list(2,"b",2.718)))
# Test recycling list
DT = data.table(a=1:4,b=5:8)
test(471, DT[,foo:=list("a",2:3)], data.table(a=1:4,b=5:8,foo=list("a",2:3,"a",2:3)))
# Test recycling singleton list
DT[,foo:=NULL]
test(472, DT[,foo:=list(list(2:3))], data.table(a=1:4,b=5:8,foo=list(2:3,2:3,2:3,2:3)))

# Test adding new column with a recycled factor, #1691
DT = data.table(a=1:4,b=5:8)
DT[,c:=factor("a")]
test(473, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a"))))
DT[,d:=factor(c("a","b"))]
test(474, DT, data.table(a=1:4,b=5:8,c=factor(c("a","a","a","a")),d=factor(c("a","b","a","b"))))

# Test scoping error introduced at 1.6.1, unique(DT) when key column is 'x'
DT=data.table(x=c("a", "a", "b", "b"), y=c("a", "a", "b", "b"), key="x")
test(475, unique(DT, by=key(DT)), data.table(x=c("a","b"),y=c("a","b"),key="x"))

# Test character and list columns in tables with many small groups
# (random input: only the number of result rows, one per grp value, is checked)
N = if (.devtesting) 1000L else 100L
DT = data.table(grp=1:(2*N),char=sample(as.hexmode(1:N),4*N,replace=TRUE),int=sample(1:N,4*N,replace=TRUE))
ans = DT[,list(p=paste(unique(char),collapse=","), i=list(unique(int))), by=grp]
test(476, nrow(as.matrix(ans)), 2L*N)  # The as.matrix triggers the "'getCharCE' must be called on a CHARSXP", or similar symptom of earlier corruption, before fix in dogroups.c.

# Test that plonking from calling scope works, even after removing, and column copy via := is ok too.
DT = data.table(a=1:3)
foo = 4:6
DT[,foo:=foo]
rm(foo)
gc()
DT[,foo2:=foo]
DT[2,foo:=10L]
DT[3,foo2:=11L]
gc()
test(477, DT, data.table(a=1:3,foo=c(4L,10L,6L),foo2=c(4L,5L,11L)))
test(478, DT[,foo:=foo], DT)  # does nothing, with no warning, consistent with base R `a<-a`.

# Test that recycling now works with oversized inputs and % != 0 length, both with warnings.
DT = data.table(x=1:4)
test(479, DT[, a:=5:7], data.table(x=1:4,a=c(5:7,5L)), warning="Supplied 3 items to be assigned to 4 items of column 'a' (recycled leaving remainder of 1 items)")

# Test that multiple columns can be added
DT = data.table(x=1:4)
test(481, DT[, c("foo","bar"):=list(10L,11:14)], data.table(x=1:4,foo=10L,bar=11:14))
# and combined with update and add in one step
test(482, DT[, c("foo","baz"):=list(12L,15:18)], data.table(x=1:4,foo=12L,bar=11:14,baz=15:18))

# Test that errors in := do not leave DT in bad state, #1711
DT = data.table(x=1:4)
test(483.1, DT[,c("foo","bar"):=list(20L,stop('user error'))], error="user error")
test(483.2, DT, data.table(x=1:4))  # i.e. DT as it was before, without foo being added as it did in v1.7.7-
# The test used to be as follows but as from v1.9.8, the empty numeric() now works and creates a NA_real_ column
test(484, DT[,c("foo","bar"):=list(20L,numeric())], data.table(x=1:4, foo=20L, bar=NA_real_))

# Test i's key longer than x's
d1 <- data.table(a=1:2, b=11:14, key="a,b")
d2 <- data.table(A=0:1, B=1:4, key="A")
test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A"))
test(486, d2[d1,sum(B),by=.EACHI], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A"))  # no allow.cartesian needed due to by-without-by

# Tests 487-488 only run when package reshape is attached; otherwise a note is printed.
if ("package:reshape" %in% search()) {
  DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9)
  test(487, data.table(reshape(DT, idvar="ID", timevar="TIME", direction="wide")), data.table(ID=1:3,X.1=INT(1,4,7),X.2=INT(2,5,8),X.3=INT(3,6,9)))  # The data.table() around reshape is to drop reshape's attributes.
  DT <- data.table(ID=rep(1:3, each=3), TIME=rep(1:3, 3), X=1:9, Y=10:18)
  test(488, data.table(reshape(DT, idvar="ID", timevar="TIME", direction="wide")), data.table(ID=1:3,X.1=INT(1,4,7),Y.1=INT(10,13,16),X.2=INT(2,5,8),Y.2=INT(11,14,17),X.3=INT(3,6,9),Y.3=INT(12,15,18)))
} else {
  cat("Tests 487 and 488 not run. If required call library(reshape) first.\n")
}

# Test warnings for names<- and colnames<-, but only warnings when caller is data.table aware.
# Tests 489-510: names<-/colnames<-, setnames() bounds errors, setcolorder(), nomatch in joins,
# merge() on unkeyed tables, the i. prefix in j, and := after key<-.
DT = data.table(a=1:3,b=4:6)
# A warning matching "Please upgrade" is expected only when running under R older than 3.1.0.
test(489, names(DT)[1]<-"A", "A", warning=if (base::getRversion()>="3.1.0") NULL else "Please upgrade")
test(490, names(DT), c("A","b"))
test(491, colnames(DT)[2]<-"B", "B", warning=if (base::getRversion()>="3.1.0") NULL else "Please upgrade")
test(492, names(DT), c("A","B"))

# Check setnames out of bounds errors
test(493, setnames(DT,"foo","bar"), error="not found.*foo")
test(494, setnames(DT,3,"bar"), error="outside range.*3")

# Test new function setcolorder()
DT = data.table(a=1:2,b=3:4,c=5:6)
test(495, setcolorder(DT,c(2,1,3)), data.table(b=3:4,a=1:2,c=5:6))
test(496, setcolorder(DT,c(2,1,3)), data.table(a=1:2,b=3:4,c=5:6))  # same swap applied to the already reordered DT gives a,b,c again
test(497, setcolorder(DT,c("c","a","b")), data.table(c=5:6,a=1:2,b=3:4))
test(498, setcolorder(DT,"a"), error="neworder is length")
test(498.1, setcolorder(DT,c("d","a","b")), error="Names in neworder not found in x: d")

# test first group listens to nomatch when j uses join inherited scope.
x <- data.table(x=c(1,3,8),x1=10:12, key="x")
y <- data.table(x=c(3,8,10),y1=10:12, key="x")
test(499, y[x,x1,nomatch=0,by=.EACHI], data.table(x=c(3,8),x1=11:12, key="x"))
test(500, y[x,x1,nomatch=NA,by=.EACHI], data.table(x=c(1,3,8),x1=10:12, key="x"))

# Test merge bug of unkeyed tables introduced in 1.6.8 and 1.6.9 reported by Eric, and ...
# (the data.table merge, with its key removed, is compared against merge() on plain data.frames)
dt1 <- data.table(l = factor(c("a","b","a","b")))
dt2 <- data.table(l = factor(c("a","b")), L = factor(c("A","B")))
test(501, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l")))
dt1 <- data.table(l = c("a","b","a","b"))
dt2 <- data.table(l = c("a","b"), L = c("A","B"))
test(501.5, setkey(merge(dt1,dt2,by="l"),NULL), as.data.table(merge(as.data.frame(dt1), as.data.frame(dt2), by="l")))
# ... similar example from DM
dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17)
dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21)
test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE), data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8), A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k"))
test(503, dtA$i, 1:8)  # check that merge didn't change the order of dtA by reference
test(504, dtB$k, 1:4)  # or dtB

# Test new i. JIS prefix in 1.7.10
DT = data.table(a=1:2,b=1:4,key="a")
test(505, DT[J(a=1,b=6),sum(i.b*b),by=.EACHI]$V1, 24)  # 24 now 'double' because i.b is 'double'

# Test := after a key<-
DT = data.table(a=3:1,b=4:6)
test(506, key(DT)<-"a", "a", warning="can copy the whole table")
test(508, DT, data.table(a=1:3,b=6:4,key="a"))
test(509, DT[,b:=10L], data.table(a=1:3,b=10L,key="a"))
test(510, DT[,c:=11L], data.table(a=1:3,b=10L,c=11L,key="a"))
# Used to be warning about invalid .internal.selfref detected and fixed. As from v1.8.3 data.table() returns a NAMED==0 object, and key<- appears not to copy that. But within functions, key<- would still copy. TO DO: add tests....
#test(511,) # Test new functons chmatch and %chin% y=letters x=c(sample(letters,12),"foo","bar") test(512, chmatch(x,y), match(x,y)) test(513, chmatch(x,y,nomatch=0), match(x,y,nomatch=0)) test(514, x %chin% y, x %in% y) # Test new function set() in v1.8.0 DT = data.table(a=1:3,b=4:6) test(515, set(DT,2,1,3), data.table(a=c(1L,3L,3L),b=4:6), warning="Coerced i") test(516, set(DT,"2",1,3), error="i is type 'character'") test(517, set(DT,2L,1,3), DT, warning="Coerced j") # FR #2551 implemented - removed warning from 518 # test(518, set(DT,2L,1L,3), DT, warning="Coerced 'double' RHS to 'integer'") test(518, set(DT,2L,1L,3), DT) test(519, set(DT,2L,1L,3L), data.table(a=INT(1,3,3),b=4:6)) test(520, set(DT,2L,"a",2L), data.table(a=1:3,b=4:6)) test(521, set(DT,2:3,"b",7:8), data.table(a=1:3,b=INT(4,7,8))) test(522, set(DT,2L,"foo",7L), data.table(a=1:3,b=INT(4,7,8), foo=INT(NA,7,NA))) # error="foo.*is not a column name[.] Cannot add columns with set.*use := instead") test(523, set(DT,2L,c("a","a"),list(9L,10L)), error="Can't assign to the same column twice in the same query (duplicates detected).") test(523.1, set(DT,2L,"a",10L), data.table(a=INT(1,10,3),b=INT(4,7,8), foo=INT(NA,7,NA))) setkey(DT,b) test(524, set(DT,2L,"a",2L), data.table(a=1:3, b=INT(4,7,8), foo=INT(NA,7,NA), key="b")) test(525, set(DT,1L,"b",6L), data.table(a=1:3, b=6:8, foo=INT(NA,7,NA))) test(525.1, set(DT,j="b",value=9:11), data.table(a=1:3, b=9:11, foo=INT(NA,7,NA))) # plonk syntax via missing i (fixed in 1.8.1) test(525.2, set(DT,NULL,"b",12:14), data.table(a=1:3, b=12:14, foo=INT(NA,7,NA))) # plonk syntax via NULL i # NEW ADDITIONAL TESTS FOR set() - bug #2077 - for using set to add columns by reference DT1 <- data.table(x = 1, y = 1:10, fac = sample(LETTERS[1:3], 10, replace = TRUE)) # from SO DT2 <- copy(DT1) mul=c(5.3,2.8) for (j in seq_along(mul)) set(DT1, i=NULL, j=paste("dot", j, sep=""), mul[j]*DT1[[j]]) DT2[, `:=`(dot1=5.3*x, dot2=2.8*y)] test(1096.1, DT1, DT2) set(DT1, i=NULL, j="dot2", 
value=NULL) # remove "dot2" test(1096.2, DT1, DT2[, list(x,y,fac, dot1)]) DT2[, dot2 := NULL][5:9, `:=`(bla1 = 0L, x = 3L, bla2 = 2L)] set(DT1, i=5:9, j=c("bla1", "x", "bla2"), value=list(0L, 3L, 2L)) test(1096.3, DT1, DT2) # more testing with many columns including existing columns test(1096.4, set(DT1, i=NULL, j=7L, value=5L), error="Item 1 of column numbers in j is 7 which is outside range.*1.*6.*Use column names instead in j to add new columns.") # Test that data.frame incompability is fixed, came to light in Feb 2012 DT = data.table(name=c('a','b','c'), value=1:3) test(526, base::droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # base:: because we'll implement a fast droplevels, too. if ("package:nlme" %in% search()) { # commented out to be consistent with base R, as #1078 and #1128 are more common cases.. # until we can find a workaround for this, I'm commenting this one.. # Search for "Fix for #1078" for the tests.. # test(527, {x=Orthodont;tt=lme(distance ~ age, data=x); tt[["data"]]=NULL; tt}, # {x=as.data.table(Orthodont);tt=lme(distance ~ age, data=x);tt[["data"]]=NULL;tt}) test(528, {x=iris;tt=groupedData( Sepal.Length ~ Sepal.Width | Species, data=x);attr(tt,"class")=NULL;attr(tt,"FUN")=NULL;tt}, {x=as.data.table(iris);tt=groupedData( Sepal.Length ~ Sepal.Width | Species, data=x);attr(tt,"class")=NULL;attr(tt,"FUN")=NULL;attr(tt,".internal.selfref")=NULL;tt}) } # Speed test of chmatch vs match. # sortedmatch was 40 times slower and the wrong approach, removed in v1.8.0. # Example from Tom in Jan 2011 who first found and raised the issue with sortedmatch. 
if (.timingtests) { cat("Running 30sec (max) test ...");flush.console() n = 1e6 a = as.character(as.hexmode(sample(n,replace=TRUE))) b = as.character(as.hexmode(sample(n,replace=TRUE))) test(529, system.time(ans1<-match(a,b))["user.self"] > system.time(ans2<-chmatch(a,b))["user.self"]) test(530, ans1, ans2) # sorting a and b no longer makes a difference since both match and chmatch work via hash in some way or another cat("done\n") } # Test that .set_row_names() is maintained on .SD for each group DT = data.table(a=INT(1,1,2,2,2,3,3,3,3),b=1:9) test(531, DT[,length(rownames(.SD)),by=a], data.table(a=1:3,V1=2:4)) # Test column names with spaces, bug#1880, and check.names default is now FALSE, too # Thanks to Yang Zhang for the tests. DT = data.table("a b"=INT(1,1,2,2,2),c=1:5) test(532, DT[,sum(c),by="a b"], data.table("a b"=1:2,V1=c(3L,12L))) test(533, names(data.table('a b'=1)[, list('c d'=`a b`)]), "c d") test(534, names(transform(data.table('a b'=1), `c d`=`a b`)), c("a b","c d")) # Test keyby, new in v1.8.0 DT = data.table(a=INT(1,3,1,2,3,2),b=1:2,c=1:3,v=1:6) test(535, DT[,sum(v),by=a, keyby=a], error="not both") test(536, DT[,sum(v),by=a], data.table(a=c(1L,3L,2L),V1=c(4L,7L,10L))) # retains appearance order ans = data.table(a=1:3,V1=c(4L,10L,7L),key="a") test(537, DT[,sum(v),keyby=a], ans) test(538, DT[,sum(v),keyby="a"], ans) var="a" test(539, DT[,sum(v),keyby=eval(var)], ans) a=quote(a%%2L) test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) test(545, DT[,sum(v),by=a%%2L], data.table(a=1:0,V1=c(12L,14L))) test(546, 
DT[,sum(v),by=month(date)], data.table(month=2:3,V1=c(11L,15L))) # Test that factor levels no longer need to be sorted, and that 'ordered' class is retained. # Posted by Allan Engelhardt ... x = factor(LETTERS[1:3], levels=rev(LETTERS), ordered=TRUE) DT = data.table(A=x,B=x,v=1:3, key="A") test(547,is.ordered(DT$A) && is.ordered(DT$B)) test(548.1, DT["A",v,verbose=TRUE], output="Coercing character column i.'V1' to factor to match type of x.'A'") # msg back to i.V1 after a change to i.A for FR #2693. That still works, just differently which no longer overwrites names(i) test(548.2, DT["A",v],1L) # Posted by Damian Betebenner ... set.seed(123) my.course.sample = sample(1:5, 10, replace=TRUE) Y = factor(my.course.sample, levels=1:5, labels=c("Basic Math", "Calculus", "Geometry", "Algebra I", "Algebra II")) DT = data.table(ID=1:10, COURSE=Y) test(549, DT[,sum(ID),by=COURSE]$V1, INT(1,2,29,17,6)) setkey(DT, COURSE) test(550, DT[,sum(ID),by=key(DT)]$V1, INT(6,1,29,2,17)) # Another test of DT[i] syntax from datatable-unaware packages, #1794 from ilprincipe. 
DF = structure(list(sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("panel.1yr", "panel.2yr", "panel.3yr", "panel.inc", "pre.inc", "pre.prev", "post.inc", "post.prev"), class = "factor"), base = c(2003, 2003, 2003, 2003, 2003, 2003, 2003, 2003, 2002, 2002, 2002, 2002, 2002, 2002, 2002, 2002), ref = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("2004", "2002-2004", "2001", "2000", "2009", "2008"), class = "factor"), var = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("distance", "time"), class = "factor"), treated = c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1), distance = c(10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05, 10000, 30000, 50000, 1e+05), all = c(602L, 6357L, 8528L, 9272L, 435L, 2438L, 3456L, 6360L, 245L, 2693L, 3699L, 4084L, 187L, 983L, 1400L, 2660L), di.recip = c(5L, 39L, 57L, 62L, 4L, 16L, 22L, 45L, 2L, 25L, 36L, 37L, 1L, 11L, 16L, 35L), irr = c(0.00830564784053156, 0.00613496932515337, 0.00668386491557223, 0.00668679896462468, 0.00919540229885057, 0.00656275635767022, 0.00636574074074074, 0.00707547169811321, 0.00816326530612245, 0.00928332714444857, 0.0097323600973236, 0.00905974534769833, 0.0053475935828877, 0.0111902339776195, 0.0114285714285714, 0.0131578947368421)), .Names = c("sample", "base", "ref", "var", "treated", "distance", "all", "di.recip", "irr"), row.names = c(NA, 16L), class = "data.frame") DT = as.data.table(DF) test(551, nrow(reshape(DT, v.names = c("all", "di.recip", "irr"), timevar = "treated", idvar = c("sample", "var", "distance"), direction = "wide" )), 8L) # Test bug report #1275 from S Bagley : DT = data.table(a=c("1","1"), b=c(2,2)) test(552, is.character(DT$a)) test(553, unique(DT), data.table(a="1",b=2)) # Test bug #1726 from Ivan Zhang. 
DT = data.table(V1=c('a', 'b', 'a'), V2 = c('hello', 'ello', 'llo')) test(554, nrow(DT[V1=='a' & V2 %like% 'll']), 2L) test(555, nrow(DT[V1=='a' & V2 %like% 'ello']), 1L) # Test can't := to .SD, #1727 DT = data.table(x = 1:5, y = rnorm(5)) test(556, DT[,.SD[,z:=rnorm(1)],by=x], error="[.]SD is locked.*reserved for possible future use") f = function(.SD) .SD[,z:=rnorm(1)] test(557, DT[, f(.SD), by=x], error="[.]SD is locked.*reserved for possible future use") # Test printing on nested data.table, bug #1803 DT = data.table(x=letters[1:3],y=list(1:10,letters[1:4],data.table(a=1:3,b=4:6))) test(558, capture.output(print(DT)), c(" x y","1: a 1,2,3,4,5,6,","2: b a,b,c,d","3: c ")) test(559, setkey(DT,x)["a",y][[1]], 1:10) # y is symbol representing list column, specially detected in dogroups # Test renaming of .N to N DT = data.table(a=INT(1,1,2,2,2),b=INT(1,2,2,2,1)) test(560.1, DT[,.N,a][,.N], 2L) test(560.2, DT[,.N,a][,N], 2:3) test(561, DT[,.N,a][,N], 2:3) test(562, DT[,list(.N),a][,N], 2:3) test(563, DT[,.N,a][,unique(.N),a]$V1, c(1L,1L)) test(564, DT[,.N,a][,unique(N),a]$V1, 2:3) test(565, DT[,.N,a][N>2], data.table(a=2L, N=3L)) test(566, DT[,list(.N=.N),a][.N>2], data.table(a=2L,.N=3L)) test(567, DT[,.N,list(a,b)][,N,by=a]$N, c(1L,1L,2L,1L)) test(568, DT[,.N,list(a,b)][,unique(N),by=a]$V1, c(1L,2L,1L)) test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be grouped because") test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") # Test spaces in by="..." 
format, datatable-help on 31 March DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) test(573, DT[,sum(v),by="b, a"], error="object ' a' not found") # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) test(574, dim(unname(DT)), 3:2) # Test that CJ retains explicit names (useful if used independently) test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y")) test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y")) test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2")) # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") Y = data.table(a=factor(c("D","B"),levels=rev(LETTERS)),key="a") test(578, X[Y,verbose=TRUE], output="Coercing factor column i.'a' to character to match type of x.'a'") test(579, X[Y], data.table(a=c("D","B"), v=c(4L,2L))) # Test that logical i in set() returns helpful error DT = data.table(a=1:3,b=4:6) test(580, set(DT,a<3,"b",0L), error="simply wrap with which(), and take the which() outside the loop if possible for efficiency") # Test by on empty tables (and when i returns no rows), #1945 DT = data.table(a=1:3,v=1:6) test(581, DT[a<1,sum(v),by=a], data.table(a=integer(),V1=integer())) test(582, DT[a<1,sum(v),by=list(a)], data.table(a=integer(),V1=integer())) test(583, DT[a<1], DT[0]) test(584, DT[a<1], output="Empty data.table (0 rows) of 2 cols: a,v") test(585, DT[a<1,list(v)], output="Empty data.table (0 rows) of 1 col: v") test(586, data.table(a=integer(),V1=integer()), output="Empty data.table (0 rows) of 2 cols: a,V1") # Test that .N is available in by on empty table, also in #1945 
test(587, DT[a<1,list(sum(v),.N),by=a], data.table(a=integer(),V1=integer(),N=integer())) # Realised that DT[NULL] returned an error. test(588, DT[NULL], data.table(NULL)) # Test that .N, .SD and .BY are available when by is missing and when by is 0 length DT = data.table(x=rep(1:3,each=3), y=c(1,3,6), v=1:9) test(589, DT[,sapply(.SD,sum)*.N], c(x=162, y=270, v=405)) test(590, DT[,sapply(.SD,sum)*.N,by=NULL], data.table(V1=c(162,270,405))) test(591, DT[,sapply(.SD,sum)*.N,by=character()], data.table(V1=c(162,270,405))) test(592, DT[,sapply(.SD,sum)*.N,by=""], data.table(V1=c(162,270,405))) test(593, DT[,lapply(.SD,sum)], data.table(x=18L, y=30, v=45L)) # bug fix #2263 in v1.8.3: now data.table result for consistency test(594, DT[,lapply(.SD,sum),by=NULL], data.table(x=18L, y=30, v=45L)) test(595, DT[,lapply(.SD,sum),by=character()], data.table(x=18L, y=30, v=45L)) test(596, DT[,lapply(.SD,sum),by=""], data.table(x=18L, y=30, v=45L)) # Test keys of two numeric columns, bug#2004 DT = data.table(x=0.0,y=c(0.0,0.1,0.0,0.2,0.0)) test(597, unique(DT), DT[c(1,2,4)]) test(598, DT[,list(count=.N),by=c("x","y")], data.table(x=0.0,y=c(0.0,0.1,0.2),count=c(3L,1L,1L))) # And that numeric NAs sort stably to the beginning. Whether NAs are allowed in keys, another issue but DT = data.table( c(1.34, 1.34, 1.34, NA, 2.22, 2.22, 1.34, NA, NA, 1.34, 0.999), c(75.1, NA, 75.1, 75.1, 2.3, 2.4, 2.5, NA, 1.1, NA, 7.9 )) test(599, DT[c(8,9,4,11,2,10,7,1,3,5,6)], setkey(setkey(DT),NULL)) set.seed(1) DT = data.table(x=rep(c(1,2), each=10), y=rnorm(20)) setkey(DT, x, y) test(600, is.sorted(DT$x)) test(601, !is.sorted(DT$y)) test(602, base::order(DT$x,DT$y), 1:20) # Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) 
etc, #2000 N = if (.devtesting) 1e6 else 1e4 DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MB in dev testing, but test with N=1e7 if (.devtesting) test(603, system.time(sapply(DT,class))["user.self"] < 0.1) # Tests on loopability, i.e. that overhead of [.data.table isn't huge, as in speed example in example(":=") # These are just to catch slow down regressions where instead of 1s it takes 40s if (.devtesting) { # TO DO: find more robust way to turn these on for CRAN checks test(604, system.time(for (i in 1:1000) nrow(DT))["user.self"] < 0.5) test(605, system.time(for (i in 1:1000) ncol(DT))["user.self"] < 0.5) test(606, system.time(for (i in 1:1000) length(DT[[1L]]))["user.self"] < 0.5) # much faster than nrow, TO DO: replace internally } # TO DO: move to stress test script off CRAN ... # DT = as.data.table(matrix(1L,nrow=100000,ncol=100)) # test(607, system.time(for (i in 1:1000) DT[i,V1:=i])["user.self"] < 10) # 10 to be very wide margin for CRAN # test(608, DT[1:1000,V1], 1:1000) # Crash bug of chorder(character()), #2026 test(609, chorder(character()), base::order(character())) test(610, chorder(""), base::order("")) # Extra tests of chorder and chgroup x = sample(LETTERS) test(610.1, chorder(x), base::order(x)) test(610.2, chgroup(x), seq_along(x)) x = sample(LETTERS,1000,replace=TRUE) test(610.3, chorder(x), base::order(x)) test(610.4, unique(x[chgroup(x)]), unique(x)) # := by group DT = data.table(a=1:3,b=(1:9)/10) test(611, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) test(612, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Assign to subset ok (NA initialized in the other items) ok : test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Combining := by group with i test(616, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) test(617, 
DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] test(618, copy(DT)[a>3,r:=sum(b)], ans) test(619, copy(DT)[J(-1),r:=sum(b)], ans) test(620.1, copy(DT)[NA,r:=sum(b)], ans) test(620.2, copy(DT)[0,r:=sum(b)], ans) test(620.3, copy(DT)[NULL,r:=sum(b)], null.data.table()) DT = data.table(x=letters, key="x") test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) test(623, DT[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Tests on POSIXct attributes DT = data.table(a=c(1,1,2,2,2)) test(624, attributes(DT[,as.POSIXct("2011-12-13 18:50",tz="EST"),by=a][[2]]), list(class=c("POSIXct","POSIXt"),tzone="EST")) DT = data.table(x = rnorm(5)) DT$time1 <- Sys.time() # recycle via *tmp* DT$time2 <- rep(Sys.time(), 5) # plonk via *tmp* DT[,time3:=Sys.time()] # recycle DT[,time4:=rep(Sys.time(),5)] # plonk test(625, all(sapply(DT,is,"POSIXct")[-1])) # unique on ITime doesn't lose attributes, #1719 t = as.ITime(strptime(c("09:10:00","09:11:00","09:11:00","09:12:00"),"%H:%M:%S")) test(626, unique(t), t[c(1,2,4)]) test(627, class(unique(t)), "ITime") # Test recycling list() rbind - with recent C-level changes, this seems not possible (like rbindlist) # old test commented. 
# test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), # data.table(a=c(1:3,rep(4L,3L)),b=c(5:7,rep(8L,3L)),c=list(1:2,1:3,1:4,1L,2L,3L))) test(628, rbind(data.table(a=1:3,b=5:7,c=list(1:2,1:3,1:4)), list(4L,8L,as.list(1:3))), error = "inconsistent with first column of that item which is length") # Test switch in .rbind.data.table for factor columns test(628.5, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list(4L,factor("d"),factor("bar"))), data.table(a=1:4,b=factor(letters[1:4]),c=factor(c(rep("foo",3),"bar"), levels = c("foo", "bar")))) # Test merge with common names and all.y=TRUE, #2011 DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a") DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a") # 629+630 worked before anyway. 631+632 test the bug fix. adf=as.data.frame adt=as.data.table test(629, merge(DT1,DT2), data.table(a=c(3,5),total.x=c(1,1),total.y=c(1,2),key="a")) test(629.1, merge(DT1,DT2), setkey(adt(merge(adf(DT1),adf(DT2),by="a")),a)) test(630, merge(DT1,DT2,all.x=TRUE), data.table(a=c(1,3,4,5),total.x=c(2,1,3,1),total.y=c(NA,1,NA,2),key="a")) test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.x=TRUE)),a)) test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a")) test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a)) test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a")) test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) # Test that unsettting datatable.alloccol is caught, #2014 old = getOption("datatable.alloccol") options(datatable.alloccol=NULL) # search above for R bug fix in 3.1.1 - why split into getOption first here. 
test(633, data.table(a=1:3), error="n must be integer length 1") options(datatable.alloccol=old) # Test that with=FALSE by number isn't messed up by dup column names, #2025 DT = data.table(a=1:3,a=4:6) test(634, DT[,2:=200L], data.table(a=1:3,a=200L)) # Test names when not all items are named, #2029 DT = data.table(x=1:3,y=1:3) test(635, names(DT[,list(x,y,a=y)]), c("x","y","a")) test(636, names(DT[,list(x,a=y)]), c("x","a")) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. set.seed(1) DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a") test(637, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a")) test(638, key(DT[J(43L),a:=99L]), NULL) setkey(DT,a) test(639, key(DT[,a:=99L,by=a]), NULL) # Test printing is right aligned without quotes etc, and rownames are repeated ok for more than 20 rows DT=data.table(a=8:10,b=c("xy","x","xyz"),c=c(1.1,22.1,0)) test(640, capture.output(print(DT)), c(" a b c","1: 8 xy 1.1","2: 9 x 22.1","3: 10 xyz 0.0")) DT=data.table(a=letters,b=1:26) test(641, tail(capture.output(print(DT[1:20])),2), c("19: s 19","20: t 20")) test(642, tail(capture.output(print(DT[1:21])),2), c("21: u 21"," a b")) DT=data.table(a=as.character(as.hexmode(1:500)), b=1:500) test(643, capture.output(print(DT)), c(" a b"," 1: 001 1"," 2: 002 2"," 3: 003 3"," 4: 004 4"," 5: 005 5"," --- ","496: 1f0 496","497: 1f1 497","498: 1f2 498","499: 1f3 499","500: 1f4 500")) # Test inconsistent length of columns error. DT = list(a=3:1,b=4:3) setattr(DT,"class",c("data.table","data.frame")) test(644, setkey(DT,a), error="Column 2 is length 2 which differs from length of column 1 (3)") test(645, setkey(DT,b), error="Column 2 is length 2 which differs from length of column 1 (3)") # Test faster mean. Example from (now not needed as much) data.table wiki point 3. # Example is a lot of very small groups. 
set.seed(100) n=1e5 # small n so as not to overload daily CRAN checks. DT=data.table(grp1=sample(1:750, n, replace=TRUE), grp2=sample(1:750, n, replace=TRUE), x=rnorm(n), y=rnorm(n)) DT[c(2,5),x:=NA] # seed chosen to get a group of size 2 and 3 in the first 5 to easily inspect. DT[c(3,4),y:=NA] tt1 = system.time(ans1<-DT[,list(mean(x),mean(y)),by=list(grp1,grp2)]) # 1.1s tt2 = system.time(ans2<-DT[,list(.Internal(mean(x)),.Internal(mean(y))),by=list(grp1,grp2)]) # 1.1s basemean = base::mean # to isolate time of `::` itself tt3 = system.time(ans3<-DT[,list(basemean(x),basemean(y)),by=list(grp1,grp2)]) # 11s test(646, ans1, ans2) test(647, ans1, ans3) # this'll error with `valgrind` because of the 'long double' usage in gsumm.c (although I wonder if we need long double precision). # http://valgrind.org/docs/manual/manual-core.html#manual-core.limits # http://comments.gmane.org/gmane.comp.debugging.valgrind/10340 test(648, any(is.na(ans1$V1)) && !any(is.nan(ans1$V1))) # test 649 removed as compared 1.1s to 1.1s if (.devtesting) test(650, tt1["user.self"] < tt3["user.self"]) tt1 = system.time(ans1<-DT[,list(mean(x,na.rm=TRUE),mean(y,na.rm=TRUE)),by=list(grp1,grp2)]) # 2.0s tt2 = system.time(ans2<-DT[,list(mean.default(x,na.rm=TRUE),mean.default(y,na.rm=TRUE)),by=list(grp1,grp2)]) # 5.0s test(651, ans1, ans2) test(652, any(is.nan(ans1$V1))) if (.devtesting) test(653, tt1["user.self"] < tt2["user.self"]) # See FR#2067. Here we're just testing the optimization of mean and lapply, should be comparable to above tt2 = system.time(ans2<-DT[,lapply(.SD,mean,na.rm=TRUE),by=list(grp1,grp2)]) setnames(ans2,"x","V1") setnames(ans2,"y","V2") test(654, ans1, ans2) test(655, abs(tt1["user.self"] - tt2["user.self"])<2.0) # unoptimized tt2 takes 30 seconds rather than 2. The difference between tt1 and tt2 is under 0.2 seconds usually, so 2.0 is very large margin for error to ensure it's not 30secs. 
test(656, DT[,mean(x),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") test(657, DT[,list(mean(x)),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") test(658, DT[,list(mean(x),mean(y)),by=grp1,verbose=TRUE], output="GForce optimized j to.*gmean") tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. # Test .N for logical i subset DT = data.table(a=1:10, b=rnorm(10)) test(660, DT[a==8L, .N], 1L) # Test that growing is sensible in worst case DT = data.table(a=rep(1:10,1:10),b=rnorm(55)) tt = capture.output(DT[,sum(b)*b,by=a,verbose=TRUE]) test(661, length(grep("growing from",tt))<3) # was 6 when we simply grew enough for latest result # Test that adding a new logical column is supported, #2094 DT=data.table(a=1:3) test(662, DT[,newcol:=NA], data.table(a=1:3,newcol=NA)) test(663, sapply(DT,class), c(a="integer",newcol="logical")) # Test that setting names in the presence of dups is ok, #2103 DT = data.table(a=1:3, b=2:4, a=3:5) test(664, setnames(DT, c('d','e','f')), data.table(d=1:3,e=2:4,f=3:5)) # Test by=c(...) 
in combination with i subset, #2078 DT = data.table(a=1:3,b=1:6,key="a") test(665, DT[a<3,sum(b),by=c("a"),verbose=TRUE], DT[a<3,sum(b),by="a"], output="i clause present and columns used in by detected") test(666, DT[a<3,sum(b),by=key(DT),verbose=TRUE], DT[a<3,sum(b),by=a], output="i clause present and columns used in by detected") test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) should work') test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") test(670, DT[,z:=sum(y),keyby=x], ans) DT = data.table(x=1:2, y=1:6) test(671, DT[,z:=sum(y),keyby="x"], ans) DT = data.table(x=1:2, y=1:6) test(672, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning=":= keyby not straightforward character column names or list() of column names, treating as a by") DT = data.table(x=1:2, y=1:6) test(673, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) DT = data.table(x=1:2, y=1:6) test(674, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # Test new .() DT = data.table(x=1:2, y=1:6, key="x") test(675, DT[.(1L)], DT[1:3]) # Test new rbindlist l = list(data.table(a=1:2, b=7:8), data.table(a=3:4, 9:10), data.table(5:6, 11:12), data.table(b=13:14), list(15:16,17L), list(c(18,19),20:21)) test(676, rbindlist(l[1:3]), data.table(a=1:6,b=7:12)) test(677, rbindlist(l[c(10,1,10,2,10)]), data.table(a=1:4,b=7:10)) # NULL items ignored test(678, rbindlist(l[c(1,4)]), error="Item 2 has 1 columns, inconsistent with item 1 which has 2") test(679, rbindlist(l[c(1:2,5)]), error="Column 2 of item 3 is length 1, inconsistent with first column of that item which is length 2.") test(680, rbindlist(l[c(2,6)]), data.table(a=c(3,4,18,19), V2=c(9:10,20:21))) # coerces 18 
and 19 to numeric (with eddi's changes in commit 1012 - highest type is preserved now) --- Caught and changed by Arun on 26th Jan 2014 (in commit 1099). ### ----> Therefore this TO DO may not be necessary here anymore (added by Arun 26th Jan 2014) ---> # TO DO when options(datatable.pedantic=TRUE): test(680.5, rbindlist(l[c(2,6)]), warning="Column 1 of item 2 is type 'double', inconsistent with column 1 of item 1's type ('integer')") test(681, rbindlist(list(data.table(a=letters[1:2],b=c(1.2,1.3),c=1:2), list("c",1.4,3L), NULL, list(letters[4:6],c(1.5,1.6,1.7),4:6))), data.table(a=letters[1:6], b=seq(1.2,1.7,by=0.1), c=1:6)) test(682, rbindlist(NULL), data.table(NULL)) test(683, rbindlist(list()), data.table(NULL)) test(684, rbindlist(list(NULL)), data.table(NULL)) test(685, rbindlist(list(data.table(NULL))), data.table(NULL)) # Test merge when no overlap of data in by columns when all=TRUE, #2114 DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=FALSE) DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=FALSE) DT1=as.data.table(DF1) DT2=as.data.table(DF2) test(686, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE))) DF1=data.frame(foo=letters[1:5], bar=1:5, stringsAsFactors=TRUE) DF2=data.frame(foo=letters[6:10], baz=6:10, stringsAsFactors=TRUE) DT1=as.data.table(DF1) DT2=as.data.table(DF2) test(687, merge(DF1, DF2, by="foo", all=TRUE), as.data.frame(merge(DT1,DT2,by="foo",all=TRUE))) # And a more basic test that #2114 revealed that factor to factor join was leaving NA in the i # factor columns, caught in 1.8.1 beta before release to CRAN. 
DT = data.table(a=factor(letters[1:4]), b=5:8, key="a") test(688, DT[J(factor("b"))], data.table(a=factor("b"), b=6L, key="a")) # Test removing a column followed by adding a new column using := by group, #2117 DT = data.table(a=1:3,b=4:6) DT[,b:=NULL] test(689, DT[,b:=.N,by=a], data.table(a=1:3, b=1L)) test(690, DT[,c:=2,by=a], data.table(a=1:3, b=1L, c=2)) # Test combining i with by, with particular out of order circumstances, #2118 set.seed(1) DT=data.table(a=sample(1:5,20,replace=TRUE),b=1:4,c=1:10) test(691, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b]) test(692, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L))) test(693, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L))) setkey(DT,b) test(694, DT[a>2,sum(c),by=b], DT[a>2][,sum(c),by=b]) test(695, DT[a>2,sum(c),by=b%%2L], data.table(b=1:0,V1=c(34L,42L))) test(696, DT[a>2,sum(c),by=(b+1)%%2], data.table(b=c(0,1),V1=c(34L,42L))) # Test subset and %chin% crash with non-character input, #2131 test(697, 4 %chin% letters, error="type") test(698, 4L %chin% letters, error="type") test(699, "a" %chin% 4, error="type") DT = data.table(aa=1:6,bb=7:12) test(700, subset(DT,select="aa"), DT[,list(aa)]) test(701, subset(DT,select=aa), DT[,list(aa)]) test(702, subset(DT,select=c(aa)), DT[,list(aa)]) setkey(DT,aa) test(703, subset(DT,select="aa"), data.table(aa=1:6,key="aa")) test(704, subset(DT,select=aa), data.table(aa=1:6,key="aa")) test(705, subset(DT,select=c(aa)), data.table(aa=1:6,key="aa")) # Test rbinding of logical columns, #2133 DT1 = data.table(A=1:3,B=letters[1:3],C=c(TRUE,TRUE,FALSE)) DT2 = data.table(A=4:5,B=letters[4:5],C=c(TRUE,FALSE)) test(706, rbind(DT1,DT2), data.table(A=1:5, B=letters[1:5], C=c(TRUE,TRUE,FALSE,TRUE,FALSE))) test(707, rbindlist(list(DT1,DT2)), rbind(DT1,DT2)) # Test non ascii characters when passed as character by, #2134 # ***** # TO DO: reinstate. 
Temporarily removed to pass CRAN's Mac using C locale (R-Forge's Mac is ok) # ***** # Test := adding column after a setnames of all column names (which [,list(x)] does), #2146 DT = data.table(x=1:5)[,list(x)] test(713, DT[,y:=5], data.table(x=1:5,y=5)) setnames(DT,c("A","B")) test(714, DT[,z:=6:10], data.table(A=1:5,B=5,z=6:10)) # Test J alias is now removed outside DT[...] from v1.8.7 (to resolve rJava::J conflict) test(715, J(a=1:3,b=4), data.table(a=1:3,b=4), error="could not find function.*J") # Test get in j DT = data.table(a=1:3,b=4:6) test(716, DT[,get("b")], 4:6) # TO DO: add warning about inefficiency when datatable.pedantic=TRUE test(717, DT[,get("b"),verbose=TRUE], output="ansvars being set to all columns") # Test that j can be a logical index when `with=FALSE` (#1797) DT = data.table(a=1:10, b=rnorm(10), c=letters[1:10]) test(718, DT[, c(FALSE, TRUE, FALSE), with=FALSE], DT[, 2, with=FALSE]) test(719, nrow(DT[, c(FALSE, FALSE, FALSE), with=FALSE]), 0L) # Test combining join with missing groups with group by, #2162 DT = data.table(a = 1, b = 2, c = 4, key="a") test(720, DT[list(c(5,6,7)), .N, by=b], data.table(b=NA_real_,N=3L)) test(721, DT[list(c(5,6,7))][, .N, by=b], DT[list(c(5,6,7)), .N, by=b]) test(722, DT[list(c(5,6,7)), .N, by=b, mult="first"], data.table(b=NA_real_,N=3L)) test(723, DT[list(c(5,6,7)), .N, by=b, nomatch=0], data.table(b=numeric(),N=integer(),key="b")) # Key here is correct. 
by is ordered (albeit empty) test(724, DT[list(c(5,6,7)), .N, by=b, nomatch=0], DT[list(c(5,6,7)),nomatch=0][,.N,by=b]) # Splitting should always be consistent # another test linked from #2162 DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1L,3L,6L), v=1:9, key="x") test(725, DT[c("a","b","d"),list(v)], DT[J(c("a","b","d")),"v",with=FALSE]) # unfiled bug fix for NA matches; see NEWS 1.8.3 test(726, DT[c("a", "b", "d"), sum(v), by=y, nomatch=0], data.table(y=INT(1,3,6),V1=INT(5,7,9))) test(727, DT[c("a", "b", "d"), sum(v), by=y], data.table(y=INT(1,3,6,NA),V1=INT(5,7,9,NA))) test(728, DT[c("a", "b", "d"), sum(v), by=y], DT[J(c("a", "b", "d"))][, sum(v), by=y]) # explicit verbose=FALSE needed here because tests are run a second time with verbose=TRUE test(729.1, capture.output(DT[c("a", "b", "d"), print(.SD), by=.EACHI, verbose=FALSE]), capture.output(suppressWarnings(DT[c("a", "b", "d"), print(.SD), by=x, verbose=FALSE]))) test(729.2, capture.output(DT[c("a", "b"), print(.SD), by=y, verbose=FALSE]), # TO DO: why doesn't last group have x=d, maybe groups=i in dogroups capture.output(DT[c("a", "b"),verbose=FALSE][, print(.SD), by=y, verbose=FALSE])) test(729.3, DT[c("b","d"),.SD,by=.EACHI], data.table(x=c("b","b","b","d"),y=INT(1,3,6,NA),v=INT(4,5,6,NA))) # no debate here test(729.4, DT[c("b","d"),.SD, by=y], DT[c("b","d")][,.SD, by=y][4L,x:=NA_character_]) # the i groups when no match don't get carried through (would be hard to implement this and very unlikely to be useful. Just break into compound query, if needed to be used in j, to get them to carry through. TO DO: add to FAQ. 
# That unnamed i gets x's join column names when j is .SD (or any named list, which verbose warns is inefficient), #2281 test(729.5, DT[c("a","b"),.SD], data.table(x=rep(c("a","b"),each=3),y=INT(1,3,6),v=1:6,key="x")) # check := when combining join with missing groups and then group by test(730, DT[c("b","a"),w:=sum(v),by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # by over a different column than was joined to test(731, DT["d",w:=99,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group, before getting as far as type error test(732, DT["d",w:=99L,by=y]$w, INT(5,7,9,5,7,9,NA,NA,NA)) # do nothing for missing group test(733, DT[c("c","e","b"),w:=sum(v),by=y%%2L]$w, INT(5,7,9,24,24,15,24,24,15)) # Test column type change in the 0 row case (#2274) DT = data.table(a=1:3,b=4:6)[0] test(734, DT[,b:=as.character(b)], data.table(a=integer(),b=character())) test(735, DT[,c:=double()], data.table(a=integer(),b=character(),c=double())) # Deleting multiple columns out-of-order, #2223 DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21) test(736, DT[,c("b","d","g","f","c"):=NULL], data.table(a=1:3,e=13:15)) # test redundant with=FALSE is ok DT = data.table(a=1:3,b=4:6,c=7:9,d=10:12,e=13:15,f=16:18,g=19:21) test(737, DT[,c("b","d","g","f","c"):=NULL], data.table(a=1:3,e=13:15)) # with no longer needed # Mixing column adds and deletes in one := gave incorrect results, #2251. DT = data.table(c1=1:2) test(738, DT[,c("c2", "c1"):=list(c1+1L, NULL)], data.table(c2=2:3)) # `:=`(c1=v1,v2=v2,...) 
is now valid , #2254 DT = data.table( c1=1:3 ) test(739, DT[,`:=`(c2=4:6, c3=7:9)], data.table(c1=1:3,c2=4:6,c3=7:9)) test(740, DT[,`:=`(4:6,c3=7:9)], error="all arguments must be named") test(741, DT[,`:=`(4:6,7:9,10:12)], error="all arguments must be named") # test the same error message in the other branch # that out of bounds LHS is caught, root cause of #2254 test(742, DT[,3:6:=1L], error="outside.*range") test(743, DT[,2:3:=99L], data.table(c1=1:3,c2=99L,c3=99L)) test(744, DT[,(ncol(DT)+1):=1L], error="outside.*range") test(745, DT[,ncol(DT):=1L], data.table(c1=1:3,c2=99L,c3=1L)) # multiple LHS with by without by, #2215 DT = data.table(a=letters[c(1:3,3L)],key="a") test(746, DT["a",c("new1","new2"):=list(4L, 5L)], data.table(a=letters[c(1:3,3L)],new1=INT(4,NA,NA,NA),new2=INT(5,NA,NA,NA),key="a")) test(747, DT[,new1:=4:6], data.table(a=letters[c(1:3,3L)],new1=INT(4L,5L,6L,4L),new2=INT(5,NA,NA,NA),key="a"), warning="recycled leaving remainder of 1 item") suppressWarnings(DT[,new1:=4:6]) test(748, DT[c("c","b"),`:=`(new3=.N,new2=sum(new1)+1L),by=.EACHI], data.table(a=letters[c(1:3,3L)],new1=INT(4,5,6,4),new2=INT(5,6,11,11),new3=INT(NA,1,2,2),key="a")) # and multiple LHS by group, #1710 DT = data.table(a=rep(6:8,1:3),b=1:6) test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8,1:3),b=1:6,c=rep(1:3,1:3),d=INT(rep(c(1,5,15),1:3)),e=rep(6:8,1:3)*10L)) test(750, DT[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) # varname holding colnames, by group, linked from #2120. 
DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" test(751, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) test(752, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) test(753, setkey(DT[,c("x1","x2"):=x],a), data.table(a=letters[1:3],x=3:1,x1=3:1,x2=3:1,key="a")) DT = data.table(a=letters[3:1],x=1:3,y=4:6) test(754, setkey(DT[,c("x1","y1","x2","y2"):=list(x,y)],a), data.table(a=letters[1:3],x=3:1,y=6:4,x1=3:1,y1=6:4,x2=3:1,y2=6:4,key="a")) # And non-recycling i.e. that a single column copy does copy the column DT = data.table(a=1:3) test(754.1, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) test(754.2, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) test(754.3, DT[,a:=as.character(a),verbose=TRUE], output="Direct plonk.*no copy") RHS = as.integer(DT$a) test(754.4, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") test(755, DT[c("b","c"),sum(x),by=.EACHI], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # 'by' less than number of join columns # join then by when mult=="last"|"first", #2303 (crash in dev 1.8.3 only) DT = data.table(a=1:3,b=1:6,c=7:12,key="a") test(758, DT[J(c(1L,1L)),sum(c),by=b,mult="last"], DT[J(c(1L,1L)),mult="last"][,sum(c),by=b]) test(759, DT[J(1L),c,by=b,mult="last"], DT[J(1L),mult="last"][,c,by=b]) test(760, DT[2:5,sum(c),by=b], DT[2:5][,sum(c),by=b]) test(761, DT[2:5,sum(c),by=b%%2], DT[2:5][,sum(c),by=b%%2]) # 
joining from empty i table, #2194 DT = data.table(a=1:3,b=4:6,key="a") test(762, DT[J(integer()),b,by=.EACHI], data.table(a=integer(),b=integer(),key="a")) test(763, DT[J(integer()),1L,by=b], data.table(b=integer(),V1=integer(),key="b")) # ordered by is detected now (empty is ordered), otherwise a join to the result would fail just because it's empty which wouldn't be consistent with non empty case test(764, DT[J(integer()),b,mult="last"], integer()) test(765, DT[J(2L),b,mult="last"], 5L) test(766, DT[J(5L),b,nomatch=0,by=.EACHI], data.table(a=integer(),b=integer(),key="a")) test(767, DT[J(5:6),b,nomatch=0,by=.EACHI], data.table(a=integer(),b=integer(),key="a")) # Crash on by-without-by with mixed type non join i columns, #2314. Despite not being used by j they were still being assigned to .BY. DT = data.table(iris,key="Species") Y = data.table(date=as.POSIXct("2011-01-01"),num=as.numeric(1:26)) Y[,get("letters"):=LETTERS] Y[,A:=1:26] Y[,p:=factor(p)] # coerce type to match DT$Species to save warning. Crash was related to .BY internally, not the coercion. setkey(Y,p) for (i in 1:10){DT[Y,Petal.Width];DT[Y];NULL} # reliable crash in 1.8.2 (tested). test(768, DT[Y,Petal.Width,by=.EACHI], data.table(Species=factor(LETTERS),Petal.Width=NA_real_,key="Species")) DT = data.table(a=1:3,b=1:6,c=7:12, key="a") test(769, DT[,.BY[[1]]==a,by=a], data.table(a=1:3,V1=TRUE,key="a")) test(770, DT[J(2:3),.BY[[1]]==b,by=.EACHI], data.table(a=INT(2,2,3,3),V1=c(TRUE,FALSE),key="a")) # A data.table RHS of := caused a crash, #2311. 
a = data.table(first=1:6, third=c(1,1,1,3,3,4), key="first") b = data.table(first=c(3,4,4,5,6,7,8), second=1:7, key="first") test(771, b[,third:=a[b,third,by=.EACHI]], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") test(772, copy(b)[,third:=as.list(a[b,third,by=.EACHI])], b, warning="Supplied 2 items.*to 7.*recycled leaving remainder of 1 item") test(773, b[4,third[[1]]], c(1,3,3,3,4,NA,NA)) test(774.1, b[,third:=a[b,third,mult="first"]], ans<-data.table(first=c(3,4,4,5,6,7,8), second=1:7, third=c(1,3,3,3,4,NA,NA), key="first")) test(774.2, b[,third:=a[b,third]], ans) # mult="first" no longer needed as from v1.9.3. It now does what was naturally expected. # That names are dropped. (Names on the column vectors don't display. They increase size and aren't much use.) DT = data.table(a=1:3,b=LETTERS[1:3]) map = c("A"="Foo",B="Bar",C="Baz") DT[,b:=map[b]] test(775, names(DT$b), NULL) # Test that names of named vectors don't carry through, #2307. DT = data.table(a=1:3,b=c("a"="a","b"="a","c"="b")) test(776, names(DT$b), NULL) # From v1.8.11, data.table() drops vector names DT = data.table(a=1:3,b=c("a","a","b")) setattr(DT$b, "names", c("a","b","c")) # Force names in there to test #2307 test(777, names(DT$b), c("a","b","c")) test(778, DT[,sum(a),by=b], data.table(b=c("a","b"),V1=c(3L,3L))) #2307 retained names length 3 on the length 2 vector result causing it not to print. 
test(779, print(DT[,sum(a),by=b]), output=" b V11: a 32: b 3$") # Test new .GRP binding test(780, data.table(a=1:3,b=1:6)[,i:=.GRP,by=a][,i2:=.GRP], data.table(a=1:3,b=1:6,i=rep(1:3,2),i2=1L)) # Test new .I binding DT = data.table(a=1:4,b=1:8) test(781, DT[,.I,by=a]$I, INT(1,5,2,6,3,7,4,8)) test(782, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=5:8)) setkey(DT,a) test(783, DT[,.I,by=a]$I, 1:8) test(784, DT[,.I[which.max(b)],by=a], data.table(a=1:4,V1=INT(2,4,6,8),key="a")) test(785, DT[J(2:4),.I,by=a%%2L], data.table(a=rep(0:1,c(4,2)),I=INT(3,4,7,8,5,6))) test(786, DT[J(c(3,2,4)),list(.I,.GRP),by=.EACHI], data.table(a=rep(c(3L,2L,4L),each=2),I=INT(5,6,3,4,7,8),GRP=rep(1:3,each=2L))) test(787, DT[J(3:2),`:=`(i=.I,grp=.GRP),by=.EACHI][,list(i,grp)], data.table(i=INT(NA,NA,3:6,NA,NA),grp=INT(NA,NA,2,2,1,1,NA,NA))) # New not-join (a.k.a. not-select, since not just for data.table i but integer, logical and character too) DT = data.table(A=rep(1:3,each=2),B=1:6,key="A") test(788, DT[!J(2)], data.table(A=c(1L,1L,3L,3L),B=c(1L,2L,5L,6L),key="A")) test(789, DT[!(2:6)], DT[1]) test(790, DT[!(2:6)], DT[!2:6]) # nicer than DT[-2:6] applying - to 2 first test(791, DT[!6], DT[1:5]) test(792.1, DT[!rep(c(TRUE,FALSE),length=.N)], DT[rep(c(FALSE,TRUE),length=.N)]) test(792.2, DT[!A>=2], DT[A<2]) test(793, setkey(DT[,A:=letters[A]],A)[!c("b","c")], DT["a"]) test(794, DT[!"b"], DT[c("a","c")]) test(795, DT[!0], DT) test(796, DT[!NULL], DT[NULL]) test(797, DT[!integer()], DT) test(798, DT[!-1], DT[1]) test(799, DT[--1], DT[1]) myi = c("a","c") test(800, DT[!myi], DT["b"]) test(801, DT[!"c",sum(B),by=A], data.table(A=c("a","b"),V1=c(3L,7L),key="A")) test(802, DT[!"missing",sum(B),by=A], DT[,sum(B),by=A]) test(803, DT[!c("a","missing","b","missing2"),sum(B),by=A], DT["c",sum(B),by=.EACHI]) # Combining not-join with which test(804, DT[!"b",which=TRUE], INT(1:2,5:6)) # row numbers in DT that don't match # New which=NA value test(805, DT[c("b","foo","c"),which=NA], 2L) # row numbers in 
i that don't match test(806, DT[!c("b","foo","c"),which=NA], c(1L,3L)) # row numbers in i that do match test(807, DT[!c("b","foo","c"),nomatch=0], error="not-join.*prefix is present on i.*Please remove nomatch") test(808, DT[c("b","foo","c"),which=TRUE,nomatch=NA], INT(3:4,NA,5:6)) test(809, DT[c("b","foo","c"),which=TRUE,nomatch=0], INT(3:4,5:6)) test(810, DT[c("b","foo","c"),which=NA,nomatch=NA], 2L) test(811, DT[c("b","foo","c"),which=NA,nomatch=0], error="which=NA with nomatch=0 would always return an empty vector[.] Please change or remove either which or nomatch") # New notj for column names and positions when with=FALSE, #1384 DT = data.table(a=1:3,b=4:6,c=7:9) test(812, DT[,!"b",with=FALSE], DT[,-match("b",names(DT)),with=FALSE]) test(813, DT[,"foo",with=FALSE], error="column(s) not found: foo") test(814, DT[,!"foo",with=FALSE], DT, warning="column(s) not removed because not found: foo") test(815, DT[,!c("b","foo"),with=FALSE], DT[,list(a,c)], warning="column(s) not removed because not found: foo") test(816, DT[,!2:3,with=FALSE], DT[,-(2:3),with=FALSE]) # for consistency, but ! is really for character column names mycols = "b" test(817, DT[,!mycols,with=FALSE], DT[,list(a,c)]) mycols = 2 test(818, DT[,!mycols,with=FALSE], DT[,list(a,c)]) # Test X[Y] slowdown, #2216 X = CJ(a=seq_len(1e3),b=seq_len(1e3)) Y = copy(X) X[4,b:=3L] # create a dup group, to force allLen1=FALSE setkey(X) test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 5) # Many minutes in 1.8.2! Now well under 1s, but 5s for wide tolerance for CRAN. We like CRAN to tell us if any changes # in R or elsewhere cause the 2 minute bug to return. 
Hence not excluded by an if(.devtesting) test(820, system.time(X[Y,mult="first"])["user.self"] < 5) # Optimization of lapply(,"+"), #2212 DT = data.table(a=rep(1:3,each=2L),b=1:6,c=7:12) ans = data.table(a=rep(1:3,each=2L),b=INT(2,3,5,6,8,9),c=INT(8,9,11,12,14,15)) test(821, DT[,lapply(.SD, "+", a), by=a], ans) test(822, DT[,lapply(.SD, `+`, a), by=a], ans) ans = data.table(a=1:3,b=INT(4,9,14),c=INT(16,21,26)) test(823, DT[,lapply(.SD, "sum", a), by=a], ans) test(824, DT[,lapply(.SD, sum, a), by=a], ans) test(825, DT[,lapply(.SD, `sum`, a), by=a], ans) DT[2,b:=NA_integer_] test(825.1, DT[,lapply(.SD, function(x)sum(x)), by=a], data.table(a=1:3,b=INT(NA,7,11),c=INT(15,19,23))) test(825.2, DT[,lapply(.SD,function(x,...)sum(x,...),na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23))) test(825.3, DT[,lapply(.SD,sum,na.rm=TRUE),by=a], data.table(a=1:3,b=INT(1,7,11),c=INT(15,19,23))) # Test illegal names in merge are ok and setcolorder length error, #2193i and #2090 DT1 = data.table(a=letters[1:5], "Illegal(name%)"=1:5, key="a") DT2 = data.table(a=letters[1:5], b=6L, key="a") test(826, merge(DT1,DT2), cbind(DT1,b=6L)) test(827, merge(DT2,DT1), cbind(DT2,"Illegal(name%)"=1:5)) a=data.table('User ID'=c(1,2,3), 'Blah Blah'=c(1,2,3), key='User ID') #2090's test b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID') test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda")) # setcolorder and merge check for dup column names, #2193(ii) setnames(DT2,"b","a") test(828, setcolorder(DT2,c("a","b")), error="x has some duplicated column name(s): a. Please remove or rename") test(829, merge(DT1,DT2), error="y has some duplicated column name(s): a. Please remove or rename") test(830, merge(DT2,DT1), error="x has some duplicated column name(s): a. 
Please remove or rename") # attribs such as "comments" should be retained, #2270 DT1 <- data.table(id = seq.int(1, 10), A = LETTERS[1:10], key = "id") comment(DT1$A) <- "first comment" # copies, setattr would be better as on next line DT2 <- data.table(id = seq.int(2, 10, 2), b = letters[1:5], key = "id") setattr(DT2$b,"comment","second comment") test(831, comment(DT1[DT2]$A), "first comment") test(832, comment(DT2[DT1]$b), "second comment") test(833, sapply(merge(DT1,DT2),comment), list(id=NULL, A="first comment", b="second comment")) test(834, comment(DT1[2:3]$A), "first comment") # Test that matrix RHS of := is caught, #2333 DT = data.table(a=1:3) DT[,a:=scale(a)] # 1 column matrix auto treated as vector test(835, na.omit(DT), DT) test(836, DT[,a:=as.integer(a)], data.table(a=INT(-1,0,1))) test(837, DT[,a:=cbind(1,2)], data.table(a=c(1L,2L,1L)), warning="2 column matrix RHS of := will be treated as one vector") DT = data.table(a=1:3,b=1:6) test(838, DT[,c:=scale(b), by=a][,c:=as.integer(1000*c)], data.table(a=1:3,b=1:6,c=rep(as.integer(1000*scale(1:2)), each=3))) # Test data.table's last(). (last is used internally in data.table, too). test(839, last(1:10), 10L) # If xts is loaded, this'll just test xts's last. Ok as they're consistent, for vectors. DT = data.table(a=1:3,b=4:6) test(840, last(DT), DT[3L]) # xts's last returns a one row data.table ok. So this test is ok too, whether or not xts is loaded. # But not true when DT is a one column data.table/data.frame, see below. if ("package:xts" %in% search()) { # e.g. when run via R CMD check x = xts(1:100, Sys.Date()+1:100) test(841, last(x,10), x[91:100,]) # The important thing this tests is that data.table's last() dispatches to xts's method when data.table is loaded above xts. # But that isn't tested by R CMD check because xts is loaded above data.table, there. 
# So to make this test relevant, in a fresh R session type: "require(xts);require(data.table);test.data.table()" # rather than: "require(data.table);require(xts);test.data.table()" # Which was the main thrust of bug#2312 fixed in v1.8.3 } else { cat("Test 841 not run. If required call library(xts) first.\n") # So these won't run from R CMD check (deliberately, for now) ... ans = if ("package:gdata" %in% search()) list(89) else 89 test(842, last(list("a",1:2,89)), ans) # xts's last and gdata::last returns a one item list here. Would prefer it to return the item itself. DT = data.table(a=1:3) test(842.1, last(DT), DT[3L]) # xts's last returns a 3L atomic here for 1 column data.frame, strangely. We wish for the last row, consistently. I tried # providing a last.data.table method and using Enhances and Imports in DESCRIPTION with import() and S3method() in # NAMESPACE but nothing I tried made last.data.table available to xts's last if xts was loaded above data.table (which was # frustrating to test as well, see comment to test 839 above). 
} # Test L[[1L]][,:=] updates by reference, #2204 l = list(data.table(a=1:3), data.table(b=4:6)) test(843, l[[2L]][,c:=7:9], data.table(b=4:6,c=7:9)) test(844, l, list(data.table(a=1:3), data.table(b=4:6,c=7:9))) names(l) = c("foo","bar") # R >= 3.1 no longer copies all the contents, yay test(845, l[["foo"]][2,d:=4L], data.table(a=1:3,d=c(NA,4L,NA)), warning= if (!.R.assignNamesCopiesAll) NULL else "Invalid .internal.selfref detected and fixed") l = list(data.table(a=1:3), data.table(b=4:6)) setattr(l,"names",c("foo","bar")) test(846, l[["foo"]][2,d:=4], data.table(a=1:3,d=c(NA,4,NA))) test(847, l, list(foo=data.table(a=1:3,d=c(NA,4,NA)), bar=data.table(b=4:6))) old = options(datatable.alloccol=0L) l = list(foo=data.table(a=1:3,b=4:6),bar=data.table(c=7:9,d=10:12)) # list() doesn't copy the NAMED==0 objects here test(848, truelength(l[[1L]]), 2L) test(849, {l[[1L]][,e:=13:15]; l[[1L]]}, data.table(a=1:3,b=4:6)[,e:=13:15]) test(850, truelength(l[[1L]]), 3L) test(851, truelength(l[[2L]]), 2L) options(datatable.alloccol=1L) l[["bar"]][,f:=16:18] test(852, truelength(l[[2L]]), 4L) options(old) # Now create the list from named objected DT1 = data.table(a=1:3, b=4:6) DT2 = data.table(c=7:9) l = list(DT1, DT2) if (!.R.listCopiesNamed) { # From R>=3.1, list() no longer copies NAMED inputs (a very welcome change in Rdevel, r63767) test(853, address(DT1) == address(l[[1L]])) w = NULL } else { test(853, address(DT1) != address(l[[1L]])) w = "Invalid .internal.selfref detected and fixed.*R's list() used to copy named objects" } test(854, l[[1]][,d:=10:12], data.table(a=1:3,b=4:6,d=10:12), warning = w) test(855, l[[1]], data.table(a=1:3,b=4:6,d=10:12)) # Test setnames on data.frame, #2273. 
DF = data.frame(foo=1:2,bar=3:4) setnames(DF,c("baz","qux")) test(856, DF, data.frame(baz=1:2,qux=3:4)) test(857.1, set(DF,NULL,"quux",5:6), error="set() on a data.frame is for changing existing columns, not adding new ones") test(857.2, set(DF,NULL,3L,5:6), error="set() on a data.frame is for changing existing columns, not adding new ones") test(858.1, set(DF,NULL,"qux",5:6), data.frame(baz=1:2, qux=5:6)) test(858.2, set(DF,NULL,2L,7:8), data.frame(baz=1:2, qux=7:8)) # Test DT[J(data.frame())], #2265 DT = data.table(foo=c(1,2,3), bar=c(1.1,2.2,3.3), key="foo") i = data.frame(foo=1) test(859, DT[i], DT[J(i)]) test(860, DT[i], DT[data.table(i)]) # test no memory leak, #2191 and #2284 # These take a few seconds each, and it's important to run these on CRAN to check no leak gc(); before = gc()["Vcells","(Mb)"] for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB gc(); after = gc()["Vcells","(Mb)"] test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin gc(); before = gc()["Vcells","(Mb)"] DF = data.frame(x=1:20, y=runif(20)) for (i in 1:2000) { DT = as.data.table(DF); rm(DT) } gc(); after = gc()["Vcells","(Mb)"] test(862, after < before+0.5) gc(); before = gc()["Vcells","(Mb)"] DT = data.table(x=1:20, y=runif(20)) for (i in 1:2000) { x <- DT[1:5,]; rm(x) } gc(); after = gc()["Vcells","(Mb)"] test(863, after < before+0.5) # rbindlist should look for the first non-empty data.table - New changes (from Arun). Explanation below: # Even if data.table is empty, as long as there are column names, they should be considered. # Ex: What if all data.tables are empty? What'll be the column name then? # If there are no names, then the first non-empty set of names will be allocated. I think this is the way to do it.. TODO: Should write to Matt about it. 
test(864.1, rbindlist(list(data.table(foo=logical(0),bar=logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))), setnames(DT, c("foo", "bar"))) test(864.2, rbindlist(list(list(logical(0),logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))), DT) test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz=letters[1:3],qux=4:6))), setnames(DT, c("V1", "V2"))) # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) test(865, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="GForce optimized.*gsum(v), gsum(w)") # v1.9.7 treats wrapped {} better, so this is now optimized test(866, names(ans1), c("a","b","name1","name2")) test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") test(869, DT[J(2,42,84),print(.SD),by=.EACHI], output=" b1: 22: 5.*Empty data.table (0 rows) of 3 cols: a,V2,V3") # Test setnames with duplicate colnames DT = data.table(a=1:3,b=4:6,b=7:9) test(870, setnames(DT,"b","foo"), error="Some items of 'old' are duplicated (ambiguous) in column names: b") test(871, setnames(DT,c("bar","bar"),c("x","y")), error="Some duplicates exist in 'old': bar") test(872, setnames(DT,3,"c"), data.table(a=1:3,b=4:6,c=7:9)) test(873, setnames(DT,"foo","bar"), error="Items of 'old' not found in column names: foo") test(874, setnames(DT,c(1,1),c("foo","bar")), error="Some duplicates exist in 'old': 1") test(875, setnames(DT,"c","b"), data.table(a=1:3,b=4:6,b=7:9)) test(875.1, setnames(DT,"a","c"), data.table(c=1:3,b=4:6,b=7:9)) # 'a' isn't duplicated so not a problem as from v1.8.11 test(875.2, setnames(DT,c("c","b"), c("C","B")), error="Some items of 'old' are duplicated (ambiguous) in column names: b") # check error msg when 2nd one in old is the problem # Test local var problem introduced in 
v1.8.3 DT = data.table(a=1:3,b=1:6) f = function() { localvar = 2 print(DT[a>localvar]) print(DT[a>localvar,sum(b)]) print(DT[a>localvar,sum(b),by=a]) # bug fix 2368 } test(876, f(), output=" a b1: 3 32: 3 6.*[1] 9.* a V11: 3 9") # segfault when assigning NA names, #2393 DT = data.table(a=1:3, b=4:6) test(877, setnames(DT, c(NA, NA)), error="Passed a vector of type 'logical'. Needs to be type 'character'") # test no warning when use.names explicitly set, #2385 - changed 'warning' to 'message' as we just check if usenames is missing, due to C-level changes. # commented the message for now until confirmation with Matt. test(878, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6)), data.table(a=1:6,b=4:9)) #, message="Columns will be bound by name for consistency with base") test(879, rbind(data.table(a=1:3,b=4:6), data.table(b=7:9,a=4:6), use.names=TRUE), data.table(a=1:6,b=4:9)) # Test fread() n=110 # 110 just to be over the 100 limit for printing head, as a convenience DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(1:1000,n,replace=TRUE)-500L, c=rnorm(n), d=sample(c("foo","bar","baz","qux","quux"),n,replace=TRUE), e=rnorm(n), f=sample(1:1000,n,replace=TRUE) ) DT[2,b:=NA_integer_] DT[4,c:=NA_real_] DT[3,d:=NA_character_] DT[5,d:=""] DT[2,e:=+Inf] DT[3,e:=-Inf] DT[4,e:=NaN] # write.table writes NaN as NA, though, and all.equal considers NaN==NA. fread would read NaN as NaN if "NaN" was in file write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(880, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(881, fread(f), DT) # test that columns are not coerced if nastring=NULL DT[3,d:="NA"] test(882, fread(f,na.strings=NULL)[['d']], DT[['d']]) DT[3,d:=NA_character_] unlink(f) write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=TRUE) test(883, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(884, fread(f), DT) unlink(f) # Test short files. 
# All the unlinks and using a new file each time are to work around apparent Windows issues it seems when writing, appending # rereading (possibly via the MapViewOfFile) the same file that has just been appended to. These apparent issues have only # showed up on winbuilder so far, so might be in combination with the D: tempdir() there; perhaps D: is on a network drive or something. cat("",file=f<-tempfile()); test(885, fread(f), error="empty"); unlink(f) test(885.1, fread(""), error="empty") test(886, fread("\n"), error="empty") test(887, fread(" \n\t \t \n \n "), error="empty") cat("A", file=f<-tempfile()); test(888, fread(f), data.table(A=logical())); unlink(f) test(889, fread("A\n"), data.table(A=logical())) cat("AB,CDE",file=f<-tempfile()); test(890, fread(f), data.table(AB=logical(),CDE=logical())); unlink(f) test(891, fread("AB,CDE\n"), data.table(AB=logical(),CDE=logical())) cat("3.14",file=f<-tempfile()); test(892, fread(f), data.table(V1=3.14)); unlink(f) cat("A,3",file=f<-tempfile()); test(893, fread(f), data.table(V1="A",V2=3L)); unlink(f) if (.Platform$OS.type=="unix") test(893.5, fread("A,B\r\n\r\n"), data.table(A=logical(),B=logical())) for (nc in c(0,1,2)) { # 0 means all cols here for (nr in c(0,1,2,3,5,10,18,19,20,21,22,28,29,30,31,32,38,39,40,41,42)) { # 30 and 40 are trigger points for auto skip for (eol in if (.Platform$OS.type=="unix") c("\n","\r\n") else "\n") { headDT = head(DT,nr)[,seq_len(if (nc==0) ncol(DT) else nc),with=FALSE] if (nr==0) for (j in seq_len(ncol(headDT))) set(headDT,j=j,value=logical()) # when read back in empty cols are the lowest type (logical) f = tempfile() cat(names(headDT),sep=",",file=f) # no \n at the end here for (i in seq_len(nr)) { cat(eol,file=f,append=TRUE) # on unix we simulate windows too. 
on windows \n will write \r\n (and \r\n will write \r\r\n) write.table(headDT[i],file=f,quote=FALSE,sep=",",eol="",row.names=FALSE,col.names=FALSE,append=TRUE) # loop approach is to get no \n after last line } test(894+nr/100+nc/1000, fread(f), headDT) file.copy(f,f2<-tempfile()); unlink(f) # again trying to work around apparent issue on Windows cat(eol,file=f2,append=TRUE) # now a 'normal' file ending with \n test(895+nr/100+nc/1000, fread(f2), headDT) file.copy(f2,f3<-tempfile()); unlink(f2) cat(eol,file=f3,append=TRUE) # extra \n should be ignored test(896+nr/100+nc/1000, fread(f3), headDT) unlink(f) unlink(f2) unlink(f3) }}} if ("package:bit64" %in% search()) { n = 2000 DT = data.table( a=sample(1:1000,n,replace=TRUE), b=sample(as.integer64(2)^35 * 1:10, n, replace=TRUE), c=sample(c("foo","bar","baz"),n,replace=TRUE) ) write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) # Test all mid read bump coercions DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] DT[,r:=a/100][,r2:=gsub(" ","",format(r))] DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 DT[113, a3:=3.14] DT[114, a4:="123A"] DT[115, b2:=1234567890123.45] DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) DT[117, r2:="3.14A"] write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(899, fread(f), DT, warning="Bumped column.*to type character.*may not be lossless") unlink(f) } else { cat("Tests 897-899 not run. If required call library(bit64) first.\n") } # getwd() has been set by test.data.table() to the location of this tests.Rraw file. Test files should be in the same directory. 
f = "ch11b.dat" # http://www.stats.ox.ac.uk/pub/datasets/csb/ch11b.dat test(900, fread(f), as.data.table(read.table(f))) f = "1206FUT.txt" # a CRLF line ending file (DOS) test(901.1, DT<-fread(f,strip.white=FALSE), setDT(read.table(f,sep="\t",header=TRUE,colClasses=as.vector(sapply(DT,class))))) test(901.2, DT<-fread(f), setDT(read.table(f,sep="\t",header=TRUE,colClasses=as.vector(sapply(DT,class)),strip.white=TRUE))) # Test the coerce of column 23 to character on line 179 due to the 'A' for the first time. # As from v1.9.8 the columns are guessed better and there is no longer a warning. Test 899 tests the warning. f = "2008head.csv" test(902, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(903, fread("A,B\n1,3,foo,5\n2,4,barbaz,6"), data.table(1:2,3:4,c("foo","barbaz"),5:6), warning="Starting data input on line 2 and discarding line 1 because.*: A,B") # invalid colnames (too short) test(904, fread("A,B,C,D\n1,3,foo,5\n2,4,barbaz,6"), DT<-data.table(A=1:2,B=3:4,C=c("foo","barbaz"),D=5:6)) # ok test(905, fread('A,B,C,D\n1,3,foo,5\n2,4,"barbaz",6'), DT) test(906, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,r,baz",6'), DT[2,C:="ba,r,baz"]) test(907, fread('A,B,C,D\n1,3,foo,5\n2,4,"ba,\\"r,baz",6'), DT[2,C:='ba,\\"r,baz']) # \" protected ok, but \ needs taking off too (TO DO) test(908, fread("A,B,C\n1,3,\n2,4,\n"), data.table(A=1:2,B=3:4,C=NA)) # where NA is type logical test(909, fread(" Date and Time,Open,High,Low,Close,Volume 2007/01/01 22:51:00,5683,5683,5673,5673,64 2007/01/01 22:52:00,5675,5676,5674,5674,17 2007/01/01 22:53:00,5674,5674,5673,5674,42 ")$Open, c(5683L,5675L,5674L)) # , is higher than ' ' in the hierarchy of separators, so ',' is auto detected here. 
# blanks when testing if header row is all character test(910, fread(" 02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60 02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60 02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60 ")$V13, rep(2865.60,3)) test(911, fread("02-FEB-2009,09:55:04:962,26022009,2500,PE,36,500,44,200,11850,1100,,2865.60 02-FEB-2009,09:55:04:987,26022009,2800,PE,108.75,200,111,50,11700,1450,,2865.60 02-FEB-2009,09:55:04:939,26022009,3100,CE,31.1,3000,36.55,200,3500,5250,,2865.60")$V13, rep(2865.60,3)) # Check manually setting separator txt = "A;B;C|D,E\n1;3;4|5,6\n2;4;6|8,10\n" test(912, names(fread(txt)), c("A;B;C|D","E")) test(913, fread(txt,sep=";"), data.table(A=1:2,B=3:4,"C|D,E"=c("4|5,6","6|8,10"))) test(914, fread(txt,sep="*"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10"))) test(915, fread(txt,sep="\n"), data.table("A;B;C|D,E"=c("1;3;4|5,6","2;4;6|8,10"))) # like a fast readLines # Crash bug when RHS is 0 length and := by group, fixed in 1.8.7 DT = data.table(a=1:3,b=1:6) test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts') test(917, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key='state,uid,ts')) # NA in join column of type double, #2453. 
X = data.table(name=c("Joh","Raf","Jon","Ste","Rob","Smi"),depID=c(NA,31,33,33,34,34),key="depID") Y = data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID") test(918, Y[X], data.table(depID=c(NA,31,33,33,34,34),depName=c(NA,"Sal","Eng","Eng","Cle","Cle"),name=c("Joh","Raf","Jon","Ste","Rob","Smi"),key='depID')) # Y[X] same as merge.data.frame(X,Y,all.x=TRUE) test(919, X[Y], data.table(name=c("Raf","Jon","Ste","Rob","Smi",NA), depID=c(31,33,33,34,34,35), depName=c("Sal","Eng","Eng","Cle","Cle","Mar"),key='depID')) test(920, X[Y,nomatch=0], data.table(name=c("Raf","Jon","Ste","Rob","Smi"),depID=c(31,33,33,34,34),depName=c("Sal","Eng","Eng","Cle","Cle"),key='depID')) test(921, Y[X,nomatch=0], data.table(depID=c(31,33,33,34,34),depName=c("Sal","Eng","Eng","Cle","Cle"),name=c("Raf","Jon","Ste","Rob","Smi"),key='depID')) # setnames bug on keyed table, when full vector is given and target key isn't the positions in columns 1:length(key) DT = data.table(a=1:2,b=3:4,c=5:6,key="b") test(922, setnames(DT,c("A","B","C")), data.table(A=1:2,B=3:4,C=5:6,key="B")) # vecseq overflow, crash bug #2464 DT = data.table(x=rep(1L,50000),key="x") test(923, DT[DT], error="Join results in more than 2^31 rows (internal vecseq reached physical limit). Very likely misspecified join.") X = data.table(x=1:2,y=1:6,key="x") test(924.1, X[J(c(1,1,1))], X[rep(1:3,3)]) test(924.2, X[J(c(1,1,1,1))], error="Join results in 12 rows; more than 10 = nrow(x)[+]nrow(i). 
Check for duplicate key values in i each of") # sorting of 'double' columns not correct for ties (tolerance nuance in C code), #2484 DT = data.table(X=as.POSIXct( c(rep("15DEC2008:00:00:00",10),"15DEC2008:00:00:00",rep("17DEC2008:00:00:00",2)),format="%d%b%Y:%H:%M:%S"),Y=c(1534,61,74,518,519,1519,1520,1524,3127,29250,30609,43,7853)) setkey(DT,X,Y) test(925, DT[,base::order(X,Y)], 1:nrow(DT)) # Test new dogroup warning for zero length columns in result when other columns are >1, #2478 DT = data.table(a=1:3,b=1:6) test(926, DT[, if(a==2L) list(42:43,NULL) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the") test(927, DT[, if(a==2L) list(42:43,numeric()) else list(42L,3.14), by=a], data.table(a=INT(1,2,2,3),V1=INT(42,42,43,42),V2=c(3.14,NA,NA,3.14)), warning="Item 2 of j's result for group 2 is zero length. This will be filled with 2 NAs to match the") # And the root cause of #2478: that cbind(DT,1:3) created invalid data.table with empty column test(928, cbind(data.table(a=1L),b=1:3), data.table(a=1L,b=1:3)) # FR #4813 implementation resulted in changing 929 error to warning # test(929, cbind(data.table(a=1L,b=2:3),c=1:3), error="argument 1 (nrow 2) cannot be recycled without remainder to match longest nrow (3)") test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 is of size 2 but maximum size is 3") test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) DT = data.table(x=c(1,1,1,1,2,2,3),y=c(1,1,2,3,1,1,2)) DT[,rep:=1L][c(2,7),rep:=c(2L,3L)] # duplicate row 2 and triple row 7 DT[,num:=1:.N] # to group each row by itself test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3))) # New roll=+/- and rollends DT = 
data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b") test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v], INT(1,1,6,7)) test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v], INT(1,2,7,7)) test(934, DT[J(c(0,2,6,8)), roll=+Inf, v], INT(NA,1,6,7)) test(935, DT[J(c(0,2,6,8)), roll=-Inf, v], INT(1,2,7,NA)) test(936, DT[J(c(-10,-1,2,12,13)), roll=5, rollends=TRUE, v], INT(NA,1,1,7,NA)) test(937, DT[J(c(-10,-1,2,12,13)), roll=-5, rollends=TRUE, v], INT(NA,1,2,7,NA)) test(938, DT[J(c(-10,2,6,7,8)), roll="nearest", v], INT(1,1,7,7,7)) test(939, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(TRUE,FALSE), v], INT(1,1,7,7,NA)) test(940, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=c(FALSE,TRUE), v], INT(NA,1,7,7,7)) test(941, DT[J(c(-10,2,6,7,8)), roll="nearest", rollends=FALSE, v], INT(NA,1,7,7,NA)) # merge all=TRUE with space in a y column name, #2555 X = data.table(a=1:3,b=4:6) Y = data.table(a=2:4,"d 1"=5:7) # space in Y's column name test(942, merge(X,Y,all=TRUE,by="a"), data.table(a=1:4,b=INT(4:6,NA),"d 1"=INT(NA,5:7),key="a")) test(943, merge(X,Y,all.y=TRUE,by="a"), data.table(a=2:4,b=INT(5:6,NA),"d 1"=5:7,key="a")) # Test error message say NULL rather than empty table DT = data.table(NULL) test(944, DT[,a:=1L], error = "Cannot use := to add columns to a null data.table.*You can use") DT = data.table(a=numeric()) test(945, DT[,b:=a+1], data.table(a=numeric(),b=numeric())) # fread blank column names get default names test(946, fread('A,B,,D\n1,3,foo,5\n2,4,bar,6\n'), data.table(A=1:2,B=3:4,c("foo","bar"),D=5:6)) test(947, fread('0,2,,4\n1,3,foo,5\n2,4,bar,6\n'), data.table(0:2,2:4,c("","foo","bar"),4:6)) test(948, fread('A,B,C\nD,E,F\n',header=TRUE), data.table(A="D",B="E",C=FALSE)) test(949, fread('A,B,\nD,E,F\n',header=TRUE), data.table(A="D",B="E",V3=FALSE)) # +/- with no numbers afterwards should read as character test(950, fread('A,B,C\n1,+,4\n2,-,5\n3,-,6\n'), data.table(A=1:3,B=c("+","-","-"),C=4:6)) # catching misuse of `:=` x = 
data.table(a=1:5) test(951, x[,{b=a+3; `:=`(c=b)}], error="defined for use in j, once only and in particular ways") # fread colClasses input = 'A,B,C\n01,foo,3.140\n002,bar,6.28000\n' test(952, fread(input, colClasses=c(C="character")), data.table(A=1:2,B=c("foo","bar"),C=c("3.140","6.28000"))) test(953, fread(input, colClasses=c(C="character",A="numeric")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(954, fread(input, colClasses=c(C="character",A="double")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(955, fread(input, colClasses=list(character="C",double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(956, fread(input, colClasses=list(character=2:3,double="A")), data.table(A=c(1.0,2.0),B=c("foo","bar"),C=c("3.140","6.28000"))) test(957, fread(input, colClasses=list(character=1:3)), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000"))) test(958, fread(input, colClasses="character"), data.table(A=c("01","002"),B=c("foo","bar"),C=c("3.140","6.28000"))) test(959, fread(input, colClasses=c("character","double","numeric"), verbose=TRUE), warning = "Column 2 ('B') has been detected as type 'character'. Ignoring request from colClasses to read as 'numeric' (a lower type) since NAs (or loss of precision) may result", output = "Detected 3 columns", # including output= just so that verbose output is captured, just the warning will be checked. data.table(A=c("01","002"),B=c("foo","bar"),C=c(3.14,6.28))) test(960, fread(input, colClasses=c("character","double")), error="colClasses is unnamed and length 2 but there are 3 columns. See") test(961, fread(input, colClasses=1:3), error="colClasses is not type list or character vector") test(962, fread(input, colClasses=list(1:3)), error="colClasses is type list but has no names") test(963, fread(input, colClasses=list(character="D")), error="Column name 'D' in colClasses..1.. 
not found") test(964, fread(input, colClasses=c(D="character")), error="Column name 'D' in colClasses..1.. not found") test(965, fread(input, colClasses=list(character=0)), error="Column number 0 (colClasses..1...1.) is out of range .1,ncol=3.") test(966, fread(input, colClasses=list(character=2:4)), error="Column number 4 (colClasses..1...3.) is out of range .1,ncol=3.") # Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649 test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L) # Test fread warns about removal of any footer (and autostart skips up over it) test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 4.*discarded.*Rowcount: 2") # fread skip override input = "some,bad,data\nA,B,C\n1,3,5\n2,4,6\n" test(971, fread(input), data.table(some=c("A",1:2),bad=c("B",3:4),data=c("C",5:6))) test(972, fread(input, skip=1), data.table(A=1:2,B=3:4,C=5:6)) test(973, fread(input, skip=2), data.table(V1=1:2,V2=3:4,V3=5:6)) test(974, fread(input, skip=2, header=TRUE), data.table("1"=2L,"3"=4L,"5"=6L)) test(975, fread(input, skip="B"), data.table(A=1:2,B=3:4,C=5:6)) input = "\n\nA,B\n1,3\n2,4\n\nC,D\n5,7\n6,8\n\nE,F\n9,11\n10,12\n" # 3 tables in one file test(976, fread(input), data.table(A=1:2,B=3:4), warning="Stopped reading at empty line 6.*discarded.*C,D") test(977, fread(input, autostart=8), data.table(C=5:6,D=7:8), warning="Stopped reading at empty line 10.*discarded.*E,F") test(978, fread(input, skip="D"), data.table(C=5:6,D=7:8), warning="Stopped reading at empty line 10.*discarded.*E,F") # mixed add and update in same `:=` bug/crash, 
#2528 and #2778 DT = data.table(x=rep(1:2, c(3,2)), y=6:10) DT[, z:=.GRP, by=x] # first assignment test(979, DT[, `:=`(z=.GRP, w=2), by=x], data.table(x=INT(1,1,1,2,2),y=6:10,z=INT(1,1,1,2,2),w=2)) # mixed update and add # and example from http://stackoverflow.com/a/14732348/403310 : dt1 = fread("Date,Time,A,B 01/01/2013,08:00,10,30 01/01/2013,08:30,15,25 01/01/2013,09:00,20,20 02/01/2013,08:00,25,15 02/01/2013,08:30,30,10 02/01/2013,09:00,35,5") dt2 = fread("Date,A,B,C 01/01/2013,100,300,1 02/01/2013,200,400,2") setkey(dt1, "Date") setkey(dt2, "Date") test(980, dt1[dt2, `:=`(A=A+i.A, B=B+i.B, C=i.C)][,list(A,B,C)], data.table(A=INT(110,115,120,225,230,235),B=INT(330,325,320,415,410,405),C=rep(1:2,each=3))) DT = data.table(A=1:2,B=3:4,C=5:6) test(981, DT[,`:=`(D=B+4L,B=0:1,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L)), warning="RHS 2 is length 2") DT = data.table(A=1:2,B=3:4,C=5:6) test(982, DT[,`:=`(D=B+4L,B=0L,E=A*2L,F=A*3L,C=C+1L,G=C*2L),by=A], data.table(A=1:2,B=0L,C=6:7,D=7:8,E=c(2L,4L),F=c(3L,6L),G=c(10L,12L))) # Also note that G is not yet iterative. 
In future: c(12,14) # rbindlist binding factors, #2650 test(983, rbindlist(list(data.table(factor(c("A","A","B","C","A"))), data.table(factor(c("B","F","A","G"))))), data.table(V1=factor(c("A","A","B","C","A","B","F","A","G")))) test(984, rbindlist(list(data.table(factor(c("A","B"))), data.table(c("C","A")))), data.table(factor(c("A","B","C","A")))) test(985, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C","A"))))), data.table(factor(c("A","B","C","A")))) # with NA test(985.1, rbindlist(list(data.table(factor(c("A","B"))), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA)))) test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))))), data.table(factor(c("A","B","C",NA)))) ## Allow unique/duplicated to accept custom colum combination to query for ## uniqueness dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") df <- as.data.frame(dt) test(986, unique(dt, by=key(dt)), dt[!duplicated(df[, key(dt)]),]) test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])]) test(988, unique(dt, by='B'), dt[!duplicated(df[, 'B'])]) test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])]) test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])]) test(991, unique(dt, by=NULL), dt[!duplicated(df)]) test(991.1, unique(dt, by=4), error="Integer values between 1 and ncol are required") test(991.2, unique(dt, by=c(1,3.1)), error="Integer values between 1 and ncol are required") test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])]) test(991.4, unique(dt, by=c('C','D','E')), error="by specifies column names that do not exist. 
First 5: D,E") # :=NULL on factor column in empty data.table, #4809 DT = data.table(A = integer(), B = factor()) test(992, DT[, B:=NULL], data.table(A=integer())) # That including FUN= works in j=lapply, #4839 DT = as.data.table(iris) test(993, DT[, lapply(.SD, function(x) sum(!is.na(x), na.rm=TRUE)), by = Species], DT[, lapply(.SD, FUN=function(x) sum(!is.na(x), na.rm=TRUE)), by = Species]) # fread more than 50,000 columns, the R_PPSSIZE limit in Defn.h # Takes too long for routine use. TO DO: move to a long running stress test script #M = matrix(1,nrow=3,ncol=200000) #f = tempfile() #write.csv(M,f,row.names=FALSE) #test(994, fread(f)[[200000]], rep(1L,3)) #unlink(f) # CJ with `sorted = FALSE` option DT <- data.table(x=rep(3:5, each=4), y=rep(1:6, each=2), z=1:12) setkey(DT, x, y) OUT <- DT[J(c(5,5,3,3), c(5,1,5,1))] test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT) # CJ with ordered factor xx <- factor(letters[1:2], ordered=TRUE) yy <- sample(2) test(996, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2)))) # That CJ orders NA consistently with setkey and historically, now it doesn't use setkey. # NA must always come first in data.table throughout, since binary search relies on that internally. test(997, DT <- CJ(c(1,3,NA,2), 5:6), setkey(setkey(copy(DT),NULL))) # double setkey to really rebuild key test(998, DT <- CJ(as.integer(c(1,3,NA,2)), 5:6), setkey(setkey(copy(DT),NULL))) test(999, DT <- CJ(c("A","B",NA,"C"), 5:6), setkey(setkey(copy(DT),NULL))) test(1000, DT <- CJ(c(1,NA,3), c("B",NA,"A"), c(5L,NA_integer_)), setkey(setkey(copy(DT),NULL))) test(1001, DT <- CJ(c(1,NA,3)), setkey(setkey(copy(DT),NULL))) # The 1 column case is switched inside CJ() so test that too. 
# merge all=TRUE when y is empty, #2633 a = data.table(P=1:2,Q=3:4,key='P') b = data.table(P=2:3,R=5:6,key='P') test(1002, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='P')) a = data.table(c=c(1,2),key='c') b = data.table(c=3,key='c') test(1003, merge(a,b[0],all=TRUE), data.table(merge.data.frame(a,b[0],all=TRUE),key='c')) # setkey with backticks, #2452 DT = data.table("Date and Time"=1:3,x=4:6) test(1004, setkey(copy(DT),`Date and Time`), setkey(DT,"Date and Time")) # rbinding with duplicate names, NA or "", #2384 and #2726 DT = data.table(a=1:3,b=4:6,b=7:9,c=10:12) test(1005, rbind(DT,DT), data.table(a=rep(1:3,2),b=rep(4:6,2),b=rep(7:9,2),c=rep(10:12,2))) M <- mtcars colnames(M)[11] <- NA test(1006, print(as.data.table(M), nrows=10), output="gear NA.*1: 21.0") # rbinding factor with non-factor/character DT1 <- data.table(x=1:5, y=factor("a")) DT2 <- data.table(x=1:5, y=2) test(1007, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) test(1008, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) # rbindlist different types DT1 <- data.table(a = 1L, b = 2L) DT2 <- data.table(a = 2L, b = 'a') DT3 <- data.table(a = 2L, b = 2.5) test(1008.1, rbindlist(list(DT1, DT2)), data.table(a = c(1L,2L), b = c('2', 'a'))) test(1008.2, rbindlist(list(DT1, DT3)), data.table(a = c(1L,2L), b = c(2, 2.5))) # optimized mean() respects na.rm=TRUE by default, as intended DT = data.table(a=c(NA,NA,FALSE,FALSE), b=c(1,1,2,2)) test(1009, DT[,list(mean(a), sum(a)),by=b], data.table(b=c(1,2),V1=c(NA,0),V2=c(NA_integer_,0L))) # sum(logical()) should be integer, not real # an fread error shouldn't hold a lock on the file on Windows. f = tempfile() cat('A,B\n"aa",2\n"bb,2\n"cc",3\n', file=f) # NB: deliberate missing quote after bb. 
test(1010, fread(f), data.table(A=c("aa", "\"bb", "cc"), B=c(2L,2L,3L))) cat('dd",4\n',file=f,append=TRUE) # tests file lock on Windows after error test(1011, fread(f), data.table(A=c("aa", "\"bb", "cc", "dd\""), B=c(2L,2L,3L,4L))) cat('A,B\n"aa",1\n"bb",2\n"cc",3\n', file=f) # testing overwrite test(1012, fread(f), data.table(A=c("aa","bb","cc"),B=1:3)) unlink(f) # testing file can be removed after error # integer64 control to fread test(1013, fread("A,B\n123,123\n", integer64="integer"), error="integer64='%s' which isn't 'integer64'|'double'|'numeric'|'character'") test(1014, fread("A,B\n123456789123456,21\n", integer64="character"), data.table(A="123456789123456",B=21L)) test(1015, fread("A,B\n123456789123456,21\n", integer64="double"), data.table(A=as.double("123456789123456"),B=21L)) # and that mid read bumps respect integer64 control too .. x = sample(1:1000,2000,replace=TRUE) DT = data.table( A=as.character(x), B=1:100) DT[115, A:="123456789123456"] # row 115 is outside the 100 rows at 10 points. 
write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(1016, fread(f,integer64="numeric"), copy(DT)[,A:=as.numeric(A)]) test(1017, fread(f,integer64="character"), DT, warning="Bumped column.*to type character.*may not be lossless") unlink(f) # ERANGE warning, #4879 tt = try(fread("1.46761e-313\n"), silent=TRUE) # options(warn=2) while this test file runs if (inherits(tt,"try-error")) { # All CRAN machines including SPARC test(1018, fread("1.46761e-313\n"), data.table(V1=as.numeric("1.46761e-313")), warning="strtod() returned ERANGE") } else { # on PowerPC only via QEMU emulation : test(1018, fread("1.46761e-313\n"), data.table("1.46761e-313"=logical())) } test(1019, fread("1.23456789123456789123456789\n"), data.table(V1=as.numeric("1.23456789123456789123456789"))) # no warning, as standard # crash assigning to row 0, #2754 DT = data.table(A=1:5,B=6:10) test(1020, DT[0,A:=6L], DT) test(1021, DT[NA,A:="foo"], DT) test(1022, DT[5:0,A:=21L], data.table(A=21L, B=6:10)) test(1023, DT[c(1,2,NA,3), B:=42L], data.table(A=21L, B=c(42L,42L,42L,9:10))) test(1024, DT[6,A:=0L], error="i[[]1[]] is 6 which is out of range [[]1,nrow=5[]]") # crash assigning to duplicated column names/numbers, #2751 test(1024.1, DT[,c("B","B"):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).") test(1024.2, DT[,c(1,2,1):=NULL], error="Can't assign to the same column twice in the same query (duplicates detected).") # as.data.table.table, #4848 DF <- data.frame(x = c(1,1,2,NA,1,2), y = c("b", "b", "b", "a", "c", "a"), z = c(1,1,1,1,1,2), stringsAsFactors=FALSE ) tab1 <- as.data.table(as.data.frame(table(DF$x), stringsAsFactors=FALSE)); setattr(tab1, 'names', c("V1", "N")) tab2 <- as.data.table(as.data.frame(table(DF$x, DF$y), stringsAsFactors=FALSE)); setattr(tab2, 'names', c("V1", "V2", "N")) tab3 <- as.data.table(as.data.frame(table(DF$x, DF$y, DF$z), stringsAsFactors=FALSE)); setattr(tab3, 'names', c("V1", "V2", "V3", "N")) test(1025, 
as.data.table(table(DF$x)), tab1) test(1026, as.data.table(table(DF$x, DF$y)), tab2) test(1027, as.data.table(table(DF$x, DF$y, DF$z)), tab3) # catch printing of data.table(table()), #4847 (as.data.table should be used instead) # new, updated 14th Feb, 2015. data.table(table) now redirects to as.data.table test(1027.1, data.table(table(1:99)), as.data.table(table(1:99))) # data.table() and rbindlist() in v1.8.11 now catch and removes the dim attribute. For it on to test print catches it : test(1027.2, {DT<-data.table(table(1:99));setattr(DT[[1]],"dim",99L);print(DT)}, error="Invalid column: it has dimensions. Can't format it. If it's the result of data.table(table()), use as.data.table(table()) instead.") # as.data.table.x where x is integer, numeric, etc... set.seed(45) test(1028, as.data.table(x<-sample(5)), data.table(V1=x)) test(1029, as.data.table(x<-as.numeric(x)), data.table(V1=x)) test(1030, as.data.table(x<-as.Date(x, origin="2013-01-01")), data.table(V1=x)) test(1031, as.data.table(x<-factor(sample(5))), data.table(V1=x)) test(1032, as.data.table(x<-factor(x, ordered=TRUE)), data.table(V1=x)) test(1033, as.data.table(x<-as.logical(sample(0:1, 5, TRUE))), data.table(V1=x)) test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) ######################################### # All melt.data.table tests go in here # ######################################### if ("package:reshape2" %in% search()) { set.seed(45) DT <- data.table( i_1 = c(1:5, NA), i_2 = c(NA,6,7,8,9,10), f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), c_1 = sample(c(letters[1:3], NA), 6, TRUE), d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), d_2 = as.Date(6:1, origin="2012-01-01")) DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] # generate list cols DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] test(1035, melt(DT, id=1:2, measure=3:4), melt(DT, id=c("i_1", "i_2"), measure=c("f_1", "c_1"))) ans1 = cbind(DT[, c(1,2,8), 
with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] test(1036, melt(DT, id=c("i_1", "i_2", "l_2"), measure=c("l_1")), ans1) # melt retains attributes if all are of same type (new) ans2 = data.table(c_1=DT$c_1, variable=rep(c("d_1", "d_2"), each=6), value=as.Date(c(DT$d_1, DT$d_2)))[!is.na(value)] test(1037, melt(DT, id=4, measure=5:6, na.rm=TRUE, variable.factor=FALSE), ans2) DT2 <- data.table(x=1:5, y=1+5i) # unimplemented class test(1038, melt(DT2, id=1), error="Unknown column type 'complex'") # more tests DT[, f_2 := factor(c("z", "a", "x", "z", "a", "a"), ordered=TRUE)] DT[, id := 1:6] ans1 = cbind(melt(DT, id="id", measure=5:6, value.name="value1"), melt(DT, id=integer(0), measure=7:8, value.name="value2")[, variable:=NULL]) levels(ans1$variable) = as.character(1:2) test(1038.2, ans1, melt(DT, id="id", measure=list(5:6, 7:8))) test(1038.3, ans1, melt(DT, id="id", measure=list(5:6, 7:8), na.rm=TRUE)) # should've no effect test(1038.7, ans1, melt(DT, id="id", measure=patterns("d_", "l_"))) # melt retains ordered factors! 
test(1038.4, melt(DT, id="id", measure=c("f_1", "f_2"), value.factor=TRUE)$value, factor(c(as.character(DT$f_1), as.character(DT$f_2)), ordered=TRUE)) # if measure is integer(0) just returns a duplicated data.table with all idcols test(1038.5, melt(DT, id=1:6, measure=integer(0)), shallow(DT, 1:6)) # measure.var list with single entry recycles to maximum length ans = cbind(melt(DT, id="id", measure=c("c_1", "c_1"))[, variable := NULL], melt(DT, id=integer(0), measure=c("f_1", "f_2"))) setnames(ans, c("id", "value1", "variable", "value2")) setcolorder(ans, c("id", "variable", "value1", "value2")) levels(ans$variable) = as.character(1:2) test(1038.6, melt(DT, id="id", measure=list(c("c_1", "c_1"), c("f_1", "f_2"))), ans) # test to ensure attributes on non-factor id-columns are preserved after melt DT <- data.table(x=1:3, y=letters[1:3], z1=8:10, z2=11:13) setattr(DT$x, 'foo', 'bla1') setattr(DT$y, 'bar', 1:4) test(1222.1, attr(melt(DT, id=1:2)$x, "foo"), "bla1") test(1222.2, attr(melt(DT, id=1:2)$y, "bar"), 1:4) # bug #699 - melt segfaults when vars are not in dt x = data.table(a=c(1,2),b=c(2,3),c=c(3,4)) test(1316.1, melt(x, id="d"), error="One or more values") test(1316.2, melt(x, measure="d"), error="One or more values") test(1316.3, melt(x, id="a", measure="d"), error="One or more values") test(1316.4, melt(x, id="d", measure="a"), error="One or more values") # fix for #780. DT = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9) foo = function(input, by, var) { melt(input, id.vars = by, measure.vars=var) } test(1371.1, foo(DT, by="x"), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)), warning="are not all of the same type. 
By order of hierarchy, the molten data value column will be of type 'double'") test(1371.2, foo(DT), data.table(x=rep(DT$x, 2L), variable=factor(rep(c("y", "v"), each=9L), levels=c("y", "v")), value=c(DT$y, DT$v)), warning="To be consistent with reshape2's melt, id.vars and") # Fix for #1055 DT <- data.table(A = 1:2, B = 3:4, D = 5:6, D = 7:8) test(1495, melt(DT, id=1:2), data.table(A=1:2, B=3:4, variable=factor(rep(1L, 4L), labels="D"), value=5:8)) # segfault of unprotected var caught with the help of address sanitizer set.seed(1) val = sample(c(1:5, NA), 1e6L, TRUE) dt <- setDT(replicate(100L, val, simplify=FALSE)) ## to ensure there's no segfault... ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE) test(1509, ans, ans) # improper levels fix, #1359 dt = data.table(id=1:3, x=NA_character_, y=c('a', NA_character_, 'c')) test(1563, melt(dt, id.var="id", na.rm=TRUE), data.table(id=c(1L,3L), variable=factor(c("y", "y")), value=c("a", "c"))) # fixing segfault due to negative id and measure vars that I detected by accident dt = data.table(x=1:5, y=6:10, z=11:15) test(1569.1, melt(dt, id=-1, measure=NULL), error="One or more values in 'id.vars'") test(1569.2, melt(dt, id=-1, measure=-1), error="One or more values in 'id.vars'") test(1569.3, melt(dt, id=NULL, measure=-1), error="One or more values in 'measure.vars'") test(1569.4, melt(dt, id=5, measure=-1), error="One or more values in 'id.vars'") test(1569.5, melt(dt, id=1, measure=-1), error="One or more values in 'measure.vars'") } # sorting and grouping of Inf, -Inf, NA and NaN, #4684, #4815 & #4883 DT <- data.table(x = rep(c(1, NA, NaN, Inf, -Inf), each=2)) OUT <- data.table(x=c(1, NA, NaN, Inf, -Inf), N=2L) test(1039, DT[, .N, by=x], OUT) DT <- data.table(y =c(NA, Inf, NA, -Inf, -Inf, NaN, Inf, 1, NaN, 1)) OUT <- data.table(y = c(NA, Inf, -Inf, NaN, 1), N=2L) test(1040, DT[, .N, by=y], OUT) # rbindlist on *data.frame* input, #4648. Somehow not test for this. (Although, #4648 was the same as #2650 fixed in v1.8.9). 
l <- list(u1=data.frame(i1=c('a', 'b', 'c'), val=1:3, stringsAsFactors=TRUE), u2=data.frame(i1=c('d', 'e'), val=4:5, stringsAsFactors=TRUE)) test(1041, rbindlist(l), data.table(i1=factor(letters[1:5]),val=1:5)) # negative indexing in *i* leads to crash/wrong aggregates when dogroups is called. bug #2697 DT = data.table(x = c(1,2,3,4,5), group = c(1,1,2,2,3)) test(1042, DT[-5, mean(x), by = group], data.table(group=c(1,2), V1=c(1.5, 3.5))) # Test when abs(negative index) > nrow(dt) - should warn test(1042.1, DT[-10], DT, warning="Item 1 of i is -10 but there are only 5 rows. Ignoring this and 0 more like it out of 1.") test(1042.2, DT[c(-5, -10), mean(x), by = group], data.table(group=c(1,2),V1=c(1.5,3.5)), warning="Item 2 of i is -10 but there are only 5 rows. Ignoring this and 0 more like it out of 2.") # Test #1043 TO DO - mixed negatives test(1043, DT[c(1, -5)], error="Item 2 of i is -5 and item 1 is 1. Cannot mix positives and negatives.") # crash (floating point exception), when assigning null data.table() to multiple cols, #4731 DT = data.table(x=1:5,y=6:10) test(1044, DT[3,c("x","y"):=data.table()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list(list())") test(1045, DT[3,c("x","y"):=list()],error="Supplied 2 columns to be assigned an empty list.*use NULL instead.*list(list())") # negative indexing with head() and tail(). 
bug #2375 d1 = data.table(date = c(1,2,3,4,5), value = c(1,2,3,4,5)) d2 = data.frame(d1) test(1046, head(d1, -2), as.data.table(head(d2, -2))) test(1047, head(d1, 2), as.data.table(head(d2, 2))) test(1048, head(d1, -10), as.data.table(head(d2, -10))) test(1049, head(d1, 10), as.data.table(head(d2, 10))) test(1050, tail(d1, -2), as.data.table(tail(d2, -2))) test(1051, tail(d1, 2), as.data.table(tail(d2, 2))) test(1052, tail(d1, -10), as.data.table(tail(d2, -10))) test(1053, tail(d1, 10), as.data.table(tail(d2, 10))) # negative indexing with `:=` - new feature through fixing of #2697, performs as intended for negative subscripts. x <- data.table(letters=letters[1:5], number=1:5) test(1054, x[-(1:3), number := 1L], x[4:5, number := 1L]) test(1055, x[0, number := 1L], x) # print.data.table heeds digits=2 etc, #2535 DT = data.table(x=rep(c("a","b","c"),each=3), y=(30/7)^(2:10))[, logy := log(y)] test(1056, print(DT, digits=2), output=" x y logy1: a 18 2.92: a 79 4.43: a 337 5.8") test(1057, print(DT, digits=2, big.mark=","), output=" x y logy1: a 18 2.9.*6: b 26,556 10.27: c 113,811 11.6") # bug #2758 fix - segfault with zeros in i and factors in by x <- data.table(letters=letters[1:5], factor=factor(letters[1:5]), number=1:5) test(1058, x[c(0, 3), list(letters, number), by=factor], error="While grouping, i=0 is allowed") test(1059, x[c(3, 0), list(letters, number), by=factor], error="While grouping, i=0 is allowed") test(1060, x[c(0, 3), number:=5L, by=factor], error="While grouping, i=0 is allowed") test(1061, x[c(0, 3), number:=5L], data.table(letters=letters[1:5], factor=factor(letters[1:5]), number=c(1:2,5L,4:5))) # bug #2440 fix - seqfault when j refers to grouping variable when results are empty DT = data.table(x=rep(c("a","b"),each=3),v=c(42,42,42,4,5,6)) test(1062, DT[x %in% c('z'),list(x2=x),by=x], output="Empty data.table (0 rows) of 2 cols: x,x2") test(1063, DT[x %in% c('z'),list(vpaste=paste(v,collapse=','),x2=paste(x,x)),by=x], output="Empty data.table (0 
rows) of 3 cols: x,vpaste,x2") test(1064, DT[integer(0), list(x2=x), by=x], output="Empty data.table (0 rows) of 2 cols: x,x2") # bug #2445 fix - := fails when subsetting yields NAs and with=FALSE X = data.table(A=1:3, B=1:6, key="A") var <- "B" test(1065, X[J(2:5), (var):=22L], data.table(A=rep(1:3, each=2), B=c(1L,4L,rep(22L,4)), key="A")) # fread single unnamed colClasses f = "A,B,C,D\n1,3,5,7\n2,4,6,8\n" test(1066, fread(f,colClasses=c("integer","integer","character")), error="colClasses is unnamed and length 3 but there are 4 columns") test(1067, fread(f,colClasses=c("integer","numeric","character","character")), data.table(A=1:2,B=c(3,4),C=c("5","6"),D=c("7","8"))) test(1068, fread(f,colClasses="character"), data.table(A=c("1","2"),B=c("3","4"),C=c("5","6"),D=c("7","8"))) # fread select and drop test(1069, fread(f,drop=c("D","B")), data.table(A=1:2,C=5:6)) test(1070, fread(f,drop="E"), fread(f), warning="Column name 'E' in 'drop' not found") test(1071, fread(f,select="B",colClasses=list(numeric="C")), data.table(B=3:4)) test(1072, fread(f,select="B",drop="C"), error="not both") test(1073, fread(f,drop=2:3), fread(f,select=c(1,4))) # tests coercing numeric select as well # that problem printing duplicate columns doesn't return, #4788 DT = data.table(V1 = c(1:1000), V2 = c(10001:11000)) test(1074, DT[, sum(V2), by = V1], output="1000: 1000 11000") # x has two columns both called V1 here # add test from #2446. Already fixed but add anyway. "names in neworder not found in x: 'colnames with spaces' from merge() when all.y=TRUE" X = data.table(a=1:3,b=4:6,"c d"=7:9) Y = data.table(e=10:12,a=2:4) test(1075, merge(X,Y,by="a",all=TRUE), data.table(a=c(1:4),b=c(4:6,NA),"c d"=c(7:9,NA),e=c(NA,10:12),key="a")) # Fixes #2670. `by` sometimes incorrect for expressions of keyed columns. When by is used like `by=month(date)`, with key column set to "date", grouping+aggregation would be wrong. 
DT = data.table(date=as.Date("2013-01-01")+seq(1,1000,by=10),1:100) setkey(DT,date) test(1076, DT[,sum(V2),by=month(date)], DT[, sum(V2), by=list(month(date))]) # just to be sure, second test with another function using sample. setkey(DT, V2) ff <- function(x) { set.seed(45); (sample(x)-1) %/% 10} test(1077, DT[, sum(V2),by=ff(V2)], DT[, sum(V2),by=list(ff(V2))]) # rbindlist should discard names on columns, #4890 d = data.frame(x=1:5) f = function(x) {suppressWarnings(DF<-data.frame(x=x, y=1:10)); setattr(DF$x,"names","a");DF} l = apply(d, 1, f) test(1078.1, length(names(l[[1]]$x)), 10) # test this test is creating names on the column test(1078.2, length(names(l[[2]]$x)), 10) a = rbindlist(l) test(1078.3, a$x, rep(1:5,each=10)) # a$x would segfault before the fix to rbindlist # data.table() shouldn't retain column names, root cause of #4890 x = 1:5 names(x) = letters[1:5] test(1079.1, DF<-data.frame(x=x, y=1:10), data.frame(x=rep(1:5,2),y=1:10), warning="row names.*discarded") test(1079.2, lapply(DF, names), list(x=NULL, y=NULL)) test(1079.3, DT<-data.table(x=x, y=1:10), data.table(x=rep(1:5,2),y=1:10)) test(1079.4, lapply(DT, names), list(x=NULL, y=NULL)) # test from similar #4912 for completeness z = c(a=1,b=2,c=3) a = data.table(z,x=1:3) b = rbind(a, data.table(z=2,x=1)) test(1080, b$z, c(1,2,3,2)) # mid row logical detection test(1081, fread("A,B,C\n1,T,2\n"), data.table(A=1L,B=TRUE,C=2L)) # cartesian join answer's key should contain only the columns considered in binary search. 
Fixes #2677 set.seed(45) n <- 10 DT1 <- data.table(a=sample(1:3, n, replace=TRUE), b=sample(1:3, n, replace=TRUE), c=sample(1:10, n,replace=TRUE), key=c("a", "b", "c")) DT2 <- data.table(p=sample(1:3, n, replace=TRUE), q=sample(1:3, n, replace=TRUE), r=sample(1:n), w=sample(1:n)) setkey(DT2, p,q) ans <- DT1[DT2, nomatch=0, allow.cartesian=TRUE] # NB: DT2 contains duplicate key values so columns c ends up not being sorted test(1082.1, key(ans), c("a","b")) test(1082.2, setkeyv(ans, key(ans)), ans) # i.e. key is valid, otherwise re-built warning will be caught check <- setkey(as.data.table(aggregate(r ~a+b+c, ans, length)), a, b) test(1083, setkeyv(ans[, list(r = .N), by=key(DT1)], key(ans)), check) # if the key is set properly, then and only then will the aggregation results match with "check" # Tests for #2531. `:=` loses POSIXct or ITime attribute: # first test from this SO post: http://stackoverflow.com/questions/15996692/cannot-assign-columns-as-date-by-reference-in-data-table dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := as.IDate(min(date)), by = group] test(1084, class(dt$min.group.date), c("IDate", "Date")) dt <- data.table(date = as.IDate(sample(10000:11000, 10), origin = "1970-01-01")) dt[, group := rep(1:2, 5)] dt[, min.group.date := min(date), by = group] # don't need to wrap it with as.IDate(.) test(1085, class(dt$min.group.date), c("IDate", "Date")) # second test from this SO post: http://stackoverflow.com/questions/14604820/why-does-this-posixct-or-itime-loses-its-format-attribute DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) DT[,x1:=as.ITime(x)] DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y] test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") # Tests 1088-1093 were non-ASCII. 
# Now in DtNonAsciiTests

# NOTE on test ids in this section: ids are numeric literals, so 1501.10 and
# 1501.1 are the same number. The 1501.x sub-tests are therefore zero-padded
# (1501.01 ... 1501.09, 1501.10) so that every id is distinct.
# NOTE on expected print output: the strings compared against capture.output()
# must keep the column-alignment padding (including whitespace-only header
# lines and trailing blanks).

# print of unnamed DT with >20 <= 100 rows, #97 (RF#4934)
DT <- data.table(x=1:25, y=letters[1:25])
DT.unnamed <- unname(copy(DT))
test(1094.1, capture.output(print(DT.unnamed)),
     c("        ",
       " 1:  1 a", " 2:  2 b", " 3:  3 c", " 4:  4 d", " 5:  5 e",
       " 6:  6 f", " 7:  7 g", " 8:  8 h", " 9:  9 i", "10: 10 j",
       "11: 11 k", "12: 12 l", "13: 13 m", "14: 14 n", "15: 15 o",
       "16: 16 p", "17: 17 q", "18: 18 r", "19: 19 s", "20: 20 t",
       "21: 21 u", "22: 22 v", "23: 23 w", "24: 24 x", "25: 25 y",
       "        "))

# print of blank-named DT (eliminating matrix notation)
# #545 (RF#5253) and part of #1523
DT <- data.table(x = 1:3)
setnames(DT, "")
test(1094.2, capture.output(print(DT)), c("    ", "1: 1", "2: 2", "3: 3"))

# DT[!TRUE] or DT[!TRUE, which=TRUE], #4930. !TRUE still can be a recycling operation with !(all TRUE)
DT <- data.table(x=1:3, y=4:6)
test(1095.1, DT[!TRUE], DT[FALSE])
test(1095.2, DT[!TRUE, which=TRUE], DT[FALSE, which=TRUE])

######### incremented tests by 1 as I've used 1096 for FR #2077 (above along with already existing tests 522): ###########
# roll backwards when i is keyed and rollends=FALSE
# http://stackoverflow.com/questions/18984179/roll-data-table-with-rollends
dt1 = data.table(Date=as.Date(c("2013-01-03","2013-01-07")),key="Date")[,ind:=.I]
dt2 = data.table(Date=seq(from=as.Date("2013-01-01"),to=as.Date("2013-01-10"), by="1 day"),key="Date")
test(1097, dt1[dt2,roll=-Inf,rollends=FALSE]$ind, INT(NA,NA,1,2,2,2,2,NA,NA,NA))         # now ok
test(1098, dt1[dt2,roll=-Inf,rollends=TRUE]$ind, INT(1,1,1,2,2,2,2,2,2,2))               # ok before
test(1099, dt1[dt2,roll=-Inf,rollends=c(TRUE,FALSE)]$ind, INT(1,1,1,2,2,2,2,NA,NA,NA))   # ok before
test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2,2,2))    # now ok

#########################################
# All dcast.data.table tests go in here #
#########################################
# The whole suite only runs when reshape2 is attached, because most tests
# compare dcast on a data.table against dcast on the equivalent data.frame.
if ("package:reshape2" %in% search()) {
  names(ChickWeight) <- tolower(names(ChickWeight))
  DT <- melt(as.data.table(ChickWeight), id=2:4) # calls melt.data.table

  # changed 'mean' to 'sum' to avoid valgrind floating point precision based error.
  test(1101, as.data.frame(dcast(DT, time ~ variable, fun=sum)), dcast(as.data.frame(DT), time~variable, fun=sum))
  test(1102, as.data.frame(dcast(DT, diet ~ variable, fun=sum)), dcast(as.data.frame(DT), diet~variable, fun=sum))
  x1 <- as.data.frame(dcast(DT, diet+chick ~ time, drop=FALSE))
  x1$chick <- factor(x1$chick, levels=levels(x1$chick), ordered=FALSE)
  x2 <- dcast(as.data.frame(DT), diet+chick~time, drop=FALSE)
  test(1103, x1,x2)
  x1 <- as.data.frame(dcast(DT, diet+chick ~ time, drop=FALSE, fill=0))
  x1$chick <- factor(x1$chick, levels=levels(x1$chick), ordered=FALSE)
  x2 <- dcast(as.data.frame(DT), diet+chick~time, drop=FALSE, fill=0)
  test(1104.1, x1,x2)

  # add test for 'subset' in dcast
  x1 <- dcast(as.data.frame(DT), time + chick ~ variable+diet, fun=sum, subset=.(time> 20))
  x2 <- as.data.frame(dcast(DT, time + chick ~ variable+diet, fun=sum, subset=.(time> 20)))
  test(1104.2, x1, x2)

  # testing without aggregation
  x <- data.table(a=5:1, b=runif(5))
  test(1104.3, as.data.frame(dcast(x, a ~ b, value.var="b")), dcast(as.data.frame(x), a ~ b, value.var="b"))

  # Fix for case 2 in bug report #5149 - dcast didn't aggregate properly when formula RHS has "."
  set.seed(45)
  DT = data.table(x=rep(1:5, each=3), y=runif(15, 0, 1))
  ans = setDT(dcast(as.data.frame(DT), x ~ ., mean, value.var="y"))
  setkey(ans, x)
  test(1148.1, dcast(DT, x ~ ., mean, value.var="y"), ans)

  # also quashed another bug with `.` in formula (when there's no aggregate function):
  DT <- data.table(a=sample(5), b=runif(5), c=5:1)
  ans1 = setDT(dcast(as.data.frame(DT), a ~ ., value.var="c"))
  ans2 = setDT(dcast(as.data.frame(DT), b+a ~ ., value.var="c"))
  setkey(ans1, "a")
  setkey(ans2, "b", "a")
  test(1148.2, dcast(DT, a ~ ., value.var="c"), ans1)
  test(1148.3, dcast(DT, b+a~., value.var="c"), ans2)

  # more tests for `dcast` with formula being character and errors when formula is a hybrid
  set.seed(1)
  x <- data.table(a=rep(1:5, each=5), b=runif(25))
  ### adding all extra arguments for no verbose during "test.data.table()" to all dcast tests
  test(1150.1, dcast(x, " a~ . ", value.var="b", fun=length), data.table(a=1:5, `.`=5L, key="a"))
  test(1150.2, dcast(x, "a ~ c ", value.var="b"), error="not found or of unknown type")
  test(1150.3, dcast(x, a ~ a, value.var="c"), error="are not found in 'data'")

  # fix for #5379 - issue when factor columns on formula LHS along with `drop=FALSE`
  set.seed(1L)
  df <- data.frame(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]),
                   b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
  dt <- as.data.table(df)
  test(1198.1, setkey(setDT(dcast(as.data.frame(df), a~b, drop=FALSE, value.var="b", fun=length)), a),
       dcast(dt, a~b, drop=FALSE, fun=length, value.var="b"))

  # reverse the levels
  set.seed(1L)
  df <- data.frame(a=factor(sample(letters[1:3], 10, replace=TRUE), letters[5:1]),
                   b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
  dt <- as.data.table(df)
  test(1198.2, setkey(setDT(dcast(as.data.frame(df), a~b, drop=FALSE, value.var="b", fun=length)), a),
       dcast(dt, a~b, drop=FALSE, value.var="b", fun=length))

  # more factor cols
  set.seed(1L)
  df <- data.frame(a1=factor(sample(letters[1:3], 10, replace=TRUE), letters[1:5]), # factor col 1
                   a2=factor(sample(letters[6:10], 10, replace=TRUE), letters[6:10]), # factor col 2
                   a3=sample(letters[1:3], 10, TRUE), # no factor
                   b=factor(sample(tail(letters, 5), 10, replace=TRUE)))
  dt <- as.data.table(df)
  ans <- dcast(dt, a1+a2+a3~b, drop=FALSE, value.var="b")
  ans[, c(4:7) := lapply(.SD, as.character), .SDcols=4:7]
  test(1198.3, setkey(setDT(dcast(as.data.frame(df), a1+a2+a3~b, drop=FALSE, value.var="b")), a1,a2,a3), ans)

  # dcast bug fix for 'subset' argument (it doesn't get key set before to run C-fcast):
  dt <- data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA))
  test(1252, dcast(dt, x~y, value.var="z", subset=.(!is.na(z))),
       data.table(x=c(1,2), `1`=c(1,5), `2`=c(2,4), `3`=c(3,NA), key="x"))

  # FR #5675 and DOC #5676
  set.seed(1L)
  dt <- data.table(a=sample(10), b=2013:2014, variable=rep(c("c", "d"), each=10), value=runif(20))
  ans1 <- names(dcast(dt, a ~ ... + b, value.var="value"))
  test(1286, ans1, c("a", "c_2013", "c_2014", "d_2013", "d_2014"))

  # bug git #693 - dcast error message improvement:
  dt <- data.table(x=c(1,1), y=c(2,2), z = 3:4)
  test(1314, dcast(dt, x ~ y, value.var="z", fun.aggregate=identity), error="should take vector inputs and return a single value")

  # bug #688 - preserving attributes
  DT = data.table(id = c(1,1,2,2), ty = c("a","b","a","b"), da = as.Date("2014-06-20"))
  test(1315, dcast(DT, formula = id ~ ty, value.var="da"),
       data.table(id=c(1,2), a=as.Date("2014-06-20"), b=as.Date("2014-06-20"), key="id"))

  # issues/713 - dcast and fun.aggregate
  DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
  foo <- function (tbl, fun.aggregate) {
    dcast(tbl, id ~ k, value.var='v', fun.aggregate=fun.aggregate, fill=NA_integer_)
  }
  test(1345, foo(DT, last), dcast(DT, id ~ k, value.var='v', fun.aggregate=last, fill=NA_integer_))

  # more minor changes to dcast (subset argument handling symbol - removing any surprises with data.table's typical scoping rules) - test for that.
  DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
  bla <- c(TRUE, rep(FALSE, 6L))
  # calling `subset=.(bla)` gives eval error when testing... not sure what's happening! using values directly instead for now.
  test(1346.1, dcast(DT, id ~ k, value.var="v", subset=.(c(TRUE, rep(FALSE, 6L)))), dcast(DT[1L], id ~ k, value.var="v"))
  DT[, bla := !bla]
  test(1346.2, dcast(DT, id ~ k, value.var="v", subset=.(bla), fun.aggregate=length),
       dcast(DT[(bla)], id ~ k, value.var="v", fun.aggregate=length))

  # issues/715
  DT <- data.table(id=rep(1:2, c(3,2)), k=c(letters[1:3], letters[1:2]), v=1:5)
  test(1347.1, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take vector inputs and return a single value")
  test(1347.2, dcast(DT, id ~ k, fun.aggregate=last, value.var="v", fill=NA_integer_),
       data.table(id=1:2, a=c(1L, 4L), b=c(2L,5L), c=c(3L,NA_integer_), key="id"))

  # Fix for #893
  dt <- data.table(
    x = factor("a", levels = c("a", "b")),
    y = factor("b", levels = c("a", "b")),
    z = 1
  )
  test(1457, dcast(dt, y ~ x, drop = FALSE, value.var="z"),
       data.table(dcast(as.data.frame(dt), y ~ x, drop = FALSE, value.var="z"), key="y"))

  # dcast.data.table new tests
  # Fix for #1070 (special case of ... on LHS)
  dt <- data.table(label= month.abb[1:5], val=0)
  test(1501.01, dcast(dt, ... ~ label, value.var="val", sum),
       data.table(`.`=".", Apr=0, Feb=0, Jan=0, Mar=0, May=0, key="."))

  # Fix for #862 (optional prefixes)
  dt <- data.table(name=c("Betty","Joe","Frank","Wendy","Sally"),
                   address=c(rep("bla1",2), rep("bla2",2), "bla3"))
  test(1501.02, dcast(dt, address ~ paste("cust", dt[, seq_len(.N), by=address]$V1, sep=""), value.var="name"),
       data.table(address=paste("bla",1:3,sep=""), cust1=c("Betty", "Frank", "Sally"), cust2=c("Joe", "Wendy", NA), key="address"))

  # Fix for #1037 (optional prefixes + undefined variables)
  dt <- structure(list(V1 = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L),
                       V2 = c(1.052, 0.542, 0.496, 0.402, 0.278, 5.115, 4.329, 4.121, 4.075, 4.0088)),
                  .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA, -10L))
  setDT(dt)
  ans1 = dcast(as.data.frame(dt), cumsum(V1 == 0) ~ V1, value.var = 'V2')
  ans2 = dcast(dt, cumsum(V1 == 0) ~ V1, value.var = 'V2')
  setkey(setnames(setDT(ans1), names(ans2)), V1)
  test(1501.03, ans1, ans2)

  # Implement #716 and #739 (multiple value.var and fun.aggregate)
  # multiple value.var
  dt = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L)
  ans21 <- dcast(as.data.frame(dt), x + y ~ z, fun=sum, value.var="d1")
  ans22 <- dcast(as.data.frame(dt), x + y ~ z, fun=sum, value.var="d2")
  ans23 <- dcast(as.data.frame(dt), x + y ~ z, fun=mean, value.var="d1")
  ans24 <- dcast(as.data.frame(dt), x + y ~ z, fun=mean, value.var="d2")

  ans1 <- dcast(dt, x + y ~ z, fun=sum, value.var=c("d1","d2"))
  ans2 <- cbind(ans21, ans22[, 3:4])
  setkey(setnames(setDT(ans2), names(ans1)), x, y)
  test(1501.04, ans1, ans2)

  # multiple fun.agg
  ans1 <- dcast(dt, x + y ~ z, fun=list(sum, mean), value.var="d1")
  ans2 <- cbind(ans21, ans23[, 3:4])
  setkey(setnames(setDT(ans2), names(ans1)), x, y)
  test(1501.05, ans1, ans2)

  # multiple fun.agg and value.var (all combinations)
  ans1 <- dcast(dt, x + y ~ z, fun=list(sum, mean), value.var=c("d1", "d2"))
  ans2 <- cbind(ans21, ans22[, 3:4], ans23[, 3:4], ans24[, 3:4])
  setkey(setnames(setDT(ans2), names(ans1)), x, y)
  test(1501.06, ans1, ans2)

  # multiple fun.agg and value.var (one-to-one)
  ans1 <- dcast(dt, x + y ~ z, fun=list(sum, mean), value.var=list("d1", "d2"))
  ans2 <- cbind(ans21, ans24[, 3:4])
  setkey(setnames(setDT(ans2), names(ans1)), x, y)
  test(1501.07, ans1, ans2)

  # Additional test after fixing fun.agg creation - using the example here: https://github.com/Rdatatable/data.table/issues/716
  DT = data.table(x=1:5, y=paste("v", 1:5, sep=""), v1=6:10, v2=11:15, k1=letters[1:5], k2=letters[6:10])
  DT.m = melt(DT, id=1:2, measure=list(3:4, 5:6))
  ans1 <- dcast(DT.m, x ~ y, fun.aggregate = list(sum, function(x) paste(x, collapse="")), value.var=list("value1", "value2"))
  ans21 <- dcast(as.data.frame(DT.m), x ~ y, fun.agg=sum, value.var="value1")
  ans22 <- dcast(as.data.frame(DT.m), x ~ y, fun.agg=function(x) paste(x, collapse=""), value.var="value2")
  ans2 <- cbind(ans21, ans22[, -1L])
  setkey(setnames(setDT(ans2), names(ans1)), x)
  test(1501.08, ans1, ans2)

  # more testing on fun.aggregate
  dt = as.data.table(airquality)
  ans = suppressWarnings(melt(dt, id=c("Month", "Day"), na.rm=TRUE))
  ans = ans[ , .(min=min(value), max=max(value)), by=.(Month, variable)]
  ans = melt(ans, id=1:2, variable.name="variable2")
  ans = dcast(ans, Month ~ variable + variable2)
  setnames(ans, c("Month", paste(names(ans)[-1L], sep="_")))
  valvars = c("Ozone", "Solar.R", "Wind", "Temp")
  ans2 <- suppressWarnings(dcast(dt, Month ~ ., fun=list(min, max), na.rm=TRUE, value.var=valvars))
  setcolorder(ans, names(ans2))
  test(1501.09, setkey(ans, Month), ans2[, names(ans2)[-1L] := lapply(.SD, as.numeric), .SDcols=-1L])

  # test for #1210, sep argument for dcast
  dt = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L)
  test(1501.10, names(dcast(dt, x ~ y + z, fun=length, value.var = "d2", sep=".")), c("x", "1.a", "1.b", "2.a", "2.b"))
}

# test for freading commands
x1 <- data.table(a = c(1:5), b = c(1:5))
f <- tempfile()
# fread() from a shell command: write x1 to the temp file f, then read it back
# through `grep -v 3`, which drops the row containing a 3.
write.csv(x1, f, row.names = FALSE)
if (.Platform$OS.type == "unix") {
  gl <- identical(Sys.getenv("CI_SERVER_NAME"), "GitLab CI")
  if (gl) {
    # skip test which fails in CI, data.table#1506
    # (only compared when the fread call actually produced a data.table)
    x2 <- try(fread(paste('grep -v 3 ', f, sep="")), silent = TRUE)
    if (is.data.table(x2)) test(1105, x1[a != 3], x2)
  } else {
    test(1105, x1[a != 3], fread(paste('grep -v 3 ', f, sep="")))
  }
} else {
  # x2 <- fread(paste('more ', f, sep=""))
  # Doesn't work on winbuilder. Relies on 'more' available in DOS via Cygwin?
  # Error:
  # Syntax error: end of file unexpected (expecting ")")
  # Error: (converted from warning) running command 'sh.exe -c (more D:\temp\RtmpgB8D2P\file1ed828a511cd) > D:\temp\RtmpgB8D2P\file1ed84f9f44f8' had status 2
  # test(1105, x1, x2)
}
unlink(f)

# test for "key" argument of [.data.table
#x1 <- data.table(a = c(1:5), b = c(5:1))
#x1[J(2), key = 'a']
#test(1106, key(x1) == 'a')
#x1[, a, key = NULL]
#test(1107, is.null(key(x1)))

# test that eval works inside expressions
DT <- data.table(a = c(1:5))
s <- quote(a)
test(1108, DT[, sum(eval(s))], DT[, sum(a)])

# test that boolean expression does not trigger a not-join
DT <- data.table(a = 1:3, b = c(TRUE,FALSE,NA))
test(1109, DT[b != TRUE], DT[!(b == TRUE)])

# commented for now (by Arun)
# # test the speed of simple comparison
# DT <- data.table(a = 1:1e7)
# t1 = system.time(DT[a == 100])[3]
# t2 = system.time(DT[which(a == 100)])[3]
# # make sure we're at most 30% slower than "which" (should pass most of the time)
# test(1110, (t1 - t2)/t2 < 0.3)

# test that a column named list is ok (this also affects other functions in by, might be worth adding a test for that)
DT <- data.table(list = 1:6, a = 1:2)
test(1111, DT[, lapply(.SD, sum), by = a], DT[, list(list = sum(list)), by = a])

# fix for #4995. "rbind" retains key when the first argument isn't a data.table (.rbind.data.table is never run is the issue)
DT <- data.table(name=c('Guff','Aw'),id=101:102,id2=1:2,key='id')
y <- rbind(list('No','NON',0L),DT,list('Extra','XTR',3L))
test(1112, key(y), NULL)

# fix for http://stackoverflow.com/questions/14753411/why-does-data-table-lose-class-definition-in-sd-after-group-by
# where, .SD loses class information.
format.myclass <- function(x, ...) {
  paste("!!", x, "!!", sep = "")
}
DT <- data.table(L = rep(letters[1:3],3), N = 1:9)
setattr(DT$N, "class", "myclass")
test(1113, class(DT[, .SD, by = L]$N), class(DT$N))
setkey(DT, L)
test(1114, class(DT[, .SD, by = L]$N), class(DT$N))
test(1115, class(DT[J(unique(L)), .SD, by=.EACHI]$N), class(DT$N))

# Fix for #4994 - not-join quoted expression didn't work...
dt <- data.table(a = 1:2, key = 'a')
dt1 <- data.table(a = 1)
expr <- quote(!dt1)
test(1116, dt[eval(expr)], dt[2])
expr <- quote(!1)
test(1117, dt[eval(expr)], dt[2])

# Fix for #2381 - optimisation of `DT[, lapply(.SD, function(x) FUN(x, bla)), by=key(DT)]` where "bla" is a column in DT didn't work.
# The anonymous-function form and the direct-argument form of lapply(.SD, ...)
# must agree when an extra argument (b2) is itself a column.
set.seed(45)
dt <- data.table(x=rep(1:4, each=4), b1=sample(16), b2=runif(16))
setkey(dt, x)
test(1118,
     dt[, lapply(.SD, function(y) weighted.mean(y, b2, na.rm=TRUE)), by=x],
     dt[, lapply(.SD, weighted.mean, b2, na.rm=TRUE), by=x])

# a(nother) test of #295
DT <- data.table(x=5:1, y=1:5, key="y")
test(1119, is.null(key(DT[, list(z = y, y = 1/y)])))

## various ordered factor rbind tests
DT  <- data.table(ordered('a', levels = c('a','b','c')))
DT1 <- data.table(factor('a', levels = c('b','a','f')))
DT2 <- data.table(ordered('b', levels = c('b','d','c')))
DT3 <- data.table(c('foo', 'bar'))
DT4 <- data.table(ordered('a', levels = c('b', 'a')))

test(1120, rbind(DT, DT1, DT2, DT3),
     data.table(ordered(c('a','a','b', 'foo', 'bar'), levels = c('a','b','d','c','f', 'foo', 'bar'))))
test(1121, rbindlist(list(DT, DT1, DT2, DT3)),
     data.table(ordered(c('a','a','b', 'foo', 'bar'), levels = c('a','b','d','c','f', 'foo', 'bar'))))
test(1122, rbind(DT, DT4),
     data.table(factor(c('a','a'), levels = c('a','b','c'))),
     warning="ordered factor levels cannot be combined, going to convert to simple factor instead")
test(1123, rbindlist(list(DT, DT4)),
     data.table(factor(c('a','a'), levels = c('a','b','c'))),
     warning="ordered factor levels cannot be combined, going to convert to simple factor instead")
test(1124, rbind(DT1, DT1), data.table(factor(c('a','a'), levels = c('b','a','f'))))
test(1125, rbindlist(list(DT1, DT1)), data.table(factor(c('a','a'), levels = c('b','a','f'))))

## test rbind(..., fill = TRUE)
DT  <- data.table(a = 1:2, b = 1:2)
DT1 <- data.table(a = 3:4, c = 1:2)
test(1126, rbind(DT, DT1, fill = TRUE), data.table(a = 1:4, b = c(1:2, NA, NA), c = c(NA, NA, 1:2)))

## check for #4959 - rbind'ing empty data.table's
DT <- data.table(a=character())
#test(1127, rbind(DT, DT), DT)

## check for #5005
DT <- data.table(a=0:2,b=3:5,key="a")
test(1128, DT[, (function(){b})()], DT[, b])

## Fix for FR #4867
DT <- data.table(x=1:5, y=6:10)
test(1129.1, DT[, as.factor(c("x", "y")), with=FALSE], DT)
# Second half of FR #4867: a factor passed as j with with=FALSE selects columns
# by name, and a repeated name selects the column twice.
test(1129.2, DT[, as.factor(c("x", "x")), with=FALSE], DT[, list(x, x)])

# Fix for a specific case that results in error in `construct` function in data.table.R (found and fixed during #5007 bug fix)
MyValueIsTen <- 10
set.seed(1)
DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6))
cols <- c("Value1", "Value2")
DT2 <- copy(DT)
test(1130, DT[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID], DT2[, (cols) := 10])

# Fix for #5007 - The value MyValueIsTen = 10 was never recognised (value within the function environment)
# The global is deliberately set to 5 so that picking it up instead of the
# function-local 10 would make the comparison below fail.
MyValueIsTen <- 5
set.seed(1)
DT <- data.table(ID=sample(LETTERS[1:3], 6, TRUE), Value1=rnorm(6), Value2=runif(6))
My_Fun <- function(x=copy(DT)) {
  MyValueIsTen <- 10
  cols <- c("Value1", "Value2")
  x[, (cols) := lapply(.SD, function(x) MyValueIsTen), by=ID]
}
DT[, (cols) := 10]
test(1131, My_Fun(), DT)

# Test for #4957 - where `j` doesn't know `.N` when used with `lapply(.SD, function(x) ...)`
test(1132, DT[, lapply(.SD, function(x) .N), by=ID], data.table(ID=c("A", "B", "C"), Value1=2L, Value2=2L))

# Test for #4990 - `:=` does not generate recycling warning during 'by':
DT <- data.table(x=c(1,1,1,1,1,2,2))
# on a new column
test(1133.1, DT[, new := c(1,2), by=x],
     data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)),
     warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'")
# on an already existing column
test(1133.2, DT[, new := c(1,2), by=x],
     data.table(x=c(1,1,1,1,1,2,2), new=c(1,2,1,2,1,1,2)),
     warning="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'")

# Fix for FR #2496 - catch `{` in `:=` expression in `j`:
DT <- data.table(x=c("A", "A", "B", "B"), val =1:4)
DT2 <- copy(DT)[, a := 1L]
test(1134.1, DT[, {a := 1L}], DT2)
test(1134.2, DT[, {a := 1L; NULL}], error="You have wrapped.*which is ok.*Consider")
test(1134.3, DT[, {b := 2L}, by=x], DT2[, b:=2L, by=x])
# was a second 1134.3; renumbered so each test in this group has its own id
test(1134.4, DT[, {b := 2L; sum(val)}, by=x], error="You have wrapped.*which is ok.*Consider")

# fix for bug
#5069 if ("package:gdata" %in% search()) { DT <- data.table(a = c('asdfasdf','asdf','asdgasdgasdgasdg','sdg'), b = runif(4,0,1)) test(1135, write.fwf(DT, f<-tempfile()), NULL) unlink(f) } # FR #2693 and Gabor's suggestions from here: http://r.789695.n4.nabble.com/Problem-with-FAQ-2-8-tt4668878.html (correcting software according to FAQ 2.8) d1 <- data.table(id1 = c(1L, 2L, 2L, 3L), val = 1:4, key = "id1") d2 <- data.table(id2 = c(1L, 2L, 4L), val2 = c(11, 12, 14),key = "id2") d3 <- copy(d2) setnames(d3, names(d1)) test(1136.1, d1[d2, id1], INT(1,2,2,4)) test(1136.2, d1[d2, id1], d1[d2][,id1]) test(1136.3, d1[d2, id2], INT(1,2,2,4)) test(1136.4, d1[d2, id2], d1[d2, list(id1,id2,val,val2)][,id2]) test(1136.5, d1[d3, i.id1], INT(1,2,2,4)) test(1136.6, d1[d3, i.id1], d1[d3, list(id1,i.id1)][,i.id1]) test(1136.7, d1[d2, val], c(1:3, NA)) test(1136.8, d1[d2, val2], c(11,12,12,14)) test(1136.9, d1[d3, list(id1, val, i.val)], data.table(id1=INT(1,2,2,4), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1")) test(1136.11, d1[d3, list(id1, i.id1, val, i.val)], data.table(id1=INT(1,2,2,4), i.id1=INT(1,2,2,4), val=c(1:3, NA), i.val=c(11,12,12,14), key="id1")) test(1136.12, d1[d2], data.table(id1=INT(1,2,2,4), val=c(1:3, NA), val2=c(11,12,12,14), key="id1")) test(1136.13, d1[J(2), id1], INT(2,2)) test(1136.14, d1[J(2), i.id1], error="not found") DT <- data.table(x=c("A", "A", "C", "C"), y=1:4, key="x") test(1136.15, DT["C", i.x], error="not found") # test for FR #4979 DT <- data.table(x=1:5, y=6:10, z=11:15) test(1137.1, DT[, .SD, .SDcols=-1L], DT[, 2:3, with=FALSE]) test(1137.2, DT[, .SD, .SDcols=-(1:2)], DT[, 3, with=FALSE]) test(1137.3, DT[, .SD, .SDcols=-"y"], DT[, c(1,3), with=FALSE]) test(1137.4, DT[, .SD, .SDcols=-c("y", "x")], DT[, 3, with=FALSE]) test(1137.5, DT[, .SD, .SDcols=-which(names(DT) %in% c("x", "y", "z"))], null.data.table()) test(1137.6, DT[, .SD, .SDcols=c(1, -2)], error=".SDcols is numeric but has both") test(1137.7, DT[, .SD, .SDcols=c("x", -"y")], 
error="invalid argument to unary") test(1137.8, DT[, .SD, .SDcols=c(-1, "x")], error="Some items of .SDcols are") DT <- data.table(x=1:5, y=6:10, z=11:15, zz=letters[1:5]) test(1137.9, DT[, .SD, .SDcols=-grep("^z", names(DT))], DT[, 1:2, with=FALSE]) test(1137.10, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE)], DT[, 1:2, with=FALSE]) test(1137.11, DT[, .SD, .SDcols=-grep("^z", names(DT), value=TRUE, invert=TRUE)], DT[, 3:4, with=FALSE]) set.seed(45) DT = data.table(x=c("A", "A", "C", "C"), y=1:4, z=runif(4)) test(1137.12, DT[, lapply(.SD, sum), by=x, .SDcols=-"y"], DT[, lapply(.SD, sum), by=x, .SDcols="z"]) # test for FR #353 / R-Forge #5020 - print.data.table gets new argument "row.names", default=TRUE. if FALSE, the row-names don't get printed # Thanks to Eddi for `capture.output` function! DT <- data.table(x=1:5, y=6:10) test(1138.1, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10")) DT <- data.table(x=1:101, y=6:106) # bug described in #1307 test(1138.2, capture.output(print(DT, row.names=FALSE)), c(" x y", " 1 6", " 2 7", " 3 8", " 4 9", " 5 10", "--- ", " 97 102", " 98 103", " 99 104", " 100 105", " 101 106")) # test for FR #2591 (format.data.table issue with column of class "formula") DT <- data.table(x=c(a~b, c~d+e), y=1:2) test(1139, capture.output(print(DT)), c(" x y", "1: a ~ b 1", "2: c ~ d + e 2")) # FR #4813 - provide warnings if there are remainders for both as.data.table.list(.) and data.table(.) 
# Recycling with a remainder: the shorter item is recycled and a warning names it
X = list(a = 1:2, b = 1:3)
test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 is of size 2 but maximum")
test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 is of size 2 but maximum")
test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 is of size 2 but maximum")
test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 is of size 2 but maximum")

# Fix for bug #5098 - DT[, foo()] returns function definition.
DT <- data.table(a=1:2)
foo <- function() sum(1:5)
test(1142, DT[, foo()], 15L)

# Fix for bug #5106 - DT[, .N, by=y] was slow when "y" is not a column in DT
DT <- data.table(x=sample.int(10, 1e6, replace=TRUE))
y <- DT$x
te1 <- system.time(ans1 <- DT[, .N, by=x])[["elapsed"]]
te2 <- system.time(ans2 <- DT[, .N, by=y])[["elapsed"]]
test(1143.1, ans1, setnames(ans2, "y", "x"))
# Wall-clock comparison: elapsed time depends on machine load, so (as for test 1241.2)
# it only runs in development testing and not on CRAN.
if (.devtesting) test(1143.2, abs(te1-te2) < 1, TRUE)

# Fix for bug #5104 - side-effect of fixing #2531 - `:=` with grouping (by) and assigning factor columns
DT <- data.table(x=c(1,1,1,2,2), y=factor(letters[1:5]))
test(1144.1, DT[, z := y, by=x], data.table(x=c(1,1,1,2,2), y=factor(letters[1:5]), z=factor(letters[1:5])))

# Added 3 more tests to close bug #5437 - partial regression due to recent changes (in 1.9.2)
# This should catch any attributes being lost hereafter.
DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2))
test(1144.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - ")))
DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L))
test(1144.4, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x))
ans = copy(DT)
test(1144.3, DT[,`:=`(lx=tail(x,1L)), by=y], ans[, lx := x])

# FR #2356 - retain names of named vector as column with keep.rownames=TRUE
# These were numbered 1144.1-1144.5, colliding with the ids of the tests just above;
# they now use 1144.5-1144.9 so every id in the 1144 family is unique.
x <- 1:5
setattr(x, 'names', letters[1:5])
test(1144.5, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.numeric(x)
setattr(x, 'names', letters[1:5])
test(1144.6, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.character(x)
setattr(x, 'names', letters[1:5])
test(1144.7, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.factor(x)
setattr(x, 'names', letters[1:5])
test(1144.8, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))
x <- as.Date(1:5, origin="2013-01-01")
setattr(x, 'names', letters[1:5])
test(1144.9, as.data.table(x, keep=TRUE), data.table(rn=names(x), x=unname(x)))

# Fix for bug #5114 - .data.table.locked ISSUE
DT <- data.table(x=1:5, y=6:10)
xx <- DT[, .SD, .SDcols="y"]
test(1145, xx[, y := as.numeric(y)], data.table(y = as.numeric(6:10)))

# Fix for bug #5115 - set not adding columns on class that builds on data.table
DT <- as.data.table(BOD)
ans = copy(DT)[, Time := as.numeric(Time)]
setattr(DT, "class", c("myclass", class(DT)))
setattr(ans, 'class', class(DT))
test(1146.1, DT[, Time:= as.numeric(Time)], ans)
DF <- as.data.frame(DT)
test(1146.2, {set(DF, i=NULL, j=1L, value=seq_len(nrow(DF)));setattr(DF,"reference",NULL);DF}, data.frame(Time=1:nrow(BOD), demand=BOD$demand))
test(1146.3, set(DF, i=NULL, j="bla", value=seq_len(nrow(DF))), error="set() on a data.frame is for changing existing columns, not adding new ones. Please use a data.table for that.")

if (.Machine$sizeof.longdouble == 16) {
  # To not run on CRAN's solaris-sparc 32bit where sizeof.longdouble==0
  old = getNumericRounding()   # restored at the end of this block
  set.seed(6)
  x = rnorm(1e6)*1e4
  ans = base::sort.list(x, method="shell")
  setNumericRounding(0)
  test(1147.1, ans, forderv(x))
  setNumericRounding(1)
  test(1147.2, ans, forderv(x))
  setNumericRounding(2)
  test(1147.3, sum(ans != forderv(x)), 2)
  tol = 3.000214e-13
  x = c(8, NaN, Inf, -7.18918, 5.18909+0.07*tol, NA, -7.18918111, -Inf, NA, 5.18909, NaN, 5.18909-1.2*tol, 5.18909-0.04*tol)
  # cat(data.table:::binary(x[c(5,10,12,13)]),sep="\n")
  # 0 10000000001 010011000001101000001100111100011000 00000000 11000000
  # 0 10000000001 010011000001101000001100111100011000 00000000 10101000
  # 0 10000000001 010011000001101000001100111100010111 11111111 00010011
  # 0 10000000001 010011000001101000001100111100011000 00000000 10011010
  setNumericRounding(0)
  test(1147.4, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 12, 13, 10, 5, 1, 3))
  setNumericRounding(1)
  test(1147.5, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 12, 5, 10, 13, 1, 3))
  setNumericRounding(2)
  test(1147.6, forderv(x), INT(6, 9, 2, 11, 8, 7, 4, 5, 10, 12, 13, 1, 3))  # rounds item 12 at bit 48 doesn't just truncate
  setNumericRounding(old)
}

# zero-length input gives a zero-length ordering
test(1149.1, forderv(integer(0)), integer(0))
test(1149.2, forderv(numeric(0)), integer(0))

# test uniqlengths
set.seed(45)
x <- sample(c(NA_integer_, 1:1e5), 1e7, TRUE)
ox <- forderv(x)
o1 <- uniqlist(list(x), ox)
test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
o1 <- uniqlist(list(x))
test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))

# #5190 fix - grouping with .SDcols gave "symbol not subsettable error" - consequence of FR #4979 implementation
dt = data.table(grp = sample(letters[1:3],20, replace = TRUE), v1 = rnorm(20), v2 = rnorm(20))
sd.cols <- "v1"
test(1152, dt[, lapply(.SD, mean), by=grp, .SDcols=sd.cols], dt[, list(v1=mean(v1)), by=grp])

# #5171 fix - setattr attribute non-character led to segfault
x <- FALSE
test(1153, setattr(x, FALSE, FALSE), error="Attribute name must be")

# Fixed binary search capabilities for NA (for int and double) and NaN (for double):
set.seed(1)
DT <- data.table(x=sample(c(NA, NaN, Inf, 1:10), 100, TRUE), y=sample(c(NA, 1:10), 100, TRUE), z=sample(c(NA_character_, letters[1:10]), 100, TRUE))
setkey(DT, x)
test(1154.1, DT[J(NaN)], DT[is.nan(x)])
test(1154.2, DT[J(NA_real_)], DT[is.na(x) & !is.nan(x)])
setkey(DT, y)
test(1154.3, setcolorder(DT[J(NA_integer_)], c("x", "y", "z")), DT[is.na(y)])
setkey(DT, z)
test(1154.4, setcolorder(DT[J(NA_character_)], c("x", "y", "z")), DT[is.na(z)])

# Fixing the binary search above for NA/NaN also fixes BUG #4918
dt1 <- data.table(x = c('red','orange','green'), y=c(1,2,NA), key='y')
dt2 <- data.table(y = c(1,2,3,NA), z = c('a','b','c','missing data'), key='y')
test(1155.1, merge(dt1, dt2, by=c('y')), data.table(y=dt1$y, x=dt1$x, z=dt2$z[1:3], key="y"))
test(1155.2, dt2[dt1], data.table(y=dt1$y, z=dt2$z[1:3], x=dt1$x, key="y"))
test(1155.3, dt1[dt2, nomatch=0L], data.table(x=dt1$x, y=dt1$y, z=dt2$z[1:3], key="y"))

# NaN wasn't properly searched for in some cases. Fixed that. Here's the fix!
dt <- structure(list(x = c(NaN, NaN, NaN, NaN, NaN, NA, NA, -3, -3, -3, -2, -2, -1, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3), y = c(16L, 25L, 23L, 17L, 21L, 11L, 13L, 15L, 1L, 6L, 4L, 18L, 7L, 3L, 12L, 24L, 2L, 10L, 20L, 14L, 9L, 19L, 8L, 22L, 5L)), .Names = c("x", "y"), row.names = c(NA, -25L), class = c("data.table", "data.frame" ))
setkey(dt, x)
test(1155.4, dt[J(NaN)], dt[is.nan(x)])
test(1155.5, dt[J(NA_real_)], dt[is.na(x) & !is.nan(x)])

# Fix for (usually small) memory leak when grouping, #2648.
# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row).
DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
before = gc()["Vcells",2]
for (i in 1:50) DT[, sum(B), by=A]
after = gc()["Vcells",2]
test(1157, after < before+3)  # +3 = 3MB
# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.

# Similar for when dogroups writes less rows than allocated, #2648.
DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
before = gc()["Vcells",2]
for (i in 1:50) DT[ , unlist(.SD), by = 'k']
after = gc()["Vcells",2]
test(1158, after < before+3)  # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024

# tests for 'setDT' - convert list, DF to DT without copy
x <- data.frame(a=1:4, b=5:8)
test(1159.1, setDT(x), data.table(a=1:4, b=5:8))
x <- list(1:4, 5:8)
test(1159.2, setDT(x), data.table(1:4, 5:8))
x <- list(a=1:4, b=5:8)
test(1159.3, setDT(x), data.table(a=1:4, b=5:8))
x <- list(a=1:4, 5:8)
test(1159.4, setDT(x), setnames(data.table(1:4, 5:8), c("a", "V1")))
x <- data.table(a=1:4, b=5:8)
test(1159.5, setDT(x), data.table(a=1:4, b=5:8))
x <- 1:5
test(1159.6, setDT(x), error="Argument 'x' to 'setDT' should be a")
x <- list(1, 2:3)
test(1159.7, setDT(x), error="All elements in argument 'x' to 'setDT'")

# tests for setrev - each case compares the in-place result against base rev()
x <- sample(10)
y <- rev(x)
setrev(x)
test(1160.1, y, x)
x <- sample(c(1:10, NA), 21, TRUE)
y <- rev(x)
setrev(x)
test(1160.2, y, x)
x <- sample(runif(10))
y <- rev(x)
setrev(x)
test(1160.3, y, x)
x <- sample(c(runif(10), NA, NaN), 21, TRUE)
y <- rev(x)
setrev(x)
test(1160.4, y, x)
x <- sample(letters)
y <- rev(x)
setrev(x)
test(1160.5, y, x)
x <- as.logical(sample(0:1, 20, TRUE))
y <- rev(x)
setrev(x)
test(1160.6, y, x)
x <- list(1:10)
test(1160.7, setrev(x), error="Input 'x' must be a vector")

# tests for setreordervec - each case compares the in-place result against x[o]
# integer
x <- sample(c(-10:10, NA), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.1, x[o], y)
# numeric
x <- sample(c(NA, rnorm(10)), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.2, x[o], y)
# character
x <- sample(c(NA, letters), 100, TRUE)
o <- base::order(x, na.last=FALSE)
y <- copy(x)
setreordervec(y, o)
test(1161.3, x[o], y)

# tests for is.sorted, and for forderv returning a zero-length result on keyed columns
DT <- data.table(x=sample(c(NA, -10:10), 2e2, TRUE), y=sample(c(NA, NaN, -Inf, Inf, -10:10), 2e2, TRUE), z=sample(c(NA, letters), 2e2, TRUE))
# when not sorted, should return FALSE
test(1162.1, is.sorted(DT[[1L]]), FALSE)
setkey(DT, x)
test(1162.2, is.sorted(DT[[1L]]), TRUE)
test(1162.3, is.sorted(DT[[2L]]), FALSE)
setkey(DT, y)
test(1162.4, is.sorted(DT[[2L]]), TRUE)
test(1162.5, is.sorted(DT[[3L]]), FALSE)
setkey(DT, z)
test(1162.6, is.sorted(DT[[3L]]), TRUE)
setkey(DT, x, y)
test(1162.7, length(forderv(DT, by=1:2)), 0)
setkey(DT, x, z)
test(1162.8, length(forderv(DT, by=c(1L, 3L))), 0)
setkey(DT, y, z)
test(1162.9, length(forderv(DT, by=2:3)), 0)
setkey(DT)
# test number 1162.10 skipped because if it fails it confusingly prints out as 1162.1 not 1162.10
test(1162.11, length(forderv(DT, by=1:3)), 0)
test(1162.12, is.sorted(DT, by=1:3), TRUE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted")
test(1162.13, is.sorted(DT, by=2:1), FALSE, warning="Use.*forderv.*for efficiency in one step, so you have o as well if not sorted")

# FR #5152 - last on length=0 arguments
x <- character(0)
test(1163, last(x), character(0))

# Test 1164 was a non-ASCII test, now in DtNonAsciiTests

# Bug fix for #5117 - segfault when rbindlist on empty data.tables
x <- as.data.table(BOD)
y <- copy(x)
test(1165, x[Time>100], rbindlist(list(x[Time > 100], y[Time > 200])))

# Bug fix for the #5300 - rbind(DT, NULL) should not result in error, but BOD has an attribute as well, which won't be preserved (due to C-impl). Changing test.
setattr(x <- as.data.table(BOD), 'reference', NULL) test(1166, x, rbind(x, NULL)) # fix for bug #5307 - ordering with multiple columns in which at least one of them is a logical column foo = data.table(a=rep(c(0L,1L,0L,1L),2), b=rep(c(TRUE,TRUE,FALSE,FALSE),2), c=1L) test(1167, foo[, .N, by=list(b,a)], data.table(b=c(TRUE, TRUE, FALSE, FALSE), a=c(0L,1L,0L,1L), N=2L)) # fix for bug #5355 - rbindlist with factor columns and empty data.tables resulted in error. A <- data.table(x=factor(1), key='x') B <- data.table(x=factor(), key='x') test(1168.1, rbindlist(list(B,A)), data.table(x=factor(1))) # fix for bug #5120, it's related to rbind and factors as well - more or less similar to 1168.1 (#5355). Seems to have been fixed with that commit. Just adding test here. tmp1 <- as.data.table(structure(list(Year = 2013L, Maturity = structure(1L, .Label = c("<1", "1.0 - 1.5", "1.5 - 2.0", "2.0 - 2.5", "2.5 - 3.0", "3.0 - 4.0", "4.0 - 5.0", ">5.0"), class = "factor"), Quality = structure(2L, .Label = c(">BBB", "BBB", "BB", "B", "CCC", "700 tests so far - without NaN/NA ########################################################################################### # - Generate a random seed each time; the randomness allows catching errors quicker # - But save the seed so that we can generate the same data back if any error occurs seed = as.integer(Sys.time()) # sample(9999L, 1L) temporary fix, because all the set.seed(.) 
used above makes this sample() step deterministic (always seed=9107) seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") # no NaN (because it's hard to match with base:::order) ## TODO: add tests with NaN set.seed(seed) foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") i1 = as.integer(sample(c(-100:100), 1e3, TRUE)) i2 = as.integer(sample(c(-100:100, -1e6, 1e6), 1e3, TRUE)) d1 = as.numeric(sample(c(-100:100,Inf,-Inf), 1e3, TRUE)) d2 = as.numeric(rnorm(1e3)) c1 = sample(c(letters), 1e3, TRUE) c2 = sample(foo(200), 1e3, TRUE) DT = data.table(i1, i2, d1, d2, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") ans = vector("list", length(names(DT))) test_no = 1223.0 oldnfail = nfail for (i in seq_along(names(DT))) { cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) ans[[i]] = combn(names(DT), i, function(x) { tmp = apply(cj, 1, function(y) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { if (y[j] == 1L) as.name(x[j]) else { if (class(DT[[x[j]]]) =="character") as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), as.name(x[j]))) } }) ) ) test(test_no, forderv(DT, by=x, order=y), with(DT, eval(ll))) }) dim(tmp)=NULL list(tmp) }) } ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce # fix for bug #5405 - unique on null data.table should return null data.table test(1224, unique(data.table(NULL)), data.table(NULL)) # forderv should return 'integer(0)' when 'x' is not atomic and of 0 length (to be consistent with base:::order) test(1225.1, forderv(list()), integer(0)) test(1225.2, forderv(data.table(NULL)), integer(0)) # fix for bug #5377 - data.table(null list, data.frame, data.table) should return null data.table test(1226.1, 
data.table(list()), null.data.table()) test(1226.2, data.table(data.frame(NULL)), null.data.table()) test(1226.3, data.table(data.table(NULL)), null.data.table()) test(1226.4, data.table(data.frame()), null.data.table()) test(1226.5, data.table(data.table()), null.data.table()) # fix for bug #5321 - POSIXlt issue. setDT(DT1 <- data.frame(id=1:3, d=strptime(c("06:02:36", "06:02:48", "07:03:12"), "%H:%M:%S"))) test(1227, data.table(id=1:3, d=strptime(c("06:02:36", "06:02:48", "07:03:12"), "%H:%M:%S")), DT1, warning="POSIXlt column type detected and converted to") # fix for bug #5296 - retaining class of original data.table after passing through `[.data.table` DT <- data.table(a=1:2,b=3:4) setattr(DT, "class", c("newclass", class(DT))) test(1228.1, class(DT), class(DT[a>1])) test(1228.2, class(DT), class(DT[, list(b)])) test(1228.3, class(DT), class(DT[, "b", with=FALSE])) test(1228.4, class(DT), class(DT[, sum(b), by=a])) test(1228.5, class(DT), class(DT[a>1, sum(b), by=a])) test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # test 1229 was non-ASCII, now in package DtNonAsciiTests # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") options(datatable.optimize=1) # turn off GForce, to test dogroups test(1230, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) test(1231, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") test(1232, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") test(1233, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") test(1234, DT[, sum(a), by=b, verbose=TRUE], output="collecting discontiguous groups") setkey(DT,a) test(1235, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") test(1236, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") 
test(1237,
     DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE],
     data.table(a=2:3,V1=c(7L,9L),key="a"),
     output = "memcpy contiguous groups")
options(datatable.optimize = Inf)

# check that key is not preserved when length of fastorder is > 0
DT = data.table(x = 1:5, y = 6:10, key = "x")
test(1238.1, key(setorder(DT, x)), "x")
test(1238.2, key(setorder(DT, -x)), NULL)

# Fix for bug #5366 - setkey fails when non-key columns are of type list.
DT = data.table(x = 5:1, y = as.list(1:5))
test(1239.1,
     setkey(DT, x),
     setattr(data.table(x = 1:5, y = as.list(5:1)), "sorted", "x"))
# fresh, unkeyed copy for the setorder() variant
DT = data.table(x = 5:1, y = as.list(1:5))
test(1239.2,
     setorder(DT, x),
     data.table(x = 1:5, y = as.list(5:1)))

# Fix for bug #5408 - order of as.data.table.table is different when doing as.data.table(with(DT, table(x,y)))
set.seed(123)
DT = data.table(XX = sample(LETTERS[1:5], 1000, replace = TRUE),
                yy = sample(1:5, 1000, replace = TRUE))
ans1 = as.data.table(DT[, table(XX, yy)])
ans2 = as.data.table(table(DT$XX, DT$yy))
# align column names with those of the as.data.frame() reference before comparing
setnames(ans1, "N", "Freq")
setnames(ans2, names(ans1))
test(1240.1,
     ans1,
     setDT(as.data.frame(with(DT, table(XX, yy)), stringsAsFactors = FALSE)))
test(1240.2, ans2, ans1)

# R 3.3.0 started to use data.table's radix sort by default for order() on integer/factors.
# Therefore we check against the non-data.table method ('shell') for correctness (otherwise we'd be
# checking data.table code against itself) as well as checking data.table's ported code in R;
# i.e. a three-way match.
if (base::getRversion() < "3.3.0") { base_order <- base::order } else { base_order <- function(..., na.last=TRUE, method=c("shell","radix")) { ans1 = base::order(..., na.last=na.last, method="shell") if (!is.na(na.last) || base::getRversion()>"3.3.3") { ans2 = base::order(..., na.last=na.last, method="radix") if (!identical(ans1,ans2)) stop("Base R's order(,method='shell') != order(,method='radix')") } else { # Only when na.last=NA in just R 3.3.0-3.3.3 we don't check shell==radix # because there was a problem in base R's port of data.table code then when : # 1) 2 or more vectors were passed to base::order(,method="radix") # AND 2) na.last=NA # AND 3) there is a subgroup of size exactly 2 # AND 4) one of those 2 items in the subgroup is NA and the other is not NA # See tests 1728.3 and 1728.13. } ans1 } } # Test for optimisation of 'order' to 'forder' set.seed(45L) DT <- data.table(x=sample(1e2, 1e6,TRUE), y=sample(1e2, 1e6,TRUE)) old = options(datatable.optimize=Inf) t1 = system.time(ans1 <- DT[order(x,-y)])[['elapsed']] # optimized to forder() t2 = system.time(ans2 <- DT[base_order(x,-y)])[['elapsed']] # not optimized test(1241.1, ans1, ans2) if (.devtesting) test(1241.2, t1 < t2+0.1) # 0.2 < 3.8 on Matt's laptop seems safe enough to test. # Even so, 1241.2 has been known to fail, perhaps if system swaps and this R sessions pauses or something? # We shouldn't have timing tests here that run on CRAN for this reason. Hence wrapping with .devtesting options(old) DT = data.table(a=1:3, b=4:6) myCol = "a" test(1242.1, DT[2,myCol:=6L,with=FALSE], data.table(a=INT(1,6,3), b=4:6), warning="with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. 
Please") test(1242.2, DT[2,(myCol):=7L], data.table(a=INT(1,7,3), b=4:6)) # consistency of output type of mult, #5378 DT = data.table(id=rep(1:2,each=2), var=rnorm(4), key="id") test(1243, DT[.(1:2), list(var)][c(2,4)], DT[.(1:2), list(var), mult="last"]) test(1244, DT[.(1:2), var], DT$var) test(1245, DT[.(1:2), var][c(2,4)], DT[.(1:2), var, mult="last"]) ############################################# # FR #5205 - fromLast argument to duplicated ############################################# seed = as.integer(Sys.time()) seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") set.seed(seed) DT <- data.table(w=sample(-5:5, 100, TRUE), x=as.numeric(sample(-5:5, 100, TRUE)), y=sample(paste("id", 1:10, sep=""), 100, TRUE), z=sample(c(TRUE, FALSE), 100, TRUE)) colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") test_no = 1246.0 oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { test_no <<- signif(test_no+.01, 7) # first without key test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) test_no <<- signif(test_no+.01, 7) setkeyv(DT, jj) # with key test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce # with NA DT <- data.table(w=sample(c(-5:5,NA_integer_), 100, TRUE), x=as.numeric(sample(c(-5:5, NA), 100, TRUE)), y=sample(c(NA, paste("id", 1:10, sep="")), 100, TRUE), z=sample(c(NA, TRUE, FALSE), 100, TRUE)) colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { test_no <<- signif(test_no+.01, 7) # first without key test(test_no, duplicated(DT, 
by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) test_no <<- signif(test_no+.01, 7) setkeyv(DT, jj) # with key test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce # FR #5172 - anyDuplicated.data.table set.seed(45L) dt <- data.table(x=sample(3,10,TRUE), y=sample(letters[1:3], 10,TRUE)) test(1247.1, anyDuplicated(dt), anyDuplicated.data.frame(dt)) test(1247.2, anyDuplicated(dt, fromLast=TRUE), anyDuplicated.data.frame(dt, fromLast=TRUE)) test(1247.3, anyDuplicated(dt, by="y"), anyDuplicated.data.frame(dt[, "y", with=FALSE])) test(1247.4, anyDuplicated(dt, by="y", fromLast=TRUE), anyDuplicated.data.frame(dt[, "y", with=FALSE], fromLast=TRUE)) # Fix for #5423 - j-expression y * eval(parse(..)) should work without needing "(" DT <- data.table(x = seq(1,10,1), y = seq(2,20,2)) test(1248.1, DT[, y := y * eval(parse(text="1*2"))], data.table(x=seq(1,10,1), y=seq(4,40,4))) # fix in 1248 was not complete. resurfaced again as bug #5527. Fixed now, test added here below: DT <- data.table(id=1:5, var=letters[1:5]) ans <- copy(DT) idPrefix <- "va" # if this variable were named 'id' then the paste(id) below would see the 'id' _column_. 
test(1248.2,
     DT[, eval(parse(text=paste(idPrefix,"r",sep="")))],
     letters[1:5])
test(1248.3,
     DT[, id2:=eval(parse(text=paste(idPrefix,"r",sep="")))],
     ans[, id2 := var])

# DT[order(...)] on data that is already sorted must return it unchanged
# (the case where forder returns integer(0) was forgotten before)
DT = data.table(x = rep(1:4, each = 5), y = 1:20)
test(1249.1, DT[order(x)], DT)
test(1249.2, DT[order(y)], DT)
test(1249.3, DT[order(x,y)], DT)

# Fix for #5424 - duplicated 'by=FALSE' inconsistency
set.seed(1L)
DT = data.table(x = sample(3, 10, TRUE), y = sample(2, 10, TRUE), key = "x")
# error expected for a logical 'by' (tests 1250.2 and 1250.3)
bad_by = "Only NULL, column indices or column names are allowed in by"
test(1250.1, duplicated(DT, by=NULL), duplicated.data.frame(DT))
test(1250.2, duplicated(DT, by=FALSE), error = bad_by)
test(1250.3, duplicated(DT, by=TRUE), error = bad_by)

# more tests for DT[order(...)] - now testing 'decreasing=FALSE/TRUE' argument
set.seed(1L)
DT = data.table(x = sample(3, 10, TRUE), y = sample(2, 10, TRUE))
test(1251.1,
     DT[order(x,y,decreasing=TRUE)],
     DT[order(-x,-y)])
test(1251.2,
     DT[order(x,-y,decreasing=TRUE)],
     DT[order(-x,y)])
# test in case of complex calls. check out the note in setkey.R under 'forder' for differences in forder and order for 'list' inputs. base is inconsistent I find.
ix = with(DT, order(x+y)) test(1251.3, DT[order(x+y)], DT[ix]) ix = with(DT, order(-x-y)) test(1251.4, DT[order(-x-y)], DT[ix]) ix = with(DT, order(x+y, decreasing=TRUE)) test(1251.5, DT[order(x+y, decreasing=TRUE)], DT[ix]) ix = with(DT, order(4*x-5*y, decreasing=TRUE)) test(1251.6, DT[order(4*x-5*y, decreasing=TRUE)], DT[ix]) ix = with(DT, order(1-DT$x, decreasing=TRUE)) test(1251.7, DT[order(1-DT$x, decreasing=TRUE)], DT[ix]) test(1251.8, DT[order(x, list(-y), decreasing=TRUE)], error = "Column .* for ordering currently") # consistent with base (not the same error, but will error with "forder's" error instead) # more "edge cases" to ensure we're consistent with base test(1251.9, DT[order("a")], DT[1L]) test(1251.10, DT[order("b", "a")], DT[1L]) test(1251.11, DT[order(list("b", "a"))], error = "Column .* for ordering currently") test(1251.12, DT[order(list("b"), list("a"))], DT[1L]) ############################################################## # extensive tests for order optimisation within `[.data.table` ############################################################## seed = as.integer(Sys.time()) seedInfo = paste("forder decreasing argument test: seed = ", seed," ", sep="") set.seed(seed) # these variable try to simulate groups of length 1, 2, < 200, > 200 so as to cover all different internal implementations foo <- function(n) apply(matrix(sample(letters, n*8L, TRUE), ncol=8L), 1, paste, sep="") i1 = as.integer(sample(rep(c(-3:3, NA_integer_), c(1, 2, 190, 300, 7, 190, 210, 100)))) i2 = as.integer(sample(rep(c(-2:2, -1e6, 1e6, NA_integer_), c(1, 2, 190, 300, 7, 190, 210, 100)))) d1 = as.numeric(sample(rep(c(-2:2,Inf,-Inf, NA_real_, 5, -1e3), c(1, 190, 2, 300, 7, 50, 50, 100, 150, 150)))) c1 = sample(rep(c(letters[1:5], NA_character_, "z"), c(1, 2, 190, 7, 300, 200, 300))) c2 = sample(c(foo(200), NA_character_), 1e3, TRUE) DT = data.table(i1, i2, d1, c1, c2) # randomise col order as well colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = 
paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") ans = vector("list", length(names(DT))) test_no = 1253.13 oldnfail = nfail for (i in seq_along(names(DT))) { cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) ans[[i]] = combn(names(DT), i, function(x) { tmp = apply(cj, 1, function(y) { test_no <<- signif(test_no+.001, 7) ll = as.call(c(as.name("base_order"), lapply(seq_along(x), function(j) { if (y[j] == 1L) as.name(x[j]) else { if (class(DT[[x[j]]]) =="character") as.call(c(as.name("-"), as.call(list(as.name("xtfrm"), as.name(x[j]))))) else as.call(list(as.name("-"), as.name(x[j]))) } }) ) ) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA test(test_no, ans1, with(DT, eval(ll))) test_no <<- signif(test_no+.001, 7) ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. test(test_no, ans1[ans1 != 0], with(DT, eval(ll))) }) dim(tmp)=NULL list(tmp) }) } ans = NULL if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce ############### old_rounding = getNumericRounding() # turning off tolerance for UPCs (> 11 s.f. 
stored in numeric), #5369 DT <- data.table(upc = c(301426027592, 301426027593, 314775802939, 314775802940, 314775803490, 314775803491, 314775815510, 314775815511, 314933000171, 314933000172), year = 2006:2007) setNumericRounding(2L) test(1253, DT[,.N,by=upc]$N, rep.int(2L,5L)) setNumericRounding(0) test(1254, DT[,.N,by=upc], data.table(upc=DT$upc, N=1L)) test(1255, unique(DT, by="upc"), DT) setNumericRounding(2) test(1256, DT[,.N,by=upc]$N, rep.int(2L,5L)) DT = data.table(upc=rep(c(360734147771, 360734147770), each=3), year=rep(2009:2011, times=2)) setNumericRounding(0) test(1257, DT[,.N,by=upc], data.table(upc=c(360734147771, 360734147770), N=3L)) test(1258, DT[,.N,by=upc][order(upc)], data.table(upc=c(360734147770, 360734147771), N=3L)) setNumericRounding(1) test(1259, DT[,.N,by=upc], data.table(upc=c(360734147771, 360734147770), N=3L)) test(1260, DT[,.N,by=upc][order(upc)], data.table(upc=c(360734147770, 360734147771), N=3L)) test(1261, getNumericRounding(), 1L) # the limit of double precision (16 s.f.) ... if (.Machine$sizeof.longdouble==16) test(1262, length(unique(c(1.2345678901234560, 1.2345678901234561, 1.2345678901234562, 1.2345678901234563))), 2L) # 2 not 4 is double precision limit which base::unique() relies on in this test # valgrind will also return (3) instead of (2) here.. due to floating point precision limitation. changing the last two values to 1.2345678901234563 and 1.2345678901234564 returns 2. 
DT = data.table(id=c(1.234567890123450, 1.234567890123451, 1.234567890123452, 1.234567890123453)) # one less digit is limit test(1263, length(unique(DT$id)), 4L) test(1264, DT[,.N,by=id]$N, 4L) # 1 byte rounding isn't enough setNumericRounding(0) test(1265, DT[,.N,by=id]$N, INT(1,1,1,1)) test(1266, getNumericRounding(), 0L) setNumericRounding(old_rounding) # fread reading NA in logical columns, #4766 DF = data.frame(I=1:3, L=c(T,F,NA), R=3.14) write.csv(DF,f<-tempfile(),row.names=F) test(1267.1, fread(f)$L, c(TRUE, FALSE, NA)) test(1267.2, fread(f), as.data.table(read.csv(f))) unlink(f) ### FR #2722 test begins here ### ################################# # FR #2722 optimise j=c(lapply(.SD,sum, ...)) - here any amount of such lapply(.SD, ...) can occur and in any order set.seed(45L) dt <- data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) options(datatable.optimize=1L) ans2 <- dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a] options(datatable.optimize=Inf) test(1268.1, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans2, output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") options(datatable.optimize=1L) ans2 <- dt[, c(lapply(.SD, mean), .N), by=a] options(datatable.optimize=Inf) test(1268.2, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") options(datatable.optimize=1L) ans2 <- dt[, c(list(c), lapply(.SD, mean)), by=a] options(datatable.optimize=Inf) test(1268.3, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") test(1268.4, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") options(datatable.optimize=1L) ans2 <- dt[, c(sum(d), lapply(.SD, 
mean)), by=a] options(datatable.optimize=Inf) test(1268.5, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans2, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") options(datatable.optimize=1L) ans2 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a] options(datatable.optimize=Inf) test(1268.6, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans2, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") # newly added tests for #861 # optimise, but no GForce options(datatable.optimize=1L) ans2 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a] options(datatable.optimize=Inf) test(1268.7, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") # don't optimise .I in c(...) options(datatable.optimize=1L) dt = data.table(x=c(1,1,1,2,2,2), y=1:6) ans2 <- dt[, c(.I, lapply(.SD, mean)), by=x] options(datatable.optimize=Inf) test(1268.8, dt[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans2, output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") ### FR #2722 tests end here ### # Wide range numeric and integer64, to test all bits old_rounding = getNumericRounding() x = sample( c(seq(-1e100, 1e100, length=1e5), c(seq(-1e-100,1e-100,length=1e5))) ) setNumericRounding(0) test(1269, forderv(x), base::order(x)) setNumericRounding(2) # not affected by rounding test(1270, forderv(x), base::order(x)) if ("package:bit64" %in% search()) { x = as.integer64(2)^(0:62) x = sample(c(x,-x,0)) if (!inherits(try(bit64::order(x),silent=TRUE), "try-error")) # if for old version of bit64 test(1271, forderv(x), bit64::order(x)) # because GenomicRanges replaces this order DT = data.table( a=as.integer64(2)^45 + 1:3, b=1:6 ) test(1272, DT[,sum(b),by=a], data.table(a=DT$a[1:3], V1=INT(5,7,9))) test(1273, unique(DT, by="a"), DT[1:3]) test(1274, 
duplicated(DT, by="a"), rep(c(FALSE,TRUE),each=3)) setkey(DT,a) test(1275, DT[.(as.integer64(35184372088834))], DT[3:4]) test(1276, unique(DT, by=key(DT)), DT[c(1,3,5)]) test(1277, duplicated(DT, by=key(DT)), rep(c(FALSE,TRUE),3)) } setNumericRounding(old_rounding) # distinguishing small numbers from 0.0 as from v1.9.2, test from Rick # http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2 old_rounding = getNumericRounding() test_no = 1278.001 for (dround in c(0,2)) { setNumericRounding(dround) # rounding should not affect the result here because although small, it's very accurace (1 s.f.) for (i in c(-30:-1,1:30)) { DT = data.table(c(1 * (10^i),2,9999,-1,0,1)) test(test_no, nrow(DT[, .N, by=V1]), 6) test_no = test_no + 0.001 } } setNumericRounding(old_rounding) # rounding of milliseconds, workaround, TO DO: #5445 # http://stackoverflow.com/questions/22356957/rounding-milliseconds-of-posixct-in-data-table-v1-9-2-ok-in-1-8-10 old_rounding = getNumericRounding() DT = data.table(timestamp=as.POSIXct( c("2013-01-01 17:51:00.707", "2013-01-01 17:51:59.996", "2013-01-01 17:52:00.059", "2013-01-01 17:54:23.901", "2013-01-01 17:54:23.913", "2013-01-01 17:54:23.914"))) setNumericRounding(2) test(1279, duplicated(DT), rep(c(FALSE,TRUE), c(4,2))) setNumericRounding(1) test(1280, duplicated(DT), rep(FALSE, 6)) setNumericRounding(old_rounding) # FR #5465, keep.rownames argument for setDT, just for data.frames: DF <- data.frame(x=1:5, y=10:6) rownames(DF) <- letters[1:5] test(1281, setDT(DF, keep.rownames=TRUE), data.table(rn=letters[1:5], x=1:5, y=10:6)) # Bug #5415 fix - BY doesn't retain names: DT <- data.table(fruit=c("apple","peach","pear")) test(1282, DT[, ans := .BY$fruit, by=fruit], data.table(fruit=DT$fruit, ans=DT$fruit)) # bug #5443 - get() doesn't see i's columns, when i is a data.table: set.seed(1L) dt1 <- data.table(a=rep(1:2, each=2), c=sample(10,4)) dt2 <- data.table(b=rep(2:3), c=sample(20,2), 
d=sample(20,2)) setkey(dt1, a) setkey(dt2, b) # without by test(1283.1, dt1[dt2, list(a=a, c=get('c'), i.c=get('i.c'))], dt1[dt2, list(a=a, c=c, i.c=i.c)]) test(1283.2, dt1[dt2, list(a=a, d=get('d'))], dt1[dt2, list(a=a, d=d)]) # with by test(1283.3, dt1[dt2, list(a=a, c=get('c'), i.c=get('i.c')), by=.EACHI], dt1[dt2, list(a=a, c=c, i.c=i.c), by=.EACHI]) test(1283.4, dt1[dt2, list(a=a, d=get('d')), by=.EACHI], dt1[dt2, list(a=a, d=d), by=.EACHI]) # fix for bug #5583 - missed cases like dt[order(abs(x))]. dt <- data.table(x=c(1L,-2L,3L)) test(1284.1, dt[order(abs(x))], dt) test(1284.2, dt[order(-abs(x))], dt[3:1]) # fix for bug #5582 - unique/duplicated on empty data.table returned NA dt <- data.table(x=numeric(0), y=character(0), key="x") test(1285.1, duplicated(dt, by=key(dt)), duplicated.data.frame(dt)) test(1285.2, unique(dt, by=key(dt)), dt) # BUG #5672 fix a <- data.table(BOD, key="Time") b <- data.table(BOD, key="Time")[Time < 0] # zero row data.table ans <- merge(b, a, all=TRUE) test(1287, ans, data.table(Time=a$Time, demand.x=NA_real_, demand.y=a$demand, key="Time")) # more rbindlist tests - duplicate columns with "fill=TRUE" ll <- list(data.table(x=1, y=-1, x=-2), data.table(y=10, y=20, y=30, x=-10, a="a", b=Inf, c=factor(1))) test(1288.1, rbindlist(ll, use.names=TRUE, fill=FALSE), error = "Item 2 has 7 columns, inconsistent with item 1 which has 3 columns") # modified after fixing #725 test(1288.2, rbindlist(ll, use.names=TRUE, fill=TRUE), data.table(x=c(1,-10), y=c(-1,10), x=c(-2, NA), y=c(NA,20), y=c(NA,30), a=c(NA, "a"), b=c(NA, Inf), c=factor(c(NA, 1)))) # check the name of output are consistent when binding two empty dts with one empy and other non-empty dt dt1 <- data.table(x=1:5, y=6:10) dt2 <- dt1[x > 5] setnames(dt3 <- copy(dt2), c("A", "B")) test(1288.3, names(rbindlist(list(dt2,dt3))), c("x", "y")) test(1288.4, names(rbindlist(list(dt3,dt2))), c("A", "B")) test(1288.5, names(rbindlist(list(dt1,dt3))), c("x", "y")) test(1288.6, 
names(rbindlist(list(dt3,dt1))), c("A", "B")) # check fix for bug #5612 DT <- data.table(x=c(1,2,3)) test(1288.7, rbind(DT, DT, data.table()), rbind(DT, data.table(), DT)) # factor on fill=TRUE with NA column.. DT1 = data.table(A=1:3,B=letters[1:3]) DT2 = data.table(B=letters[4:5],C=factor(1:2)) l = list(DT1,DT2) test(1288.8, rbindlist(l, use.names=TRUE, fill=TRUE), data.table(A=c(1:3,NA_integer_,NA_integer_), B=letters[1:5], C=factor(c(NA,NA,NA,1,2)))) # adding more tests after modifying for better backwards compatibility: # rbindlist and rbind both work fine even when certain elements of list are not named at all, as long as fill = FALSE, but use.names=TRUE errors when all names are NULL # when fill=TRUE NO element of the list must have NULL names. ll <- list(list(1:3, 4:6), list(5:7, 8:10)) test(1288.9, rbindlist(ll), data.table(V1=c(1:3, 5:7), V2=c(4:6, 8:10))) test(1288.10, rbindlist(ll, use.names=TRUE), error="use.names=TRUE but no item of input list has any names.") ll <- list(list(a=1:3, b=4:6), list(5:7, 8:10)) test(1288.11, rbindlist(ll, use.names=TRUE), data.table(a=c(1:3, 5:7), b=c(4:6, 8:10))) ll <- list(list(1:3, 4:6), list(a=5:7, b=8:10)) test(1288.12, rbindlist(ll, use.names=TRUE), data.table(a=c(1:3, 5:7), b=c(4:6, 8:10))) ll <- list(list(a=1:3, 4:6), list(5:7, b=8:10)) test(1288.13, rbindlist(ll, use.names=TRUE), error="Answer requires 3 columns whereas one or more item(s) in the input list has only 2 columns. 
This could be because the items in the list may not") ll <- list(list(a=1:3, 4:6), list(5:7, b=8:10)) test(1288.14, rbindlist(ll, fill=TRUE), data.table(a=c(1:3, rep(NA_integer_,3L)), V1=c(4:6,5:7), b=c(rep(NA_integer_, 3L), 8:10))) ll <- list(list(1:3, 4:6), list(5:7, 8:10)) test(1288.15, rbindlist(ll, fill=TRUE), error="fill=TRUE, but names of input list at position 1") ll <- list(list(1:3, 4:6), list(a=5:7, b=8:10)) test(1288.16, rbindlist(ll, fill=TRUE), error="fill=TRUE, but names of input list at position 1") # TO DO: TODO: think of and add more tests for rbindlist # fix for #5647 dt = data.table(x=1L, y=1:10) cp = copy(dt) test(1289.1, dt[,z := c(rep(NA, 5), y), by=x], cp[, z := c(rep(NA, 5), y[1:5])], warning="RHS 1 is length 15") dt = data.table(x=c(1:2), y=1:10) cp = copy(dt) test(1289.2, dt[, z := c(rep(NA, 5),y), by=x], cp[, z := rep(NA_integer_, 10)], warning="RHS 1 is length 10") ######################################## # Extensve testing for "duplicate" names ######################################## # Rules: Basically, if index is directly given in 'j', just those columns are touched/operated on. But if 'column' names are given and there are more than one # occurrence of that column, then it's hard to decide which to keep and which to remove. So, to remove, all are removed, to keep, always the first is kept. # 1) when i,j,by are all absent (or) just 'i' is present then ALL duplicate columns are returned. # 2) When 'with=FALSE' and 'j' is a character and 'notj' is TRUE, all instances of the column to be removed will be removed. # 3) When 'with=FALSE' and 'j' is a character and 'notj' is FALSE, only the first column will be recognised in presence of duplicate columns. # 4) When 'with=FALSE' and 'j' is numeric and 'notj' is TRUE, just those indices will be removed. # 5) When 'with=FALSE' and 'j' is numeric and 'notj' is FALSE, all columns for indices given, if valid, are returned. 
(FIXES #5688) # 6) When .SD is in 'j', but '.SDcols' is not present, ALL columns are subset'd - FIXES BUG #5008. # 7) When .SD and .SDcols are present and .SDcols is numeric, columns corresponding to the given indices are returned. # 8) When .SD and .SDcols are present and .SDcols is character, duplicate column names will only return the first column, each time. # 9) When .SD and .SDcols are present and .SDcols is numeric, and it's -SDcols, then just those columns are removed. # 10) When .SD and .SDcols are present and .SDcols is character and -SDcols, then all occurrences of that object is removed. # 11) When no .SD and no .SDcols and no with=FALSE, only duplicate column names will return only the first column each time. # 12) With 'get("col")', it's the same as with all character types. # 13) A logical expression in 'j'. # 14) Finally, no tests but.. using 'by' with duplicate columns and aggregating may not return the intended result, as it may operate on column names in some cases. # All points are tested with this example: DT <- data.table(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12) DT1 <- data.table(x=1L, y=3L, x=5L, x=7L, y=9L, z=11L) DT2 <- data.table(x=2L, y=4L, x=6L, x=8L, y=10L, z=12L) ll <- list(x=1:2, y=3:4, x=5:6, x=7:8, y=9:10, z=11:12) # case (1) test(1290.1, DT[1], DT1) test(1290.2, DT[], DT) test(1290.3, DT[(TRUE)], DT) # case (2) test(1290.4, DT[, !"x", with=FALSE], as.data.table(ll[c(2,5,6)])) test(1290.5, DT[, !"y", with=FALSE], as.data.table(ll[c(1,3,4,6)])) test(1290.6, DT[, !c("x", "x"), with=FALSE], as.data.table(ll[c(2,5,6)])) test(1290.7, DT[, !c("y", "y"), with=FALSE], as.data.table(ll[c(1,3,4,6)])) # case (3) test(1290.9, DT[, "x", with=FALSE], as.data.table(ll[1])) test(1290.10, DT[, "y", with=FALSE], as.data.table(ll[2])) test(1290.11, DT[, c("x", "x"), with=FALSE], as.data.table(ll[c(1,1)])) test(1290.12, DT[, c("y", "y"), with=FALSE], as.data.table(ll[c(2,2)])) # case (4) test(1290.13, DT[, !3, with=FALSE], 
as.data.table(ll[c(1,2,4,5,6)])) test(1290.14, DT[, !c(1,1,3,4), with=FALSE], as.data.table(ll[c(2,5,6)])) test(1290.15, DT[, !2, with=FALSE], as.data.table(ll[c(1,3,4,5,6)])) test(1290.16, DT[, !c(2,5,2), with=FALSE], as.data.table(ll[c(1,3,4,6)])) # case (5) test(1290.17, DT[, 3, with=FALSE], as.data.table(ll[3])) test(1290.18, DT[, c(1,1,3,4), with=FALSE], as.data.table(ll[c(1,1,3,4)])) test(1290.19, DT[, 2, with=FALSE], as.data.table(ll[2])) test(1290.20, DT[, c(2,5,2), with=FALSE], as.data.table(ll[c(2,5,2)])) # case (6) test(1290.21, DT[, .SD], as.data.table(ll)) test(1290.22, DT[, .SD[1]], DT[1]) test(1290.23, DT[, .SD[1, !3, with=FALSE]], as.data.table(DT[1, !3, with=FALSE])) # case (7) test(1290.24, DT[, .SD, .SDcols=c(1,1,3,4)], as.data.table(ll[c(1,1,3,4)])) # case (8) test(1290.25, DT[, .SD, .SDcols=c("x", "x", "y")], as.data.table(ll[c(1,1,2)])) # case (9) test(1290.26, DT[, .SD, .SDcols=-c(1,2)], as.data.table(ll[c(-(1:2))])) # case (10) test(1290.27, DT[, .SD, .SDcols=-c("x")], as.data.table(ll[c(2,6)])) # case (11) test(1290.28, DT[, x], ll[[1]]) test(1290.29, DT[, list(x,x,y,y,y)], as.data.table(ll[c(1,1,2,2,2)])) test(1290.30, DT[, list(x,x,y)], as.data.table(ll[c(1,1,2)])) # cast (12) test(1290.31, DT[, get("x")], ll[[1]]) test(1290.32, DT[, list(get("x"))], setnames(as.data.table(ll[1]), "V1")) test(1290.33, DT[, list(get("x"), get("y"))], setnames(as.data.table(ll[1:2]), c("V1", "V2"))) # case (13) test(1290.34, DT[, names(DT) == "x", with=FALSE], as.data.table(ll[c(1,3,4)])) # Bug #5376.. DT[, bla ;= character(0), by=.] dint add new column when `DT is empty DT. 
dt1 = data.table(a=character(0),b=numeric(0)) ans1 = data.table(a=character(0), b=numeric(0), c=numeric(0)) ans2 = data.table(a=character(0), b=numeric(0), c=numeric(0), d=integer(0)) test(1291.1, dt1[, c:=max(b), by='a'], ans1, warning="no non-missing arguments to max") test(1291.2, dt1[, d := integer(0), by=a], ans2) # Bug #5714 test(1292.1, data.table(x=1:2, y=3:4)[, -(1:2), with=FALSE], null.data.table()) test(1292.2, data.table(x=1:2)[, -1, with=FALSE], null.data.table()) test(1292.3, data.table(x=1:2, y=3:4)[, !c("x","y"), with=FALSE], null.data.table()) test(1292.4, data.table(x=1:2)[, !c("x"), with=FALSE], null.data.table()) # Bug #5435 - print.data.table and digits option: DT <- structure(list(fisyr = 1995:1996, er = list(c(1, 3), c(1, 3)), eg = c(0.0197315833926059, 0.0197315833926059), esal = list( c(2329.89763779528, 2423.6811023622), c(2263.07456978967, 2354.16826003824)), fr = list(c(4, 4), c(4, 4)), fg = c(0.039310363070415, 0.039310363070415), fsal = list(c(2520.85433070866, 2520.85433070866 ), c(2448.55449330784, 2448.55449330784)), mr = list(c(5, 30), c(5, 30)), mg = c(0.0197779376457164, 0.0197779376457164 ), msal = list(c(2571.70078740157, 4215.73622047244), c(2497.94263862333, 4094.82600382409))), .Names = c("fisyr", "er", "eg", "esal", "fr", "fg", "fsal", "mr", "mg", "msal"), class = c("data.table", "data.frame"), row.names = c(NA, -2L)) if (options()$width<80) options(width=80) ans1 = capture.output(print(DT, digits=4, row.names=FALSE)) ans2 = c(" fisyr er eg esal fr fg fsal mr mg msal", " 1995 1,3 0.01973 2330,2424 4,4 0.03931 2521,2521 5,30 0.01978 2572,4216", " 1996 1,3 0.01973 2263,2354 4,4 0.03931 2449,2449 5,30 0.01978 2498,4095") test(1293, ans1, ans2) ## Fixes bug #5442 ## Also improves upon bug fix #2551 to provide better warnings and at better places: dt <- data.table(a=1:3, b=c(7,8,9), c=c(TRUE, NA, FALSE), d=as.list(4:6), e=c("a", "b", "c")) test(1294.1, dt[, a := 1]$a, rep(1L, 3L)) test(1294.2, dt[, a := 1.5]$a, rep(1L, 3L), 
warning="Coerced 'double' RHS to 'integer' to match the column's type") test(1294.3, dt[, a := NA]$a, rep(NA_integer_, 3L)) test(1294.4, dt[, a := "a"]$a, rep(NA_integer_, 3L), warning="NAs introduced by coercion") test(1294.5, dt[, a := list(list(1))]$a, rep(1L, 3L), warning="Coerced 'list' RHS to 'integer' to match the column's type") test(1294.6, dt[, a := list(1L)]$a, rep(1L, 3L)) test(1294.7, dt[, a := list(1)]$a, rep(1L, 3L)) test(1294.8, dt[, a := TRUE]$a, rep(1L, 3L), warning="Coerced 'logical' RHS to 'integer' to match the column's type") test(1294.9, dt[, b := 1L]$b, rep(1,3)) test(1294.10, dt[, b := NA]$b, rep(NA_real_,3)) test(1294.11, dt[, b := "bla"]$b, rep(NA_real_, 3), warning="NAs introduced by coercion") test(1294.12, dt[, b := list(list(1))]$b, rep(1,3), warning="Coerced 'list' RHS to 'double' to match the column's type") test(1294.13, dt[, b := TRUE]$b, rep(1,3), warning="Coerced 'logical' RHS to 'double' to match the column's type") test(1294.14, dt[, b := list(1)]$b, rep(1,3)) test(1294.15, dt[, c := 1]$c, rep(TRUE, 3), warning="Coerced 'double' RHS to 'logical' to match the column's type") test(1294.16, dt[, c := 1L]$c, rep(TRUE, 3), warning="Coerced 'integer' RHS to 'logical' to match the column's type") test(1294.17, dt[, c := NA]$c, rep(NA, 3)) test(1294.18, dt[, c := list(1)]$c, rep(TRUE, 3), warning="Coerced 'double' RHS to 'logical' to match the column's type") test(1294.19, dt[, c := list(list(1))]$c, rep(TRUE, 3), warning="Coerced 'list' RHS to 'logical' to match the column's type") test(1294.20, dt[, c := "bla"]$c, rep(NA, 3), warning="Coerced 'character' RHS to 'logical' to match the column's type") test(1294.21, dt[, d := 1]$d, rep(list(1), 3), warning="Coerced 'double' RHS to 'list' to match the column's type") test(1294.22, dt[, d := 1L]$d, rep(list(1L), 3), warning="Coerced 'integer' RHS to 'list' to match the column's type") test(1294.23, dt[, d := TRUE]$d, rep(list(TRUE), 3), warning="Coerced 'logical' RHS to 'list' to match 
the column's type") test(1294.24, dt[, d := "bla"]$d, rep(list("bla"), 3), warning="Coerced 'character' RHS to 'list' to match the column's type") test(1294.25, dt[, d := list(list(1))]$d, rep(list(1), 3)) test(1294.26, dt[, e := 1]$e, rep("1", 3), warning="Coerced 'double' RHS to 'character' to match the column's type") test(1294.27, dt[, e := 1L]$e, rep("1", 3), warning="Coerced 'integer' RHS to 'character' to match the column's type") test(1294.28, dt[, e := TRUE]$e, rep("TRUE", 3), warning="Coerced 'logical' RHS to 'character' to match the column's type") test(1294.29, dt[, e := list(list(1))]$e, rep("1", 3), warning="Coerced 'list' RHS to 'character' to match the column's type") test(1294.30, dt[, e := "bla"]$e, rep("bla", 3)) test(1294.31, dt[, e := list("bla2")]$e, rep("bla2", 3)) # FR #5357, when LHS evaluates to integer(0), provide warning and return dt, not an error. dt = data.table(a = 1:5, b1 = 1:5, b2 = 1:5) test(1295, dt[, grep("c", names(d)) := NULL], dt, warning="length(LHS)==0; no columns to delete or assign RHS to") # Updating logical column in one-row DT (corruption of new R 3.1 internal globals for TRUE, FALSE and NA) DT = data.table(a=1:6, b=c(TRUE,FALSE)) test(1296, DT[,list(b,sum(b)),by=a], data.table(a=1:6, b=c(TRUE,FALSE), V2=c(1L,0L))) # was error "the ... 
list does not contain 2 elements" DT = DT[1L] set(DT,1L,"b",FALSE) # passing 1L as i here is needed to avoid column plonk, so changes the logical singleton in place test(1297, as.integer(TRUE[1]), 1L) # In R 3.1, TRUE[1] returns the global TRUE but TRUE doesn't yet (parses as new vector) test(1298, as.integer(TRUE), 1L) # orignal example, verbatim from James Sams : upc_table = data.table(upc=1:100000, upc_ver_uc=rep(c(1,2), times=50000), is_PL=rep(c(T, F, F, T), each=25000), product_module_code=rep(1:4, times=25000), ignore.column=2:100001) test(1299, upc_table[, .N, by=list(upc, upc_ver_uc)][,max(N)], 1L) # all size 1 groups test(1300, upc_table[, list(is_PL, product_module_code), keyby=list(upc, upc_ver_uc)][,upc[1:3]], 1:3L) # was warning "internal TRUE value has been modified" # Same test but for singleton small integers which r-devel also plan to globalise internally. DT = data.table(a=1:6, b=0:1) test(1301, DT[,list(b,sum(b)),by=a], data.table(a=1:6, b=c(0L,1L), V2=c(0L,1L))) DT = DT[1L] set(DT,1L,"b",3L) test(1302, 0L[1L], 3L-3L) test(1303, 0L, 3L-3L) # FR #5760. Test to just make sure that GForce and dogroups with .N are giving the same results. set.seed(2L) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) options(datatable.optimize = 1L) ans1 <- dt[, list(.N, sum(y)), by=x] options(datatable.optimize = 2L) ans2 <- dt[, list(.N, sum(y)), by=x] test(1304.1, ans1, ans2) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") options(datatable.optimize = 1L) ans1 <- dt[, list(.N, sum(y)), by=x] options(datatable.optimize = 2L) ans2 <- dt[, list(.N, sum(y)), by=x] test(1304.2, ans1, ans2) # FR #5528 DT <- data.table(x=1:5, y=6:10) test(1305.1, setDF(DT), data.frame(x=1:5, y=6:10)) # setDF should return if input is data.frame, not error. 
df <- data.frame(x=1:5, y=6:10) test(1305.2, setDF(df), df) # setDF works on data.frame # setDF also works on lists with equal lengths, #1132 df <- list(a=1:5, b=6:10) test(1305.3, data.frame(df), setDF(df)) df <- list(1:5, 6:10) test(1305.4, setDF(as.data.table(df)), setDF(df)) test(1305.5, setDF(1:5), error="setDF only accepts") test(1305.6, setDF(list(1, 2:3)), error="All elements in argument") # Tests .7 - .13 for FR #1320: setDF accepts rownames argument dt <- data.table(a=1:5, b=6:10) df <- data.frame(a=1:5, b=6:10) lst <- list(a=1:5, b=6:10) df2 <- data.frame(a=1:5, b=6:10) rownames(df2) <- LETTERS[1:5] test(1305.7, setDF(dt, rownames=LETTERS[1:5]), df2) test(1305.8, setDF(df, rownames=LETTERS[1:5]), df2) test(1305.9, setDF(lst,rownames=LETTERS[1:5]), df2) # setDF returns an error for each type if rownames incorrect length dt <- data.table(a=1:5, b=6:10) df <- data.frame(a=1:5, b=6:10) lst <- list(a=1:5, b=6:10) test(1305.10, setDF(dt, rownames="a"), error='rownames incorrect length') test(1305.11, setDF(df, rownames="a"), error='rownames incorrect length') test(1305.12, setDF(lst,rownames="a"), error='rownames incorrect length') # setDF returns an error when rownames contains duplicates test(1305.13, setDF(dt, rownames=rep("a",5)), error='rownames contains duplicates') # .SD retains as much of head(key) as appropriate. # by= always keeps data appearance order, so it's which columns are grouped and selected that drive how much of key is retained DT = data.table(a=1:3,b=1:6,c=1:6,key="a,b") test(1306, DT[1:2,key(.SD)], c("a","b")) test(1307, DT[2:1,key(.SD)], NULL) test(1308, DT[,key(.SD),by=a], data.table(a=integer())) test(1309, DT[,key(.SD),by=b], data.table(b=DT$b, V1="a")) test(1310, DT[,key(.SD),by=c%%2L], data.table(c=c(1L,1L,0L,0L), V1=c("a","b","a","b"))) test(1311, DT[,list(list(key(.SD))),by=a,.SDcols=1:2], data.table(a=1:3, V1=list(c("a","b")),key="a")) # .SDcols as Arun found # That setkey can't operate on locked tables such as .SD. 
Added in v1.9.3. DT = data.table(a=1:3,b=6:1) test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use") # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread). # gmin and gmax extensive testing (because there are tricky cases) DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) # make sure GForce is running options(datatable.optimize=2L) # for integers test(1313.1, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.2, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.3, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x]) test(1313.4, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. 
optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := INT(NA)] test(1313.5, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.6, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.7, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing") test(1313.8, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing") # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) test(1313.9, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.10, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x]) test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := NA_real_] test(1313.13, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.14, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing") test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing") # for date (attribute check.. especially after issues/689 !!!) 
DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)]) test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)]) DT[c(1,6), y := NA] test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)]) test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)]) test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE)) test(1313.23, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.24, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base:::min(y, na.rm=TRUE), by=x]) test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base:::max(y, na.rm=TRUE), by=x]) DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base:::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base:::max(y), by=x]) test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing") test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing") # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y") dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y") test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", 
"2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key="x,y")) # also test where 'i' is not sorted. set.seed(1L) dt2 <- dt2[sample(nrow(dt2))] # key should be gone test(1317.2, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[1], dt2$w[c(2,6)]))) # bug fix for #472 : "parse" in j set.seed(100) nrow <- 100L DT <- data.table(aa = sample(letters[1:5], nrow, replace = TRUE), bb = rnorm(nrow)) sumExpr <- parse(text = "sum(bb, na.rm = TRUE)") meanExpr <- parse(text = "mean(bb, na.rm = TRUE)") test(1318.1, DT[, eval(sumExpr), by = aa], DT[, sum(bb, na.rm=TRUE), by=aa]) test(1318.2, DT[, eval(meanExpr), by = aa], DT[, mean(bb, na.rm=TRUE), by=aa]) test(1318.3, DT[, list(mySum = eval(sumExpr), myMean = eval(meanExpr)), by = aa], DT[, list(mySum=sum(bb, na.rm=TRUE), myMean=mean(bb, na.rm=TRUE)), by=aa]) # get DT[order(.)] to be 100% consistent with base, even though the way base does some things is *utterly ridiculous*, inconsistent. # closes #696. DT <- data.table(a = 1:4, b = 8:5, c=letters[4:1]) test(1319.1, DT[order(DT[, "b", with=FALSE])], DT[base:::order(DT[, "b", with=FALSE])]) test(1319.2, DT[order(DT[, "c", with=FALSE])], DT[base:::order(DT[, "c", with=FALSE])]) test(1319.3, DT[order(DT[, c("b","c"), with=FALSE])], DT[base:::order(DT[, c("b","c"), with=FALSE])]) test(1319.4, DT[order(DT[, c("c","b"), with=FALSE])], DT[base:::order(DT[, c("c","b"), with=FALSE])]) test(1319.5, DT[order(DT[, "b", with=FALSE], DT[, "a", with=FALSE])], DT[base:::order(DT[, "b", with=FALSE], DT[, "a", with=FALSE])]) # test to make sure old things are not modified (ridiculous, but "consistency" demands it!) test(1319.6, DT[order(list(DT$a))], DT[1]) test(1319.7, DT[order(list(DT$a), list(DT$b))], DT[1]) test(1319.8, DT[order(list(DT$a, DT$b))], error="Column '1' is type 'list' which is not") # FR #703. 
Not so extensive testing because test 1223 already tests for everything else extensively. Only integer64 here. # this'll be the test for both DT[order(.)] and setorder(.) as both internally uses forder/forderv if ("package:bit64" %in% search()) { set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, 50, 1e18, 1e-18)), y=sample(5)) ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE) ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE) ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE) ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE) test(1320.1, ans1, as.integer(c(1,2,5,3,4))) test(1320.2, ans2, as.integer(c(1,2,5,3,4))) test(1320.3, ans3, as.integer(c(4,3,2,5,1))) test(1320.4, ans4, as.integer(c(4,3,2,5,1))) set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7)) ans1 <- forder(DT, x, na.last=TRUE, decreasing=FALSE) ans2 <- forder(DT, x, na.last=FALSE, decreasing=FALSE) ans3 <- forder(DT, x, na.last=TRUE, decreasing=TRUE) ans4 <- forder(DT, x, na.last=FALSE, decreasing=TRUE) test(1320.5, ans1, as.integer(c(1,2,7,4,5,3,6))) test(1320.6, ans2, as.integer(c(3,6,1,2,7,4,5))) test(1320.7, ans3, as.integer(c(5,4,2,7,1,3,6))) test(1320.8, ans4, as.integer(c(3,6,5,4,2,7,1))) # missed test - checking na.last=NA! set.seed(45L) DT <- data.table(x=as.integer64(c(-50, 0, NA, 50, 1e18, NA, 1e-18)), y=sample(7)) ans1 <- forder(DT, x, na.last=NA, decreasing=FALSE) ans2 <- forder(DT, x, na.last=NA, decreasing=TRUE) test(1320.9, ans1, as.integer(c(0,0,1,2,7,4,5))) test(1320.10, ans2, as.integer(c(0,0,5,4,2,7,1))) } # fread newlines inside quoted fields test(1321, fread('A,B,C\n1,"foo\nbar",3\n4,baz,6'), data.table(A=c(1L,4L), B=c("foo\nbar","baz"), C=c(3L,6L))) test(1322, fread('A,B,C\n1,"foo bar",3\n4,baz,6'), data.table(A=c(1L,4L), B=c("foo\nbar","baz"), C=c(3L,6L))) # NB: don't remove the newline after foo in test 1322 above, that's what's being tested. 
# fread: quoted fields containing newlines, unbalanced quotes and doubled quotes
test(1323, fread('col1,col2\n5,"4\n3"'), data.table(col1=5L, col2="4\n3"))
test(1324, fread('A,B,C\n1,4,"foo"\n2,5,"bar'), data.table(A=1:2,B=4:5,C=c("foo", "\"bar")))
test(1325, fread('A,B,C\n1,4,"foo"\n2,5,"bar"'), data.table(A=1:2,B=4:5,C=c("foo",'bar')))
test(1326, fread('A,B,C\n1,4,"foo"\n2,5,bar"'), data.table(A=1:2,B=4:5,C=c("foo",'bar"')))
test(1327, fread('A,B,C\n1,4,"foo"\n2,5,""bar""'), data.table(A=1:2,B=4:5,C=c("foo",'"bar"')))
cat('A,B\n1,"Joe \\",Bloggs"', file = f<-tempfile())
test(1328, fread(f), data.table(V1=1L, V2='Joe \\', V3='Bloggs"'), warning="Starting data input on line 2 and discarding line 1 because.*: A,B")
unlink(f)
test(1329, fread(), error="Input is either empty or fully whitespace after the skip or autostart")

# add test for escaped escapes at the end of a quoted field
test(1330, fread('A,B\nfoo,1\nAnalyst\\,2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3))
test(1331.1, fread('A,B\nfoo,1\nAnalyst\\ ,2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3)) # strip.white=TRUE
test(1331.2, fread('A,B\nfoo,1\nAnalyst\\ ,2\nbar,3', strip.white=FALSE), data.table(A=c('foo','Analyst\\ ','bar'), B=1:3))
test(1332, fread('A,B\nfoo,1\n"Analyst\\",2\nbar,3'), data.table(A=c('foo','Analyst\\','bar'), B=1:3)) # double \\ in this file means one in the input, so the above " is escaped by a single '\' but still read ok
test(1333.1, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3'), data.table(A = c("foo", "\"Analyst\\\"", "bar"), B = 1:3))
test(1333.2, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3', strip.white=FALSE), data.table(A = c("foo", "\"Analyst\\\" ", "bar"), B = 1:3))
test(1334, fread('A,B\nfoo,1\n"Analyst\\" ,",2\nbar,3'), data.table(A=c('foo', 'Analyst\\" ,', 'bar'), B=1:3))
test(1335, fread('A,B\nfoo,1\n"Analyst\\\\",2\nbar,3'), data.table(A=c('foo','Analyst\\\\','bar'), B=1:3))

# data from 12GB file in comments on http://stackoverflow.com/a/23858323/403310 ...
# note that read.csv gets this wrong and puts jacoleman high school into the previous field, then fills the rest of the line silently.
# The CSV fixtures below are multi-line string literals: one header row plus three data rows, each ending in a newline.
cat('A,B,C,D,E,F
"12",0,"teacher private nfp\\\\\\\\"",""jacoleman high school","",""
"TX",77406,"business analyst\\\\\\\\\\\\\\","the boeing co","",""
"CA",94116,"na\\none","retired","",""
', file = f<-tempfile()) # aside: notice the \\ before n of none as well
test(1336.1, fread(f), data.table(A = c("12", "TX", "CA"), B = c(0L, 77406L, 94116L), C = c("teacher private nfp\\\\\\\\\"", "business analyst\\\\\\\\\\\\\\", "na\\none"), D = c("\"\"jacoleman high school\"", "the boeing co", "retired"), E = c("", "", ""), F = c("", "", "")))
cat('A,B,C,D,E,F
"12",0,"teacher private nfp\\\\\\\\"","jacoleman high school","",""
"TX",77406,"business analyst\\\\\\\\\\\\\\","the boeing co","",""
"CA",94116,"na\\none","retired","",""
', file = f)
test(1336.2, fread(f), data.table(A=c("12","TX","CA"), B=c(0L,77406L,94116L),C=c('teacher private nfp\\\\\\\\"','business analyst\\\\\\\\\\\\\\','na\\none'), D=c('jacoleman high school','the boeing co','retired'),E="",F=""))
unlink(f)

# file names ending with \ (quite common)
# http://stackoverflow.com/questions/24375832/fread-and-column-with-a-trailing-backslash
cat('file,size\n"windows\\user\\",123\n', file = f<-tempfile())
test(1337, fread(f), data.table(file='windows\\user\\',size=123L))
test(1338, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
unlink(f)

# TO DO, by checking for balanced embedded quotes
# cat('http,size\n"www.blah?x="one",y="two","three"",123\n', file = f<-tempfile())
# read.csv(f) -- unusually, seems to be a case it doesn't handle
# test(1339, fread(f), data.table(http='www.blah?x="one",y="two","three"',size=123L))
# unlink(f)

# FR #706 - setorder and setorderv now has 'na.last=TRUE/FALSE' argument. It can't have value NA though, like `DT[order(.)]` as it reorders by reference, doesn't subset. Simple tests.
# setorder()/setorderv() with na.last=TRUE/FALSE must agree with DT[order(., na.last=)],
# for an integer column (x) and a double column containing NA, NaN, Inf and -Inf (y).
set.seed(45L)
DT <- data.table(x=sample(c(-2:2, NA_integer_), 20, TRUE), y=sample(c(-1:1, NA, Inf, -Inf, NaN), 20, TRUE))
test(1340.1, setorder(copy(DT), x, na.last=TRUE ), DT[order( x, na.last=TRUE)])
test(1340.2, setorder(copy(DT), x, na.last=FALSE), DT[order( x, na.last=FALSE)])
test(1340.3, setorder(copy(DT), -x, na.last=TRUE ), DT[order(-x, na.last=TRUE)])
test(1340.4, setorder(copy(DT), -x, na.last=FALSE), DT[order(-x, na.last=FALSE)])
test(1340.5, setorder(copy(DT), y, na.last=TRUE ), DT[order( y, na.last=TRUE)])
test(1340.6, setorder(copy(DT), y, na.last=FALSE), DT[order( y, na.last=FALSE)])
test(1340.7, setorder(copy(DT), -y, na.last=TRUE ), DT[order(-y, na.last=TRUE)])
test(1340.8, setorder(copy(DT), -y, na.last=FALSE), DT[order(-y, na.last=FALSE)])
test(1340.9, setorderv(copy(DT), "x", 1L, na.last=TRUE ), DT[order( x, na.last=TRUE)])
test(1340.10, setorderv(copy(DT), "x", 1L, na.last=FALSE), DT[order( x, na.last=FALSE)])
test(1340.11, setorderv(copy(DT), "x", -1L, na.last=TRUE ), DT[order(-x, na.last=TRUE)])
test(1340.12, setorderv(copy(DT), "x", -1L, na.last=FALSE), DT[order(-x, na.last=FALSE)])
test(1340.13, setorderv(copy(DT), "y", 1L, na.last=TRUE ), DT[order( y, na.last=TRUE)])
test(1340.14, setorderv(copy(DT), "y", 1L, na.last=FALSE), DT[order( y, na.last=FALSE)])
test(1340.15, setorderv(copy(DT), "y", -1L, na.last=TRUE ), DT[order(-y, na.last=TRUE)])
test(1340.16, setorderv(copy(DT), "y", -1L, na.last=FALSE), DT[order(-y, na.last=FALSE)])
# na.last=NA is rejected by the by-reference versions
test(1340.17, setorder(copy(DT), x, na.last=NA), error="na.last must be logical TRUE/FALSE")
test(1340.18, setorderv(copy(DT), "x", na.last=NA), error="na.last must be logical TRUE/FALSE")

# bug #481 - DT[, list(list(.)), by=.]
# on R v3.1.0
set.seed(1L)
f <- function(x) list(x)
DT <- data.table(x=sample(3,10,TRUE), y=as.numeric(sample(10)))
test(1341.1, DT[, list(list(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))
test(1341.2, DT[, list(list(.I)), by=x], data.table(x=unique(DT$x), V1=list(c(1,5,10), c(2,3,8,9), c(4,6,7))))
test(1341.3, DT[, list(f(y)), by=x], data.table(x=unique(DT$x), V1=list(c(3,5,9), c(2,6,4,1), c(10,7,8))))
# test for list(list(.)) with :=
test(1341.4, copy(DT)[, z := list(list(y)), by=x], copy(DT)[, z := list(list(copy(y))), by=x])
test(1341.5, copy(DT)[, z := list(list(.I)), by=x], copy(DT)[, z := list(list(copy(.I))), by=x])
test(1341.6, copy(DT)[, z := list(f(y)), by=x], copy(DT)[, z := list(f(copy(y))), by=x])

# test regression on over-allocation (selfref) on unique() which uses new subsetDT()
bla <- data.table(x=c(1,1,2,2), y=c(1,1,1,1))
test(1342, unique(bla)[, bla := 2L], data.table(x=c(1,2),y=1,bla=2L))

# blank and NA fields in logical columns
test(1343, fread("A,B\n1,TRUE\n2,\n3,F"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1344, fread("A,B\n1,T\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))

# .N now available in i
DT = data.table(a=1:3,b=1:6)
test(1348, DT[.N], DT[6])
test(1349, DT[.N-1:3], DT[5:3])
test(1350, DT[.N+1], DT[NA])

# Adding test to catch any future regressions - #734
dt = data.table(id = rep(c('a','b'), each=2), val = rep(c(1,2,3), times=c(1,2,1)))
setkey(dt, id, val)
test(1351.1, dt[J("a"), val], c(1,2))
test(1351.2, dt[J('a'), range(val)], c(1,2))

# New feature: .() in j and .() in by
DT = data.table(a=1:3, b=1:6, c=LETTERS[1:6])
test(1352.1, DT[,.(b)], DT[,list(b)])
test(1352.2, DT[,.(b,c)], DT[,c("b","c"),with=FALSE])
test(1352.3, DT[,.(sum(b)),by=a], DT[,sum(b),by=a])
test(1352.4, DT[,.(MySum=sum(b)), by=a], data.table(a=1:3, MySum=c(5L,7L,9L)))
test(1352.5, DT[,sum(b),by=.(a)], DT[,sum(b),by=a])
test(1352.6, DT[,sum(b),by=.(a%%2)], DT[,sum(b),by=a%%2])
test(1352.7, DT[,sum(b),by=.(Grp=a%%2)], DT[,sum(b),by=list(Grp=a%%2)])
test(1352.8, DT[,sum(b),by=.(a%%2,c)], DT[,sum(b),by=list(a%%2,c)])

# that :=NULL together with i is now an error
DT = data.table(a=1:3, b=1:6)
test(1353.1, DT[2, b:=NULL], error="When deleting columns, i should not be provided")
test(1353.2, DT[2, c("a","b"):=list(42, NULL)], error="When deleting columns, i should not be provided")

# order optimisation caused trouble due to chaining because of 'substitute(x)' usage in [.data.table.
set.seed(1L)
X = data.table(id=1:10, val1=sample(3,10,TRUE))
Y = data.table(val1=1:4, val2=8:5, key="val1")
setkey(X, val1)
test(1354, X[Y, val2 := i.val2, allow.cartesian=TRUE][, val1 := NULL][order(id)], data.table(id=1:10, val2=as.integer(c(8,7,7,6,8,6,6,7,7,8))))

# Fix for #475, setDT(CO2) should error, as it's trying to modify the object whose binding is locked.
# CO2 is not locked in R 2.14.1 but is in R >= 3.1.0. R NEWS isn't clear when that change happened, so just test there is an error when it is locked.
if (bindingIsLocked("CO2",as.environment("package:datasets"))) {
    test(1355, setDT(CO2), error="Can not convert 'CO2' to data.table by reference because binding is locked.")
} else {
    test(1355, setDT(CO2), CO2)
}

# Fix for #698. not join doesn't need to check for allow.cartesian=TRUE.
DT1 <- data.table(x=rep(1:3, each=3L), y=1:9, key="x")
DT2 <- data.table(x=rep(c(3L,1L), each=10), z=1L)
test(1356, DT1[!DT2], data.table(x=2L, y=4:6, key="x"))

# Fix for #745. as.data.table.matrix shouldn't convert character to factor
m <- matrix(letters[1:4], ncol=2)
test(1357, as.data.table(m), data.table(V1=letters[1:2], V2=letters[3:4]))

# Fix for #471. A[A[A]] contains duplicate names in 1.9.3
A <- data.table(foo = 1:2, bar = 3:4)
setkey(A, foo)
test(1358.1, names(A[A[A]]), c("foo", "bar", "i.bar", "i.bar.1"))
test(1358.2, names(A[A[A[A]]]), c("foo", "bar", "i.bar", "i.bar.2", "i.bar.1"))

# Fix for #743.
# 0 and -0 and the sign bit issue
A <- data.table(x=c(0,0,-1,1,-1,0,-0,1,-1,1,0,1), y=1:12)
test(1359.1, A[, .N, by=x], data.table(x=c(0,-1,1), N=c(5L,3L,4L)))
dt1 <- data.table(x2 = 0L)
dt2 <- data.table(x2 =-(11-11)/10)
test(1359.2, as.integer(merge(dt2, dt1, by="x2")$x2), as.integer(merge(dt1, dt2, by="x2")$x2))

# Fix for #744: X[Y, c(...), by=.EACHI] segfaults because of using 'i' as variable in for-loop that masked the original 'i' from input.
dt <- data.table(id = c("A", "A", "B", "B", "C"), val1=1:5, val2=6:10, key = "id")
sample <- c("A", "B")
test(1360.1, dt[sample, c(.N), by = .EACHI], dt[sample, list(V1=.N), by=.EACHI])
test(1360.2, copy(dt)[sample, N := c(.N), by = .EACHI], copy(dt)[sample, N := .N, by = .EACHI])

# Fix for #500 - `lapply` call shouldn't redirect to `[.data.frame`.
L <- list(data.table(BOD), data.table(BOD))
test(1361, lapply(L, "[", Time==3L), list(L[[1L]][Time == 3L], L[[2L]][Time == 3L]))

# Feature #735, first two cases: 1) .SD, and 2) DT[, c(.SD, lapply(.SD, ...)), by=...] optimisation:
# Don't set options(datatable.verbose=TRUE) here because the "running test 1362.1 ..."
# messages cause output to scroll away errors on CRAN checks last 13 lines
# Each test passes verbose=TRUE on the call itself and matches the reported j rewrite.
DT <- data.table(x=c(1,1,1,2,2), y=1:5, z=6:10)
test(1362.1, DT[, .SD, by=x, verbose=TRUE], output="lapply optimization changed j from '.SD' to 'list(y, z)'")
test(1362.2, DT[, c(.SD), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD)' to 'list(y, z)'")
test(1362.3, DT[, c(.SD, lapply(.SD, sum)), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD, lapply(.SD, sum))' to 'list(y, z, sum(y), sum(z))'")
test(1362.4, DT[, c(lapply(.SD, sum), .SD), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(lapply(.SD, sum), .SD)' to 'list(sum(y), sum(z), y, z)'")
test(1362.5, DT[, c(list(y), .SD, lapply(.SD, sum)), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(list(y), .SD, lapply(.SD, sum))' to 'list(y, y, z, sum(y), sum(z))'")
# 3) .SD[1] and 4) .SD[1L]
test(1362.6, DT[, c(.SD[1L]), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD[1L])' to 'list(y[1L], z[1L])'")
test(1362.7, DT[, c(.SD[1L], lapply(.SD, sum)), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD[1L], lapply(.SD, sum))' to 'list(y[1L], z[1L], sum(y), sum(z))'")
test(1362.8, DT[, c(.SD[.N]), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD[.N])' to 'list(y[.N], z[.N])'")
test(1362.9, DT[, .SD[1], by=x, verbose=TRUE], output="lapply optimization changed j from '.SD[1]' to 'list(y[1], z[1])'")
test(1362.11, DT[, c(.SD[1]), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD[1])' to 'list(y[1], z[1])'")
test(1362.12, DT[, c(.SD[1], lapply(.SD, sum)), by=x, verbose=TRUE], output="lapply optimization changed j from 'c(.SD[1], lapply(.SD, sum))' to 'list(y[1], z[1], sum(y), sum(z))'")
test(1362.13, DT[, head(.SD, 1), by=x, verbose=TRUE], output="lapply optimization changed j from 'head(.SD, 1)' to 'list(head(y, 1), head(z, 1))'")

# make sure .I is named as I when no name is given
test(1362.14, names(DT[, c(list(.I, mean(y)), lapply(.SD, sum)), by=x]), c("x", "I", "V2", "y", "z"))
# and if a name is given, it's retained
test(1362.15, names(DT[, c(list(bla=.I, mean(y)), lapply(.SD, sum)), by=x]), c("x", "bla", "V2", "y", "z"))
# Add test to ensure that mean() gets replaced with fastmean when GForce won't be used.
test(1362.16, DT[, c(list(.I, mean(y)), lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization changed j from 'list(.I, mean(y), mean(y), mean(z))' to 'list(.I, .External(Cfastmean, y, FALSE), .External(Cfastmean, y, FALSE), .External(Cfastmean, z, FALSE))'")

# setDT(DT), when input is already a data.table checks if selfrefok and if not, does alloc.col again.
DT = list(data.frame(x=1:5, y=6:10))
invisible(lapply(DT, setDT))
DT = DT[[1L]]
test(1363.1, selfrefok(DT), 1L)
foo <- function(x) setDT(x)
df = data.frame(x=1, y=2)
foo(df)
test(1363.2, selfrefok(df), 0L)
setDT(df)
test(1363.3, selfrefok(df), 1L)

# setdiff, partly #547. internal as of now, and named setdiff_ because the name "set" can be confused with the set* functions.
# maybe provide a %diff% operator that internally calls setdiff_?? Usage x %diff% y?
X = data.table(a=c(1,1,1,1,3,3,2,2,2))[, `:=`(b=factor(a), c=as.character(a), d = as.integer(a), e=1:9)]
Y = data.table(a=c(3,4), b=factor(3:4), c=c("3","4"), d=3:4, e=c(TRUE, FALSE), f=c(5L,7L))
test(1364.1, setdiff_(X, Y, "a", "a"), data.table(a=c(1,2)))
test(1364.2, setdiff_(X, Y, c("a", "e"), c("a", "f")), X[!5, list(a,e)])
test(1364.3, setdiff_(X, Y, "a", "e"), error="When x's column ('a') is integer or numeric, the corresponding column in y ('e')")
test(1364.4, setdiff_(X, Y, "b", "b"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
test(1364.5, setdiff_(X, Y, c("b", "e"), c("b", "f")), X[!5, list(b,e)])
test(1364.6, setdiff_(X, Y, "b", "c"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
test(1364.7, setdiff_(X, Y, "c", "c"), data.table(c=as.character(c(1,2))))
test(1364.8, setdiff_(X, Y, c("c", "e"), c("c", "f")), X[!5, list(c,e)])
test(1364.9, setdiff_(X, Y, "c", "b"), data.table(c=c("1", "2")))
test(1364.11, setdiff_(X, Y, "d", "d"), data.table(d=1:2))
test(1364.12, setdiff_(X, Y, c("d", "e"), c("d", "f")), X[!5, list(d,e)])
test(1364.13, setdiff_(X, Y, "d", "e"), error="When x's column ('d') is integer or numeric, the corresponding column in y ('e')")
test(1364.14, setdiff_(X, Y, "b", "a"), error="When x's column ('b') is factor, the corresponding column in y ('a')")
test(1364.15, setdiff_(X, Y, "c", "a"), error="When x's column ('c') is character, the corresponding column in y ('a') ")
test(1364.16, setdiff_(X, Y), error="length(by.x) != length(by.y)")
test(1364.17, setdiff_(X[, list(a)], Y[, list(a)]), data.table(a=c(1,2)))

# not join along with by=.EACHI, #604
DT <- data.table(A=c(1,1,1,2,2,2,2,3,3,4,5,5))[, `:=`(B=as.integer(A), C=c("c", "e", "a", "d"), D=factor(c("c", "e", "a", "d")), E=1:12)]
setkey(DT, A)
test(1365.1, suppressMessages(DT[!J(c(2,5)), sum(E), by=.EACHI]), suppressMessages(DT[J(c(1,3,4)), sum(E), by=.EACHI]))
setkey(DT, B)
test(1365.2, suppressMessages(DT[!J(c(4:5)), list(.N, sum(E)), by=.EACHI]), suppressMessages(DT[J(1:3), list(.N, sum(E)), by=.EACHI]))
setkey(DT, C)
test(1365.3, suppressMessages(copy(DT)[!"c", f := .N, by=.EACHI]), suppressMessages(copy(DT)[c("a", "d", "e"), f := .N, by=.EACHI]))
setkey(DT, D)
test(1365.4, suppressMessages(DT[!J(factor("c")), .N, by=.EACHI]), suppressMessages(DT[J(factor(c("a", "d", "e"))), .N, by=.EACHI]))
test(1365.5, suppressMessages(DT[!"c", lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")]), suppressMessages(DT[c("a", "d", "e"), lapply(.SD, sum), by=.EACHI, .SDcols=c("B", "E")]))

# uniqlengths doesn't error on 0-length input
test(1366, uniqlengths(integer(0), 0L), integer(0))

# na.last=NA gets 0's for NAs not at the beginning when there are values so close to NA_integer_ for integers and -Inf for example for numerics. Moved logic to the end in forder.c so that we replace NAs with 0's after the ordering have been taken care of completely.
x = c(-2147483000L, NA_integer_, 1L)
test(1367.1, forderv(x, na.last=NA), c(0L,1L,3L))
x = c(NA, Inf, 0, 1, -1, -Inf, NaN)
test(1367.2, forderv(x, na.last=NA), c(0L, 0L, 6L, 5L, 3L, 4L, 2L))

# Fix for integer overflow segfault in setRange
x = c(-2147483647L, NA_integer_, 2L)
test(1368.1, forderv(x), c(2L, 1L, 3L))
x = c(2147483647L, NA_integer_, -2L)
test(1368.2, forderv(x), c(2L, 3L, 1L))

# tests for frankv. testing on vectors alone so that we can compare with base::rank
# One difference is that NAs belong to the same group, unlike base::rank. So are NaNs.
# So, they can't be compared to base::rank, won't be identical except for ties="first", and (ties="random", na.last=NA) - should document this.
# no seed set on purpose
dt = data.table(AA=sample(c(-2:2), 50, TRUE), BB=sample(c(-2,-1,0,1,2,Inf,-Inf), 50, TRUE), CC=sample(c(letters[1:5]), 50, TRUE), DD=sample(c(-2:2), 50, TRUE), EE=sample(as.logical(c(-2:2)), 50, TRUE))
if ("package:bit64" %in% search()) dt[, DD := as.integer64(DD)]
test_no = 1369.0
# For every column, na.last setting (j) and ties method (k), compare frankv() with base rank(),
# ascending (r1 vs r3) and descending (r2 vs r4).
for (i in seq_along(dt)) {
    col = dt[[i]]
    for (j in list(TRUE, FALSE, "keep")) {
        for (k in c("average", "min", "max", "first")) {
            # NOTE(review): "random" is not in the list of methods looped over, so the two
            # set.seed() guards below never fire as written.
            if (k == "random") set.seed(45L)
            if (class(col) == "integer64") {
                # rank the integer values rather than the integer64 column itself
                r1 = rank(as.integer(col), ties.method=k, na.last=j)
                r2 = rank(-xtfrm(as.integer(col)), ties.method=k, na.last=j)
            } else {
                r1 = rank(col, ties.method=k, na.last=j)
                r2 = rank(-xtfrm(col), ties.method=k, na.last=j)
            }
            if (k == "random") set.seed(45L)
            r3 = frankv(col, ties.method=k, na.last=j)
            r4 = frankv(col, order=-1L, ties.method=k, na.last=j)
            test_no = signif(test_no+.01, 7)
            test(test_no, r1, r3)
            test_no = signif(test_no+.01, 7)
            test(test_no, r2, r4)
        }
    }
}

# test na.last=NA here separately.
dt = data.table(AA=sample(c(-2:2, NA), 50, TRUE), BB=sample(c(-2,-1,0,1,2,Inf,-Inf, NA, NaN), 50, TRUE), CC=sample(c(letters[1:5], NA), 50, TRUE), DD=sample(c(-2:2, NA), 50, TRUE), EE=sample(as.logical(c(-2:2, NA)), 50, TRUE))
if ("package:bit64" %in% search()) dt[, DD := as.integer64(DD)]
for (i in seq_along(dt)) {
    col = dt[[i]]
    for (k in c("average", "min", "max", "first")) {
        if (k == "random") set.seed(45L)
        if (class(col) == "integer64") {
            r1 = rank(as.integer(col), ties.method=k, na.last=NA)
            r2 = rank(-xtfrm(as.integer(col)), ties.method=k, na.last=NA)
        } else {
            r1 = rank(col, ties.method=k, na.last=NA)
            r2 = rank(-xtfrm(col), ties.method=k, na.last=NA)
        }
        if (k == "random") set.seed(45L)
        r3 = frankv(col, ties.method=k, na.last=NA)
        r4 = frankv(col, order=-1L, ties.method=k, na.last=NA)
        test_no = signif(test_no+.01, 7)
        test(test_no, r1, r3)
        test_no = signif(test_no+.01, 7)
        test(test_no, r2, r4)
    }
}

# tests for is_na, which is equivalent of rowSums(is.na(dt)) > 0L
# not exported yet,
# but we could!
## UPDATE: also added tests for "any_na", internal version of anyNA
## which also includes implementation for bit64::integer64, but the
## real need is for merging factors correctly in joins, and we need
## a fast check for NAs; can't rely on 3.1+ for anyNA.
dt = list(AA=sample(c(NA,-2:2), 50, TRUE), BB=sample(c(NA,-2,-1,0,NaN,1,2,Inf,-Inf), 50, TRUE), CC=sample(c(NA,letters[1:5]), 50, TRUE), DD=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE))
if ("package:bit64" %in% search()) dt[["DD"]] = as.integer64(dt[["DD"]])
test_no = 1370.0
ans = as.list(na.omit(as.data.table(dt)))
# Run the checks for every combination of 1..length(dt) columns.
for (i in seq_along(dt)) {
    combn(names(dt), i, function(cols) {
        # `<<-` so the counter defined above advances across callback invocations; a plain `=`
        # here only creates a local copy, which makes every combination reuse the same test ids.
        test_no <<- signif(test_no+.01, 7)
        ans1 = is_na(dt[cols])
        ans2 = rowSums(is.na(as.data.table(dt[cols]))) > 0L
        test(test_no, ans1, ans2)
        # update: tests for any_na
        test_no <<- signif(test_no+.01, 7)
        test(test_no, any_na(dt[cols]), TRUE)
        test_no <<- signif(test_no+.01, 7)
        test(test_no, any_na(ans[cols]), FALSE)
        TRUE
    })
}
## The function is_na now gains a "by" argument where we can specify the columns. Tests have not been added for that yet.
## However, I've added tests for 'na.omit.data.table' that uses this internally. So we don't have to add tests here again.
## See tests 1394.*

# extensive testing of overlap joins:
# first test all argument check errors...
x = data.table(chr=c("Chr1", "Chr1", "Chr2", "Chr2", "Chr2"), start=c(5,10, 1, 25, 50), end=c(11,20,4,52,60))
y = data.table(chr=c("Chr1", "Chr1", "Chr2"), start=c(1, 15,1), end=c(4, 18, 55), val=1:3)
# no by.x and by.y error
test(1372.1, foverlaps(x, y, type="any"), error="'y' must be keyed (i.e., sorted, and, marked as sorted).")
setkey(y, chr, end, start)
test(1372.2, foverlaps(x, y, by.y=1:3, type="any"), error="The first 3 columns of y's key is not identical to the columns specified in by.y.")
setkey(y, chr, start, end)
setnames(y, c("chr", "pos1", "pos2", "val"))
# The remaining overlap tests compare against GenomicRanges and only run when it is attached.
if ("package:GenomicRanges" %in% search()) {
    setcolorder(y, c("chr", "val", "pos1", "pos2"))
    ans1 = foverlaps(x, y, type="any", by.x=c("chr", "start", "end"), by.y=c("chr", "pos1", "pos2"), which=TRUE, nomatch=0L)
    test(1372.3, foverlaps(x,y,by.x=1:3, nomatch=0L), data.table(chr=x$chr[2:5], y[c(2,3,3,3), -1, with=FALSE], x[2:5, 2:3, with=FALSE]))
    # build a GRanges object from the first three columns (chr, start, end) of a table
    gr <- function(x) {
        GRanges(Rle(x[[1]]), IRanges(start=x[[2]], end=x[[3]]))
    }
    # findOverlaps() wrapper: returns a plain vector result as-is, otherwise a sorted
    # data.table of (xid, yid) hits
    fo <- function(gr1, gr2, ...) {
        olaps = findOverlaps(gr1, gr2, ...)
        if (is.vector(olaps)) return(olaps)
        ans = setDT(list(xid=queryHits(olaps), yid=subjectHits(olaps)))
        setorder(ans)
        ans
    }
    test(1372.4, setorder(ans1), fo(gr(x), gr(y[, c(1,3,4), with=FALSE]), type="any", select="all"))
    runs = 3L # repeat 3 times..
    types=c("any", "within", "start", "end")
    mults=c("all", "first", "last")
    maxgap=0L; minoverlap=1L
    verbose=FALSE; which=TRUE
    test_no <- 1372.4
    for (run in seq_len(runs)) {
        n1 = max(50L, sample(1e2L, 1, FALSE))
        n2 = max(50L, sample(1e2L, 1, FALSE))
        N = max(100L, sample(1e3L, 1, FALSE))
        # random query intervals (i)
        i1 = sample(N, n1, TRUE)
        i2 = sample(N, n1, TRUE)
        start = pmin(i1,i2)
        end = pmax(i1,i2)
        chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
        i = setDT(list(chr=chr, start=start, end=end))
        # random subject intervals (x)
        i1 = sample(N, n2, TRUE)
        i2 = sample(N, n2, TRUE)
        start = pmin(i1,i2)
        end = pmax(i1,i2)
        chr = sort(sample(paste("Chr", 1:2, sep=""), length(start), TRUE))
        x = setDT(list(chr=chr, start=start, end=end))
        setkey(x); setkey(i)
        for (type in types) {
            for (mult in mults) {
                # data.table overlap join
                nomatch = ifelse(mult == "all", 0L, NA_integer_)
                ans1 = foverlaps(i, x, mult=mult, type=type, nomatch=nomatch, which=which, verbose=verbose)
                ans2 = fo(gr(i), gr(x), type=type, select=mult)
                test_no = signif(test_no+.01, 7)
                # cat("test =", test_no, ", i = ", run, ", type = ", type, ", mult = ", mult, "\n", sep="")
                test(test_no, ans1, ans2)
            }
        }
    }
} else {
    cat("Tests 1372.3+ not run. If required call library(GenomicRanges) first.\n")
}

# fix for bug in address - #824
x = 1:10
address(x) ## shouldn't increment NAM field
out = capture.output(.Internal(inspect(x)))
test(1373, grepl("NAM\\(1\\)", out), TRUE)

# fix for bug #762 - key'd data.table with a non-existing column in 'by' is not handled properly.
DT <- data.table(x=1:5, z=5:1, key="z")
y <- c(1,3,2,3,2)
test(1374.1, DT[, list(x=sum(x)), by=y], data.table(y=c(1,3,2), x=c(5L, 6L, 4L)))
y <- c(1,2,2,3,3)
test(1374.2, DT[, list(x=sum(x)), by=y], data.table(y=c(1,2,3), x=c(5L, 7L, 3L), key="y"))

# order in i combined with := in j, updates those rows in that order
# order in i without := in j, returns new object in that order, which is then updated
# Similarly, subset in i with := in j, updates that subset
DT = as.data.table(iris)
DT[,Species:=as.character(Species)]
test(1375.1, DT[,mean(Petal.Width),by=Species][order(-V1),Species:=toupper(Species)]$Species, c("SETOSA","VERSICOLOR","VIRGINICA"))
test(1375.2, DT[,mean(Petal.Width),by=Species][order(-V1)][,Species:=toupper(Species)]$Species, c("VIRGINICA","VERSICOLOR","SETOSA"))
test(1375.3, DT[,mean(Petal.Width),by=Species][V1>1,Species:=toupper(Species)]$Species, c("setosa","VERSICOLOR","VIRGINICA"))

# Secondary keys a.k.a indexes ...
DT = data.table(a=1:10,b=10:1)
test(1376.1, indices(DT), NULL)
test(1376.2, DT[b==7L,verbose=TRUE], DT[4L], output="Creating new index 'b'")
test(1376.3, indices(DT), "b")
test(1376.4, DT[b==8L,verbose=TRUE], DT[3L], output="Using existing index 'b'")
test(1376.5, DT[a==7L,verbose=TRUE], DT[7L], output="Creating new index") # add 2nd secondary key
test(1376.6, indices(DT), c("b","a")) # 2 secondary keys of single columns
test(1376.7, DT[a==7L,verbose=TRUE], DT[7L], output="Using existing index 'a'")
setkey(DT,b)
test(1376.8, indices(DT), NULL)
test(1376.9, list(DT[a==2L], indices(DT)), list(DT[9L],"a")) # create indices for next test
setindex(DT,NULL)
test(1376.10, list(key(DT), indices(DT)), list("b", NULL))
# with auto indexing switched off no index is created; it is switched back on straight afterwards
options(datatable.auto.index = FALSE)
test(1376.11, list(DT[a==2L], indices(DT)), list(DT[9L],NULL))
options(datatable.auto.index = TRUE)
test(1376.12, list(DT[a==2L], indices(DT)), list(DT[9L],"a"))

# When i is FALSE and a column is being added by reference, for consistency with cases when i is not FALSE
# we should still add
# the column. But we need to know what type it should be, so the user supplied RHS of :=
# needs to work on empty input to tell us the column type. Package vardpoor in example(vardchanges) used to
# rely on DT[FALSE,...] not adding the column and not evaluating RHS but it no longer does that so we can
# make this consistent now. If that usage is required then user should use if(FALSE) DT[...] instead.
DT = data.table(a=1:3, b=4:6)
ans = copy(DT)[, foo:=NA_real_]
test(1377.1, copy(DT)[FALSE, foo:=7], ans)
test(1377.2, copy(DT)[0, foo:=7], ans)
test(1377.3, copy(DT)[, foo := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","b")], data.table(a=1:3, b=4:6, foo=c("1__4","2__5","3__6")))
err = "Some items of .SDcols are not column names"
# .SDcols should always be checked even if RHS (which uses .SDcols) isn't eval'd due to i==FALSE
test(1377.4, copy(DT)[, bar := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","zz")], error=err)
test(1377.5, copy(DT)[FALSE, bar := Reduce(function(x,y)paste(x,y,sep="__"), .SD), .SDcols=c("a","zz")], error=err)
test(1377.6, DT, data.table(a=1:3, b=4:6)) # check that the original hasn't been changed by these tests
test(1377.7, copy(DT)[FALSE, bar:=stop("eval'd")], error="eval'd")
DT[,bar:=NA] # create column so that RHS isn't needed to be eval'd to know type. We don't allow type changes anyway.
# Now no need to eval RHS (and therefore find error), as relied on by package treemap
# in example(random.hierarchical.data) in the do.call of fun=="addRange" where it's called on
# an empty subset and LB <- x[[1]][1] results in NA which causes seq(LB, UB, ...) to error.
test(1377.8, copy(DT)[FALSE, bar:=stop("eval'd")], DT)

#====================================
# fread issue with http download on Windows, thanks to Steve Miller for highlighting.
# any file would do but this one is http://www.russell.com/common/indexes/csvs/russellmicrocapvalueindex_hist.csv
# it happens to have a \r embedded in the first (quoted) column as well but that's not the issue
# can't pass in the http: address directly because this runs on CRAN and any http: site might be unavailable
# therefore, this doesn't actually test mode="wb" but close as we can get
test(1378.1, dim(fread("russellCRLF.csv")), c(19,4))
f = paste("file://",getwd(),"/russellCRLF.csv",sep="") # simulates a http:// request as far as file.download() and unlink() goes, without internet
# download.file() in fread() changes the input data from \r\n to \n, on Windows.
test(1378.2, dim(fread(f, showProgress=FALSE)), c(19,4))
f = paste("file://",getwd(),"/russellCRCRLF.csv",sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode.
test(1378.3, fread(f, showProgress=FALSE), error="Line ending is .*r.*r.*n. R's download.file() appears to add the extra .*r in text mode on Windows. Please download again in binary mode (mode='wb') which might be faster too. Alternatively, pass the URL directly to fread and it will download the file in binary mode for you.")
#====================================

# option datatable.fread.datatable sets the default return class; the data.table= argument overrides it.
# The previous option value is saved in oldv and restored at the end.
oldv = options(datatable.fread.datatable = FALSE)
test(1379.1, fread("A,B\n1,3\n2,4\n"), data.frame(A=1:2,B=3:4))
test(1379.2, fread("A,B\n1,3\n2,4\n",data.table=TRUE), data.table(A=1:2,B=3:4))
options(datatable.fread.datatable = TRUE)
test(1379.3, fread("A,B\n1,3\n2,4\n",data.table=FALSE), data.frame(A=1:2,B=3:4))
options(oldv)

# Check that RHS of == is coerced to x's type before bmerge in auto index.
# Package vardpoor does this in example(linqsr)
DT = data.table(a=c(0,0,1,1,0,0), b=1:6) # 'a' type double here, as it is in vardpoor
test(1380, DT[a==TRUE], DT[3:4])

# Fix #847, as.data.table.list and character(0) issue
x <- data.table(a=character(0), b=character(0), c=numeric(0))
setkey(x, a, b)
test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 is of size 0 but maximum size is 1,")

# Fix for #813 and #758
DT = data.table(x = 1:2)
test(1382.1, DT[c(FALSE, FALSE), list(x, 3:4)], data.table(x=integer(0), V2=integer(0)))
DT <- data.table(id = c("a", "a", "b", "b"), var = c(1.1, 2.5, 6.3, 4.5), key="id")
test(1382.2, DT["c", list(id, check = any(var > 3)), nomatch=0L], data.table(id=character(0), check=logical(0), key="id"))
test(1382.3, DT[c(FALSE), id], character(0))
DT <- DT[1:3]; setkey(DT, id)
test(1382.4, DT[c("c", "b"), list(id, check = any(var > 3)), nomatch=0L], data.table(id="b", check=TRUE, key="id"))

# Fix for #742 - allow.cartesian should be ignored if `i` has no duplicates.
DT <- data.table(id=rep(letters[1:2], 2), var = rnorm(4), key="id")
test(1383.1, DT[letters[1:3], list(var)], DT[1:5, list(var)])

# Fix for #800 - allow.cartesian should be ignored if jsub[1L] has `:=`. TODO: maybe still warn if `i` has duplicates?
DT=data.table(id=c(1,1), date=c(1992,1991), value=c(4.1,4.5), key="id")
test(1383.2, copy(DT)[DT, a:=1], DT[, a := 1])

# Fix for #476 and #825
# Both checks call code from another package on a data.table and then assert that the
# column names of DT are unchanged. Each runs only when its package is attached.
if ("package:reshape" %in% search()) {
    DT <- data.table(ID = c(611557L, 611557L, 611557L, 894125L, 894125L, 894125L, 894125L, 894125L, 898856L, 898856L, 898856L, 898856L, 898856L, 898856L, 898899L, 898899L, 898899L), DATUM = structure(c(16101, 16071, 16261, 16104, 16133, 16167, 16201, 16236, 16089, 16118, 16147, 16176, 16236, 16208, 16163, 16125, 16209), class = "Date"), N = c(25L, 9L, 23L, 29L, 26L, 26L, 27L, 28L, 39L, 39L, 38L, 36L, 40L, 39L, 19L, 20L, 19L), rank = c(2, 1, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 6, 5, 2, 1, 3))
    ans = cast(DT, ID ~ rank, value = "DATUM")
    test(1383.3, names(DT), c("ID", "DATUM", "N", "rank"))
} else {
    cat("Tests 1383.3 not run. If required call library(reshape) first.\n")
}
if ("package:caret" %in% search()) {
    DT <- data.table(x = rnorm(10), y = rnorm(10))
    cv.ctrl <- trainControl(method = 'repeatedcv', number = 5, repeats = 1)
    fit <- train(y ~ x, data = DT, 'lm', trControl = cv.ctrl)
    test(1383.4, names(DT), c("x", "y"))
} else {
    cat("Tests 1383.4 not run. If required call library(caret) first.\n")
}

# Somehow DT[col==max(col)] was never tested, broken by auto-indexing new in v1.9.4, #858
DT = data.table(a = c(1,1,1,2,2,2,3,3,3), b = rnorm(9))
test(1384, DT[a == max(a)], DT[7:9])

# Dups on RHS of == or %in%
DT = data.table(id = paste("id",1:5,sep=""))
id.sub = c("id1", "id2", "id3", "id3", "id4") # deliberate dup
test(1385.1, DT[id %in% id.sub], DT[1:4])
test(1385.2, DT[id == id.sub], DT[1:3])

# reserved class attributes conflict with auto index names
DT = data.table(class=c('a','b'), x=c(1,2))
test(1386, DT[class=='a'], DT[1])

# Fix for #774 - parsing a$b() in 'j'
DT = data.table(x=1:5, y=6:10)
ll = list(foo = function() 1L)
test(1387.1, copy(DT)[, z := ll$foo()], copy(DT)[, z:=1L])
test(1387.2, copy(DT)[, z := ll[[1L]]()], copy(DT)[, z:=1L])

# Fix for #811 - ITime and negative integers formats wrong result.
x = c(1L, -1L, -3700L)
class(x) = "ITime"
test(1388, as.character(x), c("00:00:01", "-00:00:01", "-01:01:40"))

# Fix for #880. Another eval(parse(.)) issue.
DT <- as.data.table(iris)
DT[, foo := "Species"]
test(1389, copy(DT)[,bar := eval(parse(text=foo[1]), envir=.SD)], copy(DT)[, bar := Species])

# Fix for foverlaps() floating point interval (double) types.
Should increment them by machine tolerance, not by 1L DT1 = data.table(start=c(0.88), end=c(0.88)) DT2 = data.table(start=c(0.26, 0.5, 0.55, 0.7), end=c(0.61, 0.88, 0.88-.Machine$double.eps^0.5, 0.89)) setkey(DT2) test(1390.1, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1L, yid=c(2L, 4L))) DT1 = data.table(start=c(0.3,0.5), end=c(0.3,0.5)) DT2 = data.table(start=c(0.4), end=c(0.4)) setkey(DT2) test(1390.2, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:2, yid=as.integer(c(NA, NA)))) tt = c( as.POSIXct('2011-10-11 07:49:36'), as.POSIXct('2011-10-11 07:49:37')) DT1 = data.table(start=tt, end=tt) DT2 = data.table(start=tt[1], end=tt[1]) setkey(DT2) test(1390.3, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:2, yid=as.integer(c(1L, NA)))) tt = c( as.POSIXct('2011-10-11 07:49:36.3'), as.POSIXct('2011-10-11 07:49:37.4'), as.POSIXct('2011-10-11 07:49:37.5')) DT1 = data.table(start=tt, end=tt) DT2 = data.table(start=tt[2], end=tt[2]) setkey(DT2) test(1390.4, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:3, yid=as.integer(c(NA, 1L, NA)))) tt = c( as.POSIXct('2011-10-11 07:49:36.0003'), as.POSIXct('2011-10-11 07:49:36.0199'), as.POSIXct('2011-10-11 07:49:36.0399')) DT1 = data.table(start=tt, end=tt) DT2 = data.table(start=tt[2], end=tt[2]) setkey(DT2) test(1390.5, foverlaps(DT1, DT2, which=TRUE), data.table(xid=1:3, yid=as.integer(c(NA, 1, NA)))) # Fix for #891. 'subset' and duplicate names. # duplicate column names rule - if column numbers, extract the right column. If names, extract always the first column DT = data.table(V1=1:5, V2=6:10, V3=11:15) setnames(DT, c("V1", "V2", "V1")) test(1391.1, subset(DT, select=c(3L,2L)), DT[, c(3L, 2L), with=FALSE]) test(1391.2, subset(DT, select=c("V2", "V1")), DT[, c("V2", "V1"), with=FALSE]) # Test faster version of na.omit() using is_na. 
DT = data.table(x=sample(c(1:2, NA), 30, TRUE), y=sample(c(1:5, NA, NaN), 30, TRUE)) test(1392.1, na.omit(DT), DT[!is.na(x) & !is.na(y)]) # added 'invert = ', a logical argument which when TRUE returns rows that has any NAs instead. test(1392.2, na.omit(DT, invert=TRUE), DT[is.na(x) | is.na(y)]) # Fix for #899. Mix of ordered and normal factors where normal factors in more than 1 data.table has identical levels. DT1 = data.table(A = factor(INT(7,8,7,8,7)), B = factor(6:10), C = 0) DT2 = data.table(D = ordered(1:5), A = factor(INT(1:2,1:2,1L)), C = 0) DT3 = data.table(A = factor(INT(7:8)), C = 0) ans = data.table(A=factor(INT(7,8,7,8,7,1,2,1,2,1,7,8), levels=c("7", "8", "1", "2")), B=factor(INT(6:10, rep(NA,7))), C=0, D=ordered(INT(rep(NA,5), 1:5, rep(NA,2)))) test(1393.1, rbindlist(list(DT1, DT2, DT3), fill = TRUE), ans) # test for #591 (R-Forge #2491) ans[, ID := rep(1:3, c(5,5,2))] setcolorder(ans, c("ID", LETTERS[1:4])) test(1393.2, rbindlist(list(DT1, DT2, DT3), fill = TRUE, idcol="ID"), ans) # Tests for na.omit.data.table (faster version + with a 'cols=' new argument) col = c(1:2, NA_integer_) DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as.logical(sample(col,20,TRUE)), d=as.character(sample(col,20,TRUE))) # can't use complete.cases on bit64... will have to test integer64 separately. # if ("package:bit64" %in% search()) { # DT[, e := as.integer64(sample(col,20,TRUE))] # } test_no = 1394 for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { ans1 = na.omit(DT, cols=cols) ans2 = DT[complete.cases(DT[, cols, with=FALSE])] test_no <<- signif(test_no+.001, 7) test(test_no, ans1, ans2) 0L }) } # That data.table-unaware code in packages like knitr still work if ("package:knitr" %in% search()) { DT = data.table(x=1, y=2) test(1395, kable(DT), output="x.*y.*1.*2") # kable in knitr v1.6 calls DF[...] syntax } else { cat("Test 1395 not run. 
If required call library(knitr) first.\n") } # dropping secondary keys on update or delete DT = data.table(a=1:3, b=4:6) test(1396, DT[a==2, verbose=TRUE], DT[2], output="Creating new index 'a'") test(1397, DT[b==6, verbose=TRUE], DT[3], output="Creating new index 'b'") test(1398, DT[b==6, verbose=TRUE], DT[3], output="Using existing index 'b'") test(1399, indices(DT), c("a","b")) test(1400, DT[2, a:=4L, verbose=TRUE], data.table(a=c(1L,4L,3L),b=4:6), output="Dropping index 'a' due to update on 'a' (column 1)") test(1401, indices(DT), "b") test(1402, DT[,b:=NULL,verbose=TRUE], data.table(a=c(1L,4L,3L)), output="Dropping index 'b' due to delete of 'b' (column 2)") test(1403, indices(DT), NULL) DT = data.table(x=1:5) test(1404, DT[, y := x <= 2L], data.table(x=1:5, y=c(TRUE,TRUE,FALSE,FALSE,FALSE))) test(1405, DT[y == TRUE, .N, verbose=TRUE], 2L, output="Creating new index") test(1406, DT[, y := x <= 3L, verbose=TRUE], data.table(x=1:5, y=c(TRUE,TRUE,TRUE,FALSE,FALSE)), output="Dropping index") test(1407, DT[y == TRUE, .N], 3L) DT = data.table(x=1:5, y=10:6) test(1408, DT[x==3,verbose=TRUE], DT[3], output="Creating") test(1409, indices(DT), "x") set(DT,1:3,1L,-10L) test(1410, indices(DT), NULL) test(1411, DT[x==5], DT[5]) setorder(DT, y) test(1412, indices(DT), NULL) test(1413, DT[x==5], DT[1]) DT = data.table(foo=1:3, bar=4:6, baz=9:7) setindex(DT,foo,bar,baz) test(1414, indices(DT), c("foo__bar__baz")) test(1415, DT[2,bar:=10L,verbose=TRUE], output="Dropping index 'foo__bar__baz' due to update on 'bar'") # test middle test(1416, indices(DT), NULL) setindex(DT,foo,bar,baz) test(1417, DT[2,baz:=10L,verbose=TRUE], output="Dropping index 'foo__bar__baz' due to update on 'baz'") # test last setindex(DT,bar,baz) test(1418, DT[2,c("foo","bar"):=10L,verbose=TRUE], output="Dropping index.* due to update on 'bar'") # test 2nd to 1st setindex(DT,bar,baz) test(1419, DT[2,c("foo","baz"):=10L,verbose=TRUE], output="Dropping index.* due to update on 'baz'") # test 2nd to 2nd # 
setnames updates secondary key DT = data.table(a=1:5,b=10:6) setindex(DT,b) test(1420, indices(DT), "b") setnames(DT,"b","foo") test(1421, indices(DT), "foo") test(1422, DT[foo==9, verbose=TRUE], DT[2], output="Using existing index 'foo'") setindex(DT,a,foo) test(1423, indices(DT), c("foo","a__foo")) # tests as well that order of attributes is retained although we don't use that property currently. test(1424, indices(setnames(DT,"foo","bar")), c("bar","a__bar")) test(1425, indices(setnames(DT,"a","baz")), c("bar","baz__bar")) test(1426, DT[baz==4L, verbose=TRUE], output="Creating new index 'baz'") test(1427, indices(DT), c("bar","baz__bar","baz")) test(1428, DT[bar==9L, verbose=TRUE], output="Using existing index 'bar'") test(1429, indices(setnames(DT,"bar","a")), c("baz", "a", "baz__a")) # Finalised == and %in% optimization in i DT = data.table(a=1:3,b=c(0,2,3,0,0,2)) test(1430, DT[a==1:2], error="RHS of == is length 2 which is not 1 or nrow (6). For robustness, no recycling is allowed (other than of length 1 RHS). Consider %in% instead.") test(1431, DT[a %in% 1:2], DT[c(1,2,4,5)]) test(1432, DT[a==b], DT[2:3]) test(1433, DT[a %in% b], DT[c(2,3,5,6)]) test(1434, DT[a==b+1], DT[c(1,4,6)]) test(1435, DT[b==max(a)], DT[3]) test(1436, DT[a==2,verbose=TRUE], DT[c(2,5)], output="Coercing double column i.'V1' to integer") DT[,a:=factor(letters[a])] test(1437, DT[a==factor("b"),verbose=TRUE], DT[c(2,5)], output="Creating new index 'a'") # fread dec=',' e.g. 
France test(1438, fread("A;B\n1;2,34\n", dec="12"), error="dec must be a single character") test(1439, (if (base::getRversion()<"3.3.0") suppressWarnings else identity)(fread("A;B\n1;2,34\n", dec="1")), error="Unable to change to a locale which provides the desired dec") # this test runs on many machines so chose a dec for this test which is sure not to be valid in any locale test(1440, fread("A;B\n1;2,34\n", sep=".", dec="."), error="The two arguments to fread 'dec' and 'sep' are equal ('.')") if (.Platform$OS.type=="windows" || (!inherits(tt <- try(system("locale -a", intern=TRUE)), "try-error") && "fr_FR.utf8" %in% tt )) { # e.g. on Matt's machine where I've installed fr_FR.utf8 which has dec="," old = options(datatable.fread.dec.locale=if (.Platform$OS.type=="unix") "fr_FR.utf8" else "French_France.1252") oldlocale = Sys.getlocale("LC_NUMERIC") test(1441, fread("A;B\n1;2,34\n", dec=",", verbose=TRUE), data.table(A=1L, B=2.34), output="success!") test(1442, Sys.getlocale("LC_NUMERIC"), oldlocale) # locale restored after success test(1443.1, fread("A;B\n1;2,34\n", dec=",", sep=",", verbose=TRUE), error="'dec' and 'sep' are equal", output="success!") test(1443.2, Sys.getlocale("LC_NUMERIC"), oldlocale) # locale restored after error. [ouput check in 1443.1 ensures it was changed] # sep=".", issue #502 input = paste( paste("192.168.1.", 1:10, sep=""), collapse="\n") test(1444.1, fread(input, sep=".", dec="*"), error="Unable to change to a locale which provides the desired dec") test(1444.2, fread(input, sep="."), ans<-data.table(V1=192L,V2=168L,V3=1L,V4=1:10)) # by default, dec="," when sep="." test(1444.3, fread(paste(paste("192. 168. 1. ", 1:10, sep = ""), collapse="\n"), sep="."), ans) test(1444.4, fread(paste(paste("Hz.BB.GHG.", 1:10, sep = ""), collapse="\n"), sep="."), data.table(V1="Hz",V2="BB",V3="GHG",V4=1:10)) options(old) # return to default set it .onLoad } else { cat("Tests 1441-1444 not run. 
If required install the 'fr_FR.utf8' locale.\n") } # doubled quote inside a quoted field followed by an embedded newline test(1445, fread("doublequote_newline.csv")[7:10], data.table(A=c(1L,1L,2L,1L), B=c("a","embedded \"\"field\"\"\nwith some embedded new\nlines as well","not this one","a"))) # the example from #489 directly : test(1446, fread('A,B,C\n233,"AN ""EMBEDDED"" QUOTE FIELD",morechars\n'), data.table(A=233L, B='AN ""EMBEDDED"" QUOTE FIELD', C='morechars')) # # unescaped quoted subregion followed by newline # # commented this test for now as the logic now is to redirect to normal checks # test(1447, fread('A,B,C\n233,"an unescaped "embedded" # region followed by newline",morechars\n')) # when detecting types ... test(1448.1, fread('A,B\n1,"embedded""\nquote"\n2,should be ok\n'), data.table(A=1:2,B=c('embedded""\nquote','should be ok'))) test(1448.2, fread('A,B\n1,"embedded"" quote"\n2,should be ok\n'), data.table(A=1:2,B=c('embedded"" quote','should be ok'))) if ("package:bit64" %in% search()) { # quoted multiline (scrambled data thanks to #810) test(1449, fread("quoted_multiline.csv")[c(1,43:44),c(1,22:24),with=FALSE], data.table(GPMLHTLN=as.integer64(c("3308386085360","3440245203140","1305220146734")), BLYBZ = c(0L,4L,6L), ZBJBLOAJAQI = c("LHCYS AYE ZLEMYA IFU HEI JG FEYE","",""), JKCRUUBAVQ = c("",".\\YAPCNXJ\\004570_850034_757\\VWBZSS_848482_600874_487_PEKT-6-KQTVIL-7_30\\IRVQT\\HUZWLBSJYHZ\\XFWPXQ-WSPJHC-00-0770000855383.KKZ",""))) } # Fix for #927 DT = data.table(x=1L, y=2L) test(1450, DT[, set(.SD, j="x", value=10L)], error=".SD is locked. Updating .SD by reference using := or set") # Tests for shallow copy taking cols argument - not exported yet. 
DT = setDT(lapply(1:5, sample, 10, TRUE))
ans1 = sapply(DT, address)
fans2 = function(DT, cols=NULL) sapply(shallow(DT, cols), address)
test(1451.1, ans1, fans2(DT)) # make sure default/old functionality is intact
test(1451.2, ans1[3:4], fans2(DT, 3:4)) # using integer column numbers
test(1451.3, ans1[c(5,2)], fans2(DT, c(5,2))) # using numeric column numbers
test(1451.4, ans1[c(4,2,4)], fans2(DT,c(4,2,4))) # using duplicate column numbers
test(1451.5, ans1[3:2], fans2(DT, c("V3", "V2"))) # using column names
test(1451.6, ans1[c(3,3)], fans2(DT, c("V3", "V3"))) # using duplicate column names
test(1451.7, shallow(DT, integer(0)), null.data.table()) # length-0 input work as intended as well.
test(1451.8, shallow(DT, character(0)), null.data.table()) # length-0 input work as intended as well.

test(1452, fread("notexist.csv"), error="File 'notexist.csv' does not exist. Include one or more spaces to consider the input a system command.")

# Test for #802
test(1453, fread("fread_line_error.csv"), error="Expecting 24 cols, but line 12 contains")

# no-sep-found => sep="\n", use case for this in #738
test(1454.1, fread('"Foo"`"Bar"\n1`2\n',sep="`"), data.table(Foo=1L,Bar=2L))
test(1454.2, fread('"Foo"\n1\n',sep="`"), data.table(Foo=1L))

# Fix for #958 - Don't create secondary keys on .SD
DT <- data.table(a=c(1, 1, 1, 0, 0), b=c("A", "B", "A1", "A", "B"))
test(1455, DT[, nrow(.SD[b == 'B']), by=.(a)], data.table(a=c(1,0), V1=1L))

# Test for chmatch2 bug fix
x1 = c("b", "a", "d", "a", "c", "a")
x2 = c("a", "a", "a")
x3 = c("d", "a", "a", "d", "a")
table = rep(letters[1:3], each=2)
test(1456.1, chmatch2(x1, table), as.integer(c(3,1,NA,2,5,NA)))
test(1456.2, chmatch2(x2, table), as.integer(c(1,2,NA)))
test(1456.3, chmatch2(x3, table), as.integer(c(NA,1,2,NA,NA)))

# Add tests for which_
x = sample(c(-5:5, NA), 25, TRUE)
test(1458.1, which(x > 0), which_(x > 0)) # default is TRUE
test(1458.2, which(x > 0), which_(x > 0, TRUE)) # test explicitly
test(1458.3, which(!x > 0), which_(x > 0, FALSE))

# Fix for #982. Testing subsetDT on complex/raw vectors, and added tests for other types.
DT = data.table(a=c(1:3,NA_integer_), b=c(1,2,3,NA), c=as.complex(c(1:3,NA)), d=as.raw(1:4), e=as.list(1:4), f=c(FALSE,FALSE,TRUE,NA), g=c("a", "b", "c", NA_character_))
test(1459.1, .Call("CsubsetDT", DT, which(DT$a > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.2, .Call("CsubsetDT", DT, which(DT$b > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.3, .Call("CsubsetDT", DT, which(Re(DT$c) > 2), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.4, .Call("CsubsetDT", DT, which(DT$d > 2), seq_along(DT)), setDT(as.data.frame(DT)[3:4, , drop=FALSE]))
test(1459.5, .Call("CsubsetDT", DT, which(DT$f), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.6, .Call("CsubsetDT", DT, which(DT$g == "c"), seq_along(DT)), setDT(as.data.frame(DT)[3, , drop=FALSE]))
test(1459.7, .Call("CsubsetDT", DT, which(DT$a > 2 | is.na(DT$a)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.8, .Call("CsubsetDT", DT, which(DT$b > 2 | is.na(DT$b)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.9, .Call("CsubsetDT", DT, which(Re(DT$c) > 2 | is.na(DT$c)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.10, .Call("CsubsetDT", DT, which(DT$f | is.na(DT$f)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.11, .Call("CsubsetDT", DT, which(DT$g == "c" | is.na(DT$g)), seq_along(DT)), setDT(as.data.frame(DT)[3:4,]))
test(1459.12, .Call("CsubsetDT", DT, 5L, seq_along(DT)), setDT(as.data.frame(DT)[5,]))

# Test for na.omit with list, raw and complex types
DT = data.table(x=c(1L,1L,NA), y=c(NA, NA, 1), z=as.raw(1:3), w=list(1,NA,2), v=c(1+5i, NA, NA))
test(1460.1, na.omit(DT, cols="w"), DT)
test(1460.2, na.omit(DT, cols="v"), DT[1])
test(1460.3, na.omit(DT, cols=c("v", "y")), DT[0])
test(1460.4, na.omit(DT, cols=c("z", "v")), DT[1])
test(1460.5, na.omit(DT, cols=c("w", "v")), DT[1])

# Fix for #985
DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4))
test(1461.1, DT[, c(lapply(.SD, mean), lapply(.SD, sd)), by=x], DT[, c(lapply(.SD, function(x) mean(x)), lapply(.SD, function(x) sd(x))), by = x])

# Tests for #994
DT = data.table(x=c("a", "a", "b", "b"), v1=sample(4), v2=sample(4))
cols = c("v1", "v2")
test(1462.1, DT[, mget(cols, as.environment(-1))], DT[, cols, with=FALSE]) # as.environment needed for testing on pre-R3.0.0 which we don't want to depend on yet
test(1462.2, DT[, mget(cols[1], as.environment(-1))], DT[, cols[1], with=FALSE])
test(1462.3, DT[, sum(unlist(mget(cols, as.environment(-1)))), by=x], DT[, sum(unlist(.SD)), by=x, .SDcol=cols])

# test for 'shift'
# integer and factor input
x=1:5
y=factor(x)
test(1463.1, shift(x,1L), as.integer(c(NA, 1:4)))
test(1463.2, shift(x,1:2), list(as.integer(c(NA, 1:4)), as.integer(c(NA, NA, 1:3))))
test(1463.3, shift(x,1L, 0L), as.integer(c(0L, 1:4)))
test(1463.4, shift(x,1L, type="lead"), as.integer(c(2:5, NA)))
test(1463.5, shift(x,1:2, type="lead"), list(as.integer(c(2:5, NA)), as.integer(c(3:5, NA, NA))))
test(1463.6, shift(x,1L, 0L, type="lead"), as.integer(c(2:5, 0L)))
test(1463.7, shift(y,1L), factor(c(NA,1:4), levels=1:5))
test(1463.8, shift(y,1L, type="lead"), factor(c(2:5, NA), levels=1:5))

# double input
x=as.numeric(x)
test(1463.9, shift(x,1L), as.numeric(c(NA, 1:4)))
test(1463.10, shift(x,1:2), list(as.numeric(c(NA, 1:4)), as.numeric(c(NA, NA, 1:3))))
test(1463.11, shift(x,1L, 0L), as.numeric(c(0L, 1:4)))
test(1463.12, shift(x,1L, type="lead"), as.numeric(c(2:5, NA)))
test(1463.13, shift(x,1:2, type="lead"), list(as.numeric(c(2:5, NA)), as.numeric(c(3:5, NA, NA))))
test(1463.14, shift(x,1L, 0L, type="lead"), as.numeric(c(2:5, 0L)))

# integer64 input, only when bit64 is attached
if ("package:bit64" %in% search()) {
  x=as.integer64(x)
  test(1463.15, shift(x,1L), as.integer64(c(NA, 1:4)))
  test(1463.16, shift(x,1:2), list(as.integer64(c(NA, 1:4)), as.integer64(c(NA, NA, 1:3))))
  test(1463.17, shift(x,1L, 0L), as.integer64(c(0L, 1:4)))
  test(1463.18, shift(x,1L, type="lead"), as.integer64(c(2:5, NA)))
  test(1463.19, shift(x,1:2, type="lead"), list(as.integer64(c(2:5, NA)), as.integer64(c(3:5, NA, NA))))
  test(1463.20, shift(x,1L, 0L, type="lead"), as.integer64(c(2:5, 0L)))
}

# character input
x=as.character(x)
test(1463.21, shift(x,1L), as.character(c(NA, 1:4)))
test(1463.22, shift(x,1:2), list(as.character(c(NA, 1:4)), as.character(c(NA, NA, 1:3))))
test(1463.23, shift(x,1L, 0L), as.character(c(0L, 1:4)))
test(1463.24, shift(x,1L, type="lead"), as.character(c(2:5, NA)))
test(1463.25, shift(x,1:2, type="lead"), list(as.character(c(2:5, NA)), as.character(c(3:5, NA, NA))))
test(1463.26, shift(x,1L, 0L, type="lead"), as.character(c(2:5, 0L)))

# logical input
x=c(TRUE,FALSE,TRUE,FALSE,TRUE)
test(1463.27, shift(x,1L), c(NA, x[-5L]))
test(1463.28, shift(x,1:2), list(c(NA, x[-5L]), c(NA, NA, x[-(4:5)])))
test(1463.29, shift(x,1L, 0L), c(FALSE, x[-5L]))
test(1463.30, shift(x,1L, type="lead"), c(x[-1L], NA))
test(1463.31, shift(x,1:2, type="lead"), list(c(x[-1L],NA), c(x[-(1:2)],NA,NA)))
test(1463.32, shift(x,1L, 0L, type="lead"), c(x[-(1)], FALSE))

# for list of list, #1595
x = data.table(foo = c(list(c("a","b","c")), list(c("b","c")), list(c("a","b")), list(c("a"))), id = c(1,1,2,2))
test(1463.33, x[, shift(list(foo)), by=id], data.table(id=c(1,1,2,2), V1=list(NA, c("a", "b", "c"), NA, c("a", "b"))))
test(1463.34, x[, shift(list(foo), type="lead", fill=NA_integer_), by=id], data.table(id=c(1,1,2,2), V1=list(c("b", "c"), NA_integer_, c("a"), NA_integer_)))

# Fix for #1009 segfault in shift
# These tests previously reused ids 1463.33 and 1463.34 from the list-of-list
# tests above; they are renumbered so every reported failure maps to one test.
val = runif(1)
test(1463.35, shift(val, 2L), NA_real_)
test(1463.36, shift(val, 2L, type="lead"), NA_real_)
test(1463.37, shift(1:5, -1L), error="n must be non-negative integer")
test(1463.38, shift(1:5, 1L, fill=c(1:2)), error="fill must be a vector of length")
# add tests for date and factor?
# test for 'give.names=TRUE' on vectors
# Numbered 1463.39: this test used to reuse id 1463.27, which already belongs to
# the logical-vector shift test, making a failure report ambiguous.
x = 1:10
nm = c("x_lag_1", "x_lag_2")
ans = list(as.integer(c(NA, 1:9)), as.integer(c(NA, NA, 1:8)))
setattr(ans, 'names', nm)
test(1463.39, shift(x, 1:2, give.names=TRUE), ans)

# FR #686
DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10) # Seemingly superfluous 'foo' is needed to test fix for #1942
DT[, b := as.integer(factor(a))][, c := as.numeric(factor(a))]
test(1464.1, rleidv(DT, "a"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.2, rleid(DT$a), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.3, rleidv(DT, "b"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.4, rleid(DT$b), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.5, rleidv(DT, "c"), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.6, rleid(DT$c), c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 5L))
test(1464.7, rleid(as.complex(c(1,0+5i,0+5i,1))), error="Type 'complex' not supported")
test(1464.8, rleidv(DT, 0), error="outside range")
test(1464.9, rleidv(DT, 5), error="outside range")
test(1464.11, rleidv(DT, 1:4), 1:nrow(DT))
set.seed(1)
DT = data.table( sample(1:2,20,replace=TRUE), sample(1:2,20,replace=TRUE), sample(1:2,20, replace=TRUE))
test(1464.12, rleidv(DT, 1:4), error="outside range")
test(1464.13, rleidv(DT, 1:2), ans<-INT(1,2,3,4,5,6,6,6,7,8,8,9,10,11,12,13,14,15,16,17))
test(1464.14, rleidv(DT, 2:1), ans)
test(1464.15, rleidv(DT, c(3,1)), INT(1,1,2,2,3,4,5,5,6,7,8,9,10,11,12,13,14,15,16,17))

# data.table-xts conversion #882
if ("package:xts" %in% search()) {
  # Date index
  dt = data.table(index = as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
  xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
  dt_xt = as.data.table(xt)
  xt_dt = as.xts.data.table(dt)
  test(1465.1, all.equal(dt, dt_xt, check.attributes = FALSE))
  test(1465.2, xt, xt_dt)
  # POSIXct index
  dt <- data.table(index = as.POSIXct(as.Date((as.Date("2014-12-12")-49):as.Date("2014-12-12"),origin="1970-01-01"),origin="1970-01-01"),quantity = as.numeric(rep(c(1:5),10)),value = rep(c(1:10)*100,5))
  xt = as.xts(matrix(data = c(dt$quantity, dt$value),ncol = 2,dimnames = list(NULL,c("quantity","value"))),order.by = dt$index)
  dt_xt = as.data.table(xt)
  xt_dt = as.xts.data.table(dt)
  test(1465.3, all.equal(dt, dt_xt, check.attributes = FALSE))
  test(1465.4, xt, xt_dt)
}

# as.data.table.default #969
ar <- array(NA, dim=c(10,4),dimnames = list(NULL,paste("col",1:4,sep="")))
test(1466.1, as.data.table(as.data.frame(ar)), as.data.table(ar)) # array type
x <- rep(Sys.time(),3)
test(1466.2, as.data.table(as.data.frame(x)), as.data.table(x)) # posix type

# fix for #1001, #1002 and #759
# When adding a column, even if i results in no rows, the RHS needs to evaluate so we can know the
# column type to create. Always create the column for consistency that does not depend on the data in i
for (bool in c(FALSE,TRUE)) {
  options(datatable.auto.index=bool)
  DT = data.table(a=1:2)
  # bool*0.03 shifts the ids to 1467.04-1467.06 on the second (TRUE) pass
  test(1467.01 + bool*0.03, copy(DT)[a==3, b:=notExist+1], error="notExist")
  test(1467.02 + bool*0.03, copy(DT)[a==3, b:=a+5L], data.table(a=1:2, b=NA_integer_))
  test(1467.03 + bool*0.03, copy(DT)[a==3, b:=a+5], data.table(a=1:2, b=NA_real_))
}
test(1467.07, getOption("datatable.auto.index")) # ensure to leave TRUE

# fix for first bug reported in #1006 on 'foverlaps()'
x <- c(-0.1, 0, 0.1)
n <- length(x)
dt.ref <- data.table(start=x[-n], end=x[-1], key=c("start", "end"))
dt.query <- data.table(q1=c(-0.2, -0.05, 0.05, 0.15), q2=c(-0.2, -0.05, 0.05, 0.15), key=c("q1", "q2"))
ans=cbind(dt.ref[, .(start,end)], dt.query[2:3, .(q1,q2)])
setkey(ans, q1,q2)
test(1468.1, foverlaps(dt.query, dt.ref, nomatch=0L), ans)
# fix and additional tests for #1006 following OP's follow-up.
# Point query lying inside the second reference interval (double endpoints).
dt1 <- data.table(x=c(-6.36917800737546, -2.19964384651646),
                  y=c(-2.19964384651646, 4.07116428752538))
dt2 <- data.table(x= 2.91816502571793, y=2.91816502571793)
setkey(dt1)
setkey(dt2)
test(1468.2,
     foverlaps(dt2, dt1, which=TRUE),
     data.table(xid=1L, yid=2L))

# Same shape of query with whole-number endpoints.
dt1 <- data.table(x=c(-6,-3), y=c(-3,4))
dt2 <- data.table(x=3,y=3)
setkey(dt1)
setkey(dt2)
test(1468.3,
     foverlaps(dt2, dt1, which=TRUE),
     data.table(xid=1L, yid=2L))

# Fix for #1010 (discovered while fixing #1007). Don't retain key if i had no key, but irows is sorted, and roll != FALSE... See example in #1010.
DT <- data.table(x=c(-5,5), y=1:2, key="x")
test(1469.1, key(DT[J(c(2,0)), roll=TRUE]), NULL)
test(1469.2, key(DT[J(c(2,0)), .(x,y), roll=TRUE]), NULL)
test(1469.3, key(DT[J(c(2,0)), y, roll=TRUE, by=.EACHI]), NULL)
test(1469.4, key(DT[J(c(2,0))]), NULL)
test(1469.5, key(DT[SJ(c(2,0)), roll=TRUE]), "x")
test(1469.6, key(DT[J(c(2,0)), roll="nearest"]), NULL)

# 1007 fix, dealing with Inf and -Inf correctly in rolling joins.
DT <- data.table(x=c(-Inf, 3, Inf), y=1:3, key="x")
test(1470.1,
     DT[J(c(2,-Inf,5,Inf)), roll=Inf],
     data.table(x=c(2,-Inf,5,Inf), y=c(1L, 1:3)))
test(1470.2,
     DT[J(c(2,-Inf,5,Inf)), roll=10],
     data.table(x=c(2,-Inf,5,Inf), y=INT(c(NA, 1, 2, 3))))
test(1470.3,
     DT[SJ(c(2,-Inf,5,Inf)), roll=Inf],
     data.table(x=c(-Inf,2,5,Inf), y=c(1L, 1:3), key="x"))
# 1006, second bug with -Inf, now that #1007 is fixed.
# Reference intervals now start at -Inf and end at Inf.
x <- c(-Inf, -0.1, 0, 0.1, Inf)
n <- length(x)
dt.ref <- data.table(start=x[-n], end=x[-1], key=c("start", "end"))
dt.query <- data.table(q1=c(-0.2, -0.05, 0.05, 0.15),
                       q2=c(-0.2, -0.05, 0.05, 0.15),
                       key=c("q1", "q2"))
test(1471,
     foverlaps(dt.query, dt.ref),
     data.table(dt.ref, dt.query, key=c("q1", "q2")))

# #1014 (segfault) fix
test(1472, shift(1, 1:2, NA, 'lag'), list(NA_real_, NA_real_))

# #528, type=equal simple test
# dt1 = data.table(x=1:5, y=6:10)
# dt2 = data.table(x=3:7, y=8:12)
# setkey(dt1)
# setkey(dt2)
# test(1473, foverlaps(dt1,dt2, which=TRUE, nomatch=0L, type="equal"),
#            data.table(xid=3:5, yid=1:3))

# More tests for `frankv`, #760
DT <- data.table(x=c(4, 1, 4, NA, 1, NA, 4), y=c(1, 1, 1, 0, NA, 0, 2))
test(1474.1, frankv(DT, "y", ties.method="dense"), frankv(DT$y, ties.method="dense"))
test(1474.2, frank(DT, y, ties.method="dense"), frank(DT$y, ties.method="dense"))
test(1474.3, frankv(DT, "y", order=-1L, ties.method="dense"), frankv(-DT$y, ties.method="dense"))
test(1474.4, frank(DT, -y, ties.method="dense"), frank(-DT$y, ties.method="dense"))

# uniqueN, #884, part of #756 and part of #1019
DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), C = rep(1:2, 6))
test(1475.1, uniqueN(DT), 10L)
test(1475.2,
     DT[, .(uN=uniqueN(.SD)), by=A],
     data.table(A=1:3, uN=c(3L,4L,3L)))

# preserve class attribute in GForce mean (and sum)
DT <- data.table(x = rep(1:3, each = 3),
                 y = as.Date(seq(Sys.Date(), (Sys.Date() + 8), by = "day")))
test(1476.1,
     DT[, .(y=mean(y)), x],
     setDT(aggregate(y ~ x, DT, mean)))

# test for 'transpose' of a list, TODO: integer64 support.
# Ragged list of 12 vectors of mixed type: items 1-3 integer, 4-6 double,
# 8-9 logical, and the rest (7, 10-12) character.
ll <- lapply(1:12, function(x) {
  if (x <= 3) {
    sample(10, sample(5:10, 1L))
  } else if (x > 3 && x <= 6) {
    as.numeric(sample(101:115, sample(7:12, 1L)))
  } else if (x > 7 && x <= 9) {
    sample(c(TRUE, FALSE), sample(7:9, 1L), TRUE)
  } else {
    sample(letters, sample(5:10, 1L))
  }
})

# whole mixed-type list
ans1 <- setDT(transpose(ll))
ans2 <- setDT(lapply(seq_along(ans1), function(x) sapply(ll, `[`, x)))
test(1477.1, ans1, ans2)

# double items only
ans1 <- setDT(transpose(ll[4:6]))
ans2 <- setDT(lapply(seq_along(ans1), function(x) sapply(ll[4:6], `[`, x)))
test(1477.9, ans1, ans2)

# logical items only
ans1 <- setDT(transpose(ll[8:9]))
ans2 <- setDT(lapply(seq_along(ans1), function(x) sapply(ll[8:9], `[`, x)))
test(1477.10, ans1, ans2)

# class is preserved?
dt <- data.table(x=1:5, y=6:10)
test(1477.2, transpose(dt), as.data.table(t(as.matrix(dt))))

# factor column coerce to character
ll <- list(factor(letters[1:5]), factor(letters[6:8]))
test(1477.3,
     transpose(ll),
     list(c("a", "f"), c("b", "g"), c("c", "h"), c("d", NA), c("e", NA)))

# for data.frames
test(1477.4,
     transpose(data.frame(x=1:2, y=3:4)),
     data.frame(V1=c(1L,3L), V2=c(2L,4L)))

# test for `tstrsplit`
ll <- sapply(ll, paste, collapse=",")
test(1477.5,
     transpose(strsplit(ll, ",", fixed=TRUE)),
     tstrsplit(ll, ",", fixed=TRUE))
test(1477.6, transpose(1:5), error="l must be a list")
test(1477.7, transpose(list(as.complex(c(1, 1+5i)))), error="Unsupported column type")
test(1477.8, transpose(list(list(1:5))), error="Item 1 of list input is")

# #480 `setDT` and 'lapply'
ll <- list(data.frame(a=1), data.frame(x=1, y=2), NULL, list())
ll <- lapply(ll, setDT)
test(1478.1, sapply(ll, truelength), c(1025L, 1026L, 1024L, 1024L))
test(1478.2, sapply(ll, length), INT(1,2,0,0))

# rbindlist stack imbalance issue, #980.
test(1479, rbindlist(replicate(4,rbindlist(replicate(47, NULL), use.names=TRUE, fill=TRUE)), use.names=TRUE, fill=TRUE), null.data.table())

# #936, assigning list column to a factor column by reference
DT <- data.table(x = factor(c("a", "b c", "d e f")))
test(1480, DT[, x := strsplit(as.character(x), " ")], data.table(x=list("a", letters[2:3], letters[4:6])))

# #970, over-allocation issue
a=data.frame(matrix(1,ncol=101L))
old = options(datatable.alloccol=100L) # remember the previous setting so it can be restored below
ans1 = data.table(a)
options(datatable.alloccol=101L)
ans2 = data.table(a)
test(1481.1, ans2, ans1)
options(datatable.alloccol=0L)
ans3 = data.table(a)
test(1481.2, ans3, ans1)
options(datatable.alloccol=1L)
ans4 = data.table(a)
test(1481.3, ans4, ans1)
options(old)

# #479, check := assignment in environment (actual case is when loaded from disk, but we'll just simulate a scenario here).
ee = new.env()
ee$DT = data.frame(x=1L, y=1:3)
setattr(ee$DT, 'class', c("data.table", "data.frame"))
test(1482.1, truelength(ee$DT), 0L) # make sure that the simulated environment is right.
test(1482.2, ee$DT[, z := 3:1], data.table(x=1L, y=1:3, z=3:1), warning="Invalid .internal.selfref detected and")
test(1482.3, truelength(ee$DT), 1027L)
test(1482.4, ee$DT[, za := 4:6], data.table(x=1L, y=1:3, z=3:1, za=4:6))
test(1482.5, truelength(ee$DT), 1027L) # should have used spare slot i.e. no increase in tl

# Fix for #499 and #945
x <- data.table(k=as.factor(c(NA,1,2)),v=c(0,1,2), key="k")
y <- data.table(k=as.factor(c(NA,1,3)),v=c(0,1,3), key="k")
test(1483.1, x[y], data.table(k=factor(c(NA,1,3)), v=c(0,1,NA), i.v=c(0,1,3), key="k"))
test(1483.2, merge(x,y,all=TRUE), data.table(k=factor(c(NA,1,2,3)), v.x=c(0,1,2,NA), v.y=c(0,1,NA,3), key="k"))
x <- data.table(country="US")
y <- data.table(country=factor("USA"))
# all=TRUE spelled out: `T` is an ordinary variable that other code may reassign
test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US", "USA")), key="country"))
setkey(y)
test(1483.4, y[x], data.table(country=factor("US"), key="country"))

# Fix for #842
SomeFunction <- function(x, setnull=1L) {
  ans <- replicate(length(x), list("bla1", "bla2"), simplify=FALSE)
  ans[setnull] <- list(NULL)
  return(ans)
}
DT <- data.table(ID=1:3, key="ID")
test(1484, DT[, SomeFunction(ID, setnull=1L)], DT[, SomeFunction(ID, setnull=2L)])

# Fix for #868
vals = c("setosa", "versicolor", "virginica")
if (base::getRversion()>="3.1.0") { # depends on bug fix to combn() in R 3.1.0
  test(1485, as.data.table(combn(unique(iris$Species),2)), data.table(vals[1:2], vals[c(1,3)], vals[2:3]))
}

# Fix for #955
DT <- data.table(Time=.POSIXct(0, tz="UTC")+0:1, Value=1:2)
options(datatable.auto.index=FALSE) # Have to turn off to avoid error.
# Results with auto-indexing off (*.1) ...
ans1.1 = DT[Time==Time[1]]
ans2.1 = DT[Time==.POSIXct(0, tz="UTC")]
# ... and with auto-indexing on (*.2); both must agree.
options(datatable.auto.index=TRUE)
ans1.2 = DT[Time==Time[1]]
ans2.2 = DT[Time==.POSIXct(0, tz="UTC")]
test(1486.1, as.data.frame(ans1.1), as.data.frame(ans1.2))
# Compare against ans2.2: this test used to compare ans2.1 with itself and so could never fail.
test(1486.2, as.data.frame(ans2.1), as.data.frame(ans2.2))

# Fix for #832
x <- matrix(1:9, ncol=3)
setattr(x, "names", paste("V", seq_len(length(x)), sep = ""))
test(1487.1, setattr(x, "class", c("data.table", "data.frame")), error="Internal structure doesn't seem to be a list")
x <- matrix(1:9, ncol=3)
class(x) = c("data.table", "data.frame")
# not sure how to test this one, so using `tryCatch`
test(1487.2, tryCatch(print(x), error=function(k) "bla"), "bla")

# Fix for #1043
DT = data.table(grp=LETTERS[1:2], categ=rep(c("X","Y"), each=2L), condition=rep(c("P","Q"), each=4L), value=sample(8))
tbl = with(DT, table(grp, categ, condition))
ans1 = setnames(setDF(data.table(tbl)), "N", "Freq")
ans2 = data.frame(tbl)
ans2[1:3] = lapply(ans2[1:3], as.character)
test(1488, ans1, ans2)

# joins where x is integer type and i is logical type
DT = data.table(x=1:5, y=6:10, key="x")
test(1489, DT[.(TRUE)], DT[1L])

# Fix for #932
DT <- data.table(v1 = c(1:3, NA), v2 = c(1,NA,2.5,NaN), v3=c(NA, FALSE, NA, TRUE), v4=c("a", NA, "b", "c"))
options(datatable.auto.index = TRUE) # just to be sure
setindex(DT, v1)
test(1490.1, DT[v1==3], subset(DT, v1==3))
test(1490.2, DT[!v1==3], subset(DT, !v1==3))
test(1490.3, DT[v1==NA], subset(DT, v1==NA))
test(1490.4, DT[!v1==NA], subset(DT, !v1==NA))
setindex(DT, v2)
test(1490.5, DT[v2==2.5], subset(DT, v2==2.5))
test(1490.6, DT[!v2==2.5], subset(DT, !v2==2.5))
test(1490.7, DT[v2==NA], subset(DT, v2==NA))
test(1490.8, DT[!v2==NA], subset(DT, !v2==NA))
test(1490.9, DT[v2==NaN], subset(DT, v2==NaN))
test(1490.10, DT[!v2==NaN], subset(DT, !v2==NaN))
setindex(DT, v3)
test(1490.11, DT[v3==FALSE], subset(DT, v3==FALSE))
test(1490.12, DT[!v3==FALSE], subset(DT, !v3==FALSE))
test(1490.13, DT[v3==TRUE], subset(DT, v3==TRUE))
test(1490.14, DT[!v3==TRUE], subset(DT, !v3==TRUE))
test(1490.15, DT[v3==NA], subset(DT, v3==NA))
test(1490.16, DT[!v3==NA], subset(DT, !v3==NA))
test(1490.17, DT[(v3)], subset(DT, v3==TRUE))
test(1490.18, DT[!(v3)], subset(DT, !v3==TRUE))
setindex(DT, v4)
test(1490.19, DT[v4=="b"], subset(DT, v4=="b"))
test(1490.20, DT[!v4=="b"], subset(DT, !v4=="b"))
test(1490.21, DT[v4==NA], subset(DT, v4==NA))
test(1490.22, DT[!v4==NA], subset(DT, !v4==NA))

# test for #957
DT <- as.data.table(BOD)
options(datatable.auto.index=FALSE)
ans1 = DT[Time %in% c("1", "2")]
options(datatable.auto.index=TRUE)
ans2 = DT[Time %in% c("1", "2")]
test(1490.23, ans1, ans2)

# test for #961
DT <- as.data.table(cars)
options(datatable.auto.index=FALSE)
ans1 = DT[speed %in% list(1, 4)]
options(datatable.auto.index=TRUE)
ans2 = DT[speed %in% list(1, 4)]
test(1490.24, ans1, ans2)

# replace "." with "list" in 'j'
ee1 = quote(.(val = lm(x ~ .)))
ee2 = quote(.(v1=.(.SD), v2=.(min(y)), v3=.(.(x)), v4=.(x)))
ee3 = quote(.(v1=.(.SD), v2=.(lm(. ~ xx)), v3=.(.(x)), v4=.(x^2)))
ee4 = quote(c("a", "b") := .(.SD))
ee5 = quote(c("a", "b") := .(v1=x^2, v2 = .(.SD[[1L]])))
ee6 = quote(.(v1=.(.SD), v2=.(lm(. ~ xx)), v3=list(.(x)), v4=.(x^2)))
test(1491.1, replace_dot_alias(ee1), quote(list(val = lm(x ~ .))))
test(1491.2, replace_dot_alias(ee2), quote(list(v1=list(.SD), v2=list(min(y)), v3=list(list(x)), v4=list(x))))
test(1491.3, replace_dot_alias(ee3), quote(list(v1=list(.SD), v2=list(lm(. ~ xx)), v3=list(list(x)), v4=list(x^2))))
test(1491.4, replace_dot_alias(ee4), quote(c("a", "b") := list(.SD)))
test(1491.5, replace_dot_alias(ee5), quote(c("a", "b") := list(v1=x^2, v2 = list(.SD[[1L]]))))
test(1491.6, replace_dot_alias(ee6), quote(list(v1=list(.SD), v2=list(lm(. ~ xx)), v3=list(list(x)), v4=list(x^2))))

# Fix for #1050
dt = data.table(x=1:5, y=6:10)
options(datatable.auto.index=FALSE)
ans1 <- dt[x == 2.5]
options(datatable.auto.index=TRUE)
ans2 <- dt[x == 2.5]
test(1492, ans1, ans2)

# Fix for #497
dt = data.table(x=1:10, y=11:20)
test(1493, dt[, .(x=sum(x)),by= x %% 2, verbose=TRUE], data.table(`x%%2`=c(1,0), x=c(25L,30L)), output="by-expression 'x%%2' is not named")

# Fix for #705
DT1 = data.table(date=as.POSIXct("2014-06-22", format="%Y-%m-%d", tz="GMT"))
DT2 = data.table(date=as.Date("2014-06-23"))
test(1494.1, rbind(DT1, DT2), error="Class attributes at column")
test(1494.2, rbind(DT2, DT1), error="Class attributes at column")

# test 1495 has been added to melt's test section (fix for #1055)

# Fix for #1056
DT = data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])
test(1496, DT[, shift(v1, 1:2, NA, "lead", TRUE)], DT[, shift(.SD, 1:2, NA, "lead", TRUE), .SDcols=2L])

# Fix for #1066
DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
test(1497, DT[, .SD, .SDcols = !c("a", "c")], DT[, !c("a", "c"), with=FALSE])

# Fix for #1060
# Logical .SDcols written with TRUE/FALSE: `T` and `F` are ordinary variables
# that other code may reassign.
DT = data.table(x=1, y=2, z=3, a=4, b=5, c=6)
test(1498.1, DT[, .SD, .SDcols=c(TRUE,FALSE)], DT[, c("x", "z", "b"), with=FALSE])
test(1498.2, DT[, .SD, .SDcols=!c(TRUE,FALSE)], DT[, !c("x", "z", "b"), with=FALSE])

# Fix for #1072
dt <- data.table(group1 = "a", group2 = "z", value = 1)
options(datatable.auto.index=FALSE)
ans1 = dt[group1 %in% c("a", "b"), sum(value), group2]
options(datatable.auto.index=TRUE)
ans2 = dt[group1 %in% c("a", "b"), sum(value), group2]
test(1499, ans1, ans2)

# Fix for #488
if ("package:bit64" %in% search()) {
  test(1500.1, fread("x,y\n0,\n", colClasses = list(integer64 = "y")), data.table(x=0L, y=as.integer64(NA)))
  # more tests after new fix
  test(1500.2, fread("x,y\n0,12345678901234\n0,\n0,\n0,\n0,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n12345678901234,\n0,\n0,\n0,\n0,\n0,\n"), data.table(x=as.integer64(c(rep(0L, 5L), rep(NA, 11), 12345678901234, rep(0L,5L))), y=as.integer64(c(12345678901234, rep(NA,21)))))
  x = c("12345678901234", rep("NA", 178), "a")
  y = sample(letters, length(x), TRUE)
  ll = paste(x,y, sep=",", collapse="\n")
  test(1500.3, fread(ll), data.table(V1=c("12345678901234", rep(NA, 178), "a"), V2=y))
  x = c("12345678901234", rep("NA", 178), "0.5")
  y = sample(letters, length(x), TRUE)
  ll = paste(x,y, sep=",", collapse="\n")
  test(1500.4, fread(ll), data.table(V1=suppressWarnings(as.numeric(x)), V2=y))
}

# fix for #1082
dt1 = data.table(x=rep(c("a","b","c"),each=3), y=c(1,3,6), v=1:9, key=c("x", "y"))
dt2 = copy(dt1)
test(1502.1, dt1["a", z := NULL], error="When deleting columns, i should not be provided") # this shouldn't segfault on 'dt1[...]'
test(1502.2, dt1["a", z := 42L], dt2["a", z := 42L])

# fix for #1080
dt = data.table(col1 = c(1,2,3,2,5,3,2), col2 = c(0,9,8,9,6,5,4), key=c("col1"))
test(1503.1, uniqueN(dt, by=key(dt)), 4L) # default on key columns
test(1503.2, uniqueN(dt), 6L) # on all columns
test(1503.3, uniqueN(dt$col1), 4L) # on just that column

# .SDcols and with=FALSE understands colstart:colend syntax
dt = setDT(lapply(1:10, function(x) sample(3, 10, TRUE)))
# .SDcols
test(1504.1, dt[, lapply(.SD, sum), by=V1, .SDcols=V8:V10], dt[, lapply(.SD, sum), by=V1, .SDcols=8:10])
test(1504.2, dt[, lapply(.SD, sum), by=V1, .SDcols=V10:V8], dt[, lapply(.SD, sum), by=V1, .SDcols=10:8])
test(1504.3, dt[, lapply(.SD, sum), by=V1, .SDcols=-(V8:V10)], dt[, lapply(.SD, sum), by=V1, .SDcols=-(8:10)])
test(1504.4, dt[, lapply(.SD, sum), by=V1, .SDcols=!(V8:V10)], dt[, lapply(.SD, sum), by=V1, .SDcols=!(8:10)])
# with=FALSE and auto with=FALSE tests as from v1.9.8
test(1504.5, dt[, V8:V10, with=FALSE], dt[, 8:10, with=FALSE])
test(1504.6, dt[, V8:V10], dt[, 8:10, with=FALSE])
test(1504.7, dt[, V10:V8, with=FALSE], dt[, 10:8, with=FALSE])
test(1504.8, dt[, V10:V8], dt[, 10:8, with=FALSE])
test(1504.9, dt[, -(V8:V10), with=FALSE], dt[, -(8:10), with=FALSE])
test(1504.11, dt[, -(V8:V10)], dt[, -(8:10), with=FALSE])
# Tail of the 1504 series: `!(colstart:colend)` in j must select the same columns
# with and without with=FALSE. `dt` is the 10-column table built for the earlier
# 1504 tests.
test(1504.12, dt[, !(V8:V10), with=FALSE], dt[, !(8:10), with=FALSE])
test(1504.13, dt[, !(V8:V10)], dt[, !(8:10), with=FALSE])

# Fix for #1083
# as.matrix on a data.table with mixed integer/logical columns must match the data.frame method.
dt = data.table(x=1:4, y=c(TRUE,FALSE))
test(1505.1, as.matrix(dt), as.matrix(as.data.frame(dt)))

# setcolorder works with data.frames, #1018
# NOTE(review): the heading mentions data.frames but the input below is a
# data.table -- confirm whether a data.frame case was intended here.
dt = data.table(x=1, y=2)
test(1506, setcolorder(dt, c("y", "x")), data.table(y=2, x=1))

# tstrsplit, #1094
# factor to character
x = factor(paste(letters[1:5], letters[6:10], sep="-"))
test(1507.1, tstrsplit(x, "-"), list(letters[1:5], letters[6:10]))
# type.convert
# Without type.convert every piece stays character; with it the numeric piece becomes integer.
x = paste(letters[1:5], 1:5, sep="-")
test(1507.2, tstrsplit(x, "-"), list(letters[1:5], as.character(1:5)))
test(1507.3, tstrsplit(x, "-", type.convert=TRUE), list(letters[1:5], 1:5))

# implementing #575, keep.rownames can take a name
# Checked in turn for a matrix, a data.frame and a named vector.
x = matrix(1:6, ncol=2)
rownames(x) = letters[3:1]
test(1508.1, as.data.table(x, keep="bla"), data.table(bla=letters[3:1], x))
x = as.data.frame(x)
test(1508.2, as.data.table(x, keep="bla"), data.table(bla=letters[3:1], x))
x = sample(10); setattr(x, 'names', letters[1:10])
test(1508.3, as.data.table(x, keep="bla"), data.table(bla=letters[1:10], x=unname(x)))
# also for setDT
# `ans` is built before setDT() is called because setDT converts `df` by reference.
df = data.frame(x=1:5, y=6:10, row.names=letters[5:1])
ans = data.table(foo=letters[5:1], df)
test(1508.4, setDT(df, keep="foo"), ans)

# #1509 test added for melt above.
# #1510 transpose converts NULL to NAs ll = list(1:2, NULL, 3:4) test(1510.1, transpose(ll), list(c(1L, NA, 3L), c(2L, NA, 4L))) test(1510.2, transpose(ll, ignore=TRUE), list(c(1L, 3L), c(2L, 4L))) # setorder can reorder data.frames too, #1018 DF = data.frame(x=sample(3,10,TRUE), y=sample(letters[1:2], 10, TRUE)) rownames(DF) = sample(letters, 10) ans = DF[order(-xtfrm(DF$y), DF$x), ] test(1511, ans, setorder(DF, -y, x)) # fix for #1108 if ("package:bit64" %in% search()) { dt <- data.table(id = as.integer64(1:3), a = c("a", "b", "c"), key = "id") test(1512.1, dt[.(2)], dt[.(as.integer64(2))]) test(1512.2, dt[.(2L)], dt[.(as.integer64(2))]) dt <- data.table(id = as.numeric(1:3), a = c("a", "b", "c"), key = "id") test(1512.3, dt[.(2L)], dt[.(2)]) test(1512.4, dt[.(as.integer64(2))], dt[.(2)]) dt <- data.table(id = 1:3, a = c("a", "b", "c"), key = "id") test(1512.5, dt[.(2)], dt[.(2L)]) test(1512.6, dt[.(as.integer64(2))], dt[.(2L)]) } # setDT gains key argument, #1121 X = list(a = 4:1, b=runif(4)) test(1513, setkey(as.data.table(X), a), setDT(X, key="a")) # Adding tests for `isReallyReal` x = as.numeric(sample(10)) test(1514.1, isReallyReal(x), FALSE) x = as.numeric(sample(c(1:5, NA))) test(1514.2, isReallyReal(x), FALSE) # NAs are handled properly x = as.numeric(sample(c(1:2, NaN, NA))) test(1514.3, isReallyReal(x), TRUE) x = as.numeric(sample(c(1:2, Inf, NA))) test(1514.4, isReallyReal(x), TRUE) x = as.numeric(sample(c(1:2, -Inf, NA))) test(1514.5, isReallyReal(x), TRUE) x = as.numeric(runif(2)) test(1514.6, isReallyReal(x), TRUE) x = numeric() test(1514.7, isReallyReal(x), FALSE) # #1091 old.option = getOption("datatable.prettyprint.char") options(datatable.prettyprint.char = 5L) DT = data.table(x=1:2, y=c("abcdefghijk", "lmnopqrstuvwxyz")) test(1515.1, grep("abcde...", capture.output(print(DT))), 2L) options(datatable.prettyprint.char = old.option) # test 1516: chain setnames() - used while mapping source to target columns SRC = data.table(x=1:2, 
y=c("abcdefghij", "klmnopqrstuv"), z=rnorm(2)) src_cols <- c("y","z") tgt_cols <- c("name","value") DT <- SRC[, src_cols, with=FALSE][, setnames(.SD, tgt_cols)] test(1516.1, names(SRC), c("x","y","z")) # src not altered by ref test(1516.2, names(DT), tgt_cols) # target expected test(1516.3, unname(unclass(DT[, tgt_cols, with=FALSE])), unname(unclass(SRC[,src_cols, with=FALSE]))) # content match # Fix for #1078 and #1128 x = data.frame(x=1L, y=2L) setattr(x, 'class', c("foo", "data.frame")) test(1517.1, class(as.data.table(x)), c("data.table", "data.frame")) test(1517.2, class(setDT(x)), c("data.table", "data.frame")) x = data.table(x="a", y=2L) setattr(x, 'class', c("foo", "data.table", "data.frame")) test(1517.3, class(as.data.table(x)), c("data.table", "data.frame")) test(1517.4, class(setDT(x)), c("data.table", "data.frame")) # for plm package if ("package:plm" %in% search()) { set.seed(45L) x = data.table(V1=c(1L,2L), V2=LETTERS[1:3], V3=round(rnorm(4),4), V4=1:12) px = pdata.frame(x, index=c("V2", "V4"), drop.index=FALSE, row.names=TRUE) test(1517.5, class(as.data.table(px)), class(x)) test(1517.6, class(setDT(px)), class(x)) } # Fix for setattr, #1142 x = factor(rep(1:4, each=2L)) ax = address(x) setattr(x, 'levels', c("a", "a", "b", "b")) test(1518.1, levels(x), c("a", "b")) test(1518.2, address(x), ax) # Fix for #1074 and #1092 x = data.table(x=c(1,1,1,2), y=1:4, key="x") test(1519.1, x[.(2:3), .N, nomatch=0L], 1L) x = data.table(k = INT(0,2,3,7), o = "b", key = "k") y = data.table(k = 1:5, n = paste("n", 1:5, sep=""), key = "k") test(1519.2, x[y, o := n], data.table(k = INT(0,2,3,7), o = c("b","n2","n3","b"), key = "k")) # Fix for #1141 (thanks to @yvanrichard) x <- data.table(zxc = 1:3, vbn = 4:6) test(1520, x[, c('zxc', 'qwe', 'rty', 'vbn'), with = FALSE], error = "column(s) not found") # Fix for #1154 (unnecessary lock on .SD) x = data.table(a=c(1,1,2))[, unique(.SD)] test(1521, x[, b := 5], data.table(a=c(1,2), b=5)) # Fix for #1160, fastmean retaining 
attributes x = data.table(a = c(2,2,1,1,2), b=setattr(1:5, 'class', c('bla', 'integer'))) test(1522, class(x[, .(mean(b), all(b)), by=a]$V1), c('bla', 'integer')) # Fix for #1145, .N lock handled properly x = data.table(a=1:5) test(1523, x[, head(.SD, n=2)[1:.N]], data.table(a=1:2)) # #637 add by.x and by.y to merge.data.table d1 <- data.table(x1=c(1,3,8), y1=rnorm(3), key="x1") d2 <- data.table(x2=c(3,8,10), y2=rnorm(3), key="x2") ans1 = merge(d1, d2, by.x = "x1", by.y = "x2") ans2 = setkey(setDT(merge.data.frame(d1, d2, by.x = key(d1), by.y = key(d2))), x1) test(1524, ans1, ans2) # 'unique =' argument for CJ, #1148 x = c(1, 2, 1) y = c(5, 8, 8, 4) test(1525, CJ(x, y, unique=TRUE), CJ(c(1,2), c(4,5,8))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) setDT(DT, key = "A") test(1526.1, key(DT), "A") test(1526.2, key(setDT(DT, key = NULL)), NULL) # #501, fread stringsAsFactors=FALSE dt = data.table(x=1:5, y = letters[1:5]) text = "x,y\n1,a\n2,b\n3,c\n4,d\n5,e\n" test(1527.1, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE)) set.seed(1L) dt = data.table(x=1:5, y = sample(letters[1:5])) text = "x,y\n1,b\n2,e\n3,d\n4,c\n5,a\n" test(1527.2, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE)) set.seed(1L) dt = data.table(x=1:5, y = sample(letters[1:2], 5, TRUE)) text = "x,y\n1,a\n2,a\n3,b\n4,b\n5,a\n" test(1527.3, dt[, y := factor(y)], fread(text, stringsAsFactors=TRUE)) # #1027, check.names argument to fread nm1 = names(fread("a,a\n1,2\n3,4", check.names=FALSE)) nm2 = names(fread("a,a\n1,2\n3,4", check.names=TRUE)) nm3 = names(fread("a b,a b\n1,2\n3,4", check.names=TRUE)) test(1528.1, c("a", "a"), nm1) test(1528.2, c("a", "a.1"), nm2) test(1528.3, c("a.b", "a.b.1"), nm3) # add tests for between x = sample(10, 20, TRUE) test(1529.1, between(x, 1L, 5L, TRUE), x >= 1L & x <= 5L) test(1529.2, x %between% c(1L, 5L), x >= 1L & x <= 5L) test(1529.3, between(x, 1L, 5L, FALSE), x > 1L & x < 5L) x = 
sample(c(1:10, NA), 20, TRUE) test(1529.4, between(x, 1L, 5L, TRUE), x >= 1L & x <= 5L) test(1529.5, x %between% c(1L, 5L), x >= 1L & x <= 5L) test(1529.6, between(x, 1L, 5L, FALSE), x > 1L & x < 5L) x = runif(15) test(1529.7, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) test(1529.8, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) test(1529.9, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) x = c(NA, runif(15), NA) test(1529.10, between(x, 0.25, 0.75, TRUE), x >= 0.25 & x <= 0.75) test(1529.11, x %between% c(0.25, 0.75), x >= 0.25 & x <= 0.75) test(1529.12, between(x, 0.25, 0.75, FALSE), x > 0.25 & x < 0.75) # add tests for which.first and which.last # which.first test(1530.1, which.first(sample(5, 20, TRUE)), error = "x not boolean") x <- sample(c(TRUE, FALSE), 20, TRUE) test(1530.2, which.first(x), which(x)[1L]) # which.last test(1530.3, which.last(1:5), error = "x not boolean") test(1530.4, which.last(x), tail(which(x), 1L)) # test xts's last() if ("package:xts" %in% search()) { test(1531, xts::last(1:5), 5L) } # test for like, %like% set.seed(2L) x = apply(matrix(sample(letters, 12), nrow=2), 1, paste, collapse="") y = factor(sample(c(letters[1:5], x), 20, TRUE)) xsub = substring(x, 1L, 1L) test(1532.1, y %like% xsub[1L], grepl(xsub[1L], y)) test(1532.2, y %like% xsub[2L], grepl(xsub[2L], y)) test(1532.3, like(y, xsub[1L]), grepl(xsub[1L], y)) test(1532.4, like(y, xsub[2L]), grepl(xsub[2L], y)) # coverage for setkey() to 100% dt1 = data.table(x=sample(5), y=1:5, key="y") dt2 = as.data.table(dt1); setattr(dt2, 'sorted', NULL) test(1533.1, setkeyv(dt1, character(0)), dt2, warning = "cols is a character vector") test(1533.2, setkeyv(dt1, "x", verbose=TRUE), setkey(dt2, x), output = "forder took") # coverage for %+% and trim test(1534, `%+%.default`(1:5, 6:10), "1,2,3,4,56,7,8,9,10") test(1535.1, trim(" abcde "), "abcde") test(1535.2, trim(" abcde"), "abcde") test(1535.3, trim("abcde "), "abcde") # remaining test for covering duplicated.data.table dt 
= data.table(x=1:5, y=6:10) test(1536, duplicated(dt, incomparables=TRUE), error = "argument 'incomparables != FALSE'") # test for covering melt 100% test(1537 , names(melt(dt, id=1L, variable.name = "x", value.name="x")), c("x", "x.1", "x.2"), output = "Duplicate column names") # test for tables() test(1538, tables(), output = "Total:") # uniqueN not support list-of-list: reverted #1224 d1 <- data.table(a = 1:4, l = list(list(letters[1:2]),list(Sys.time()),list(1:10),list(letters[1:2]))) test(1539, d1[,uniqueN(l)], error = "x must be an atomic vector or data.frames/data.tables") # feature #1130 - joins without setting keys # can't test which=TRUE with DT1.copy's results.. set.seed(45L) DT1 = data.table(x=sample(letters[1:3], 15, TRUE), y=sample(6:10, 15, TRUE), a=sample(100, 15), b=runif(15)) DT2 = CJ(x=letters[1:3], y=6:10)[, mul := sample(20, 15)][sample(15L, 5L)] DT3 = rbindlist(list(DT2, list(x="d", y=7L, mul=100L))) DT3 = DT3[sample(nrow(DT3))] # key on char column DT1.copy = copy(DT1) setkey(DT1.copy, x) test(1540.1, DT1[DT2, on=c(x="x")], DT1.copy[DT2]) test(1540.33, DT1[DT2, on=c("x")], DT1.copy[DT2]) test(1540.2, DT1[DT2, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x"), .SDcols=c("a", "b")], DT1.copy[DT2, lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b")]) test(1540.3, DT1[DT3, on=c(x="x")], DT1.copy[DT3]) test(1540.4, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x"), .SDcols=c("a", "b")], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b")]) test(1540.5, DT1[DT3, on=c(x="x"), nomatch=0L], DT1.copy[DT3, nomatch=0L]) test(1540.6, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x"), .SDcols=c("a", "b"), nomatch=0L], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b"), nomatch=0L]) test(1540.7, DT1[DT3, on=c(x="x"), roll=TRUE], DT1.copy[DT3, roll=TRUE]) test(1540.8, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x"), .SDcols=c("a", 
"b"), roll=TRUE], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b"), roll=TRUE]) # key on integer col DT1.copy = copy(DT1) setkey(DT1.copy, y) test(1540.9, DT1[DT2, on=c(y="y")], DT1.copy[DT2[, c(2,1,3), with=FALSE]]) test(1540.34, DT1[DT2, on=c("y")], DT1.copy[DT2[, c(2,1,3), with=FALSE]]) test(1540.10, DT1[DT2, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(y="y"), .SDcols=c("a", "b")], DT1.copy[DT2[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b")]) test(1540.11, DT1[DT3, on=c(y="y")], DT1.copy[DT3[, c(2,1,3), with=FALSE]]) test(1540.12, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(y="y"), .SDcols=c("a", "b")], DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b")]) test(1540.13, DT1[DT3, on=c(y="y"), nomatch=0L], DT1.copy[DT3[, c(2,1,3), with=FALSE], nomatch=0L]) test(1540.14, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(y="y"), .SDcols=c("a", "b"), nomatch=0L], DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b"), nomatch=0L]) test(1540.15, DT1[DT3, on=c(y="y"), roll=TRUE], DT1.copy[DT3[, c(2,1,3), with=FALSE], roll=TRUE]) test(1540.16, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(y="y"), .SDcols=c("a", "b"), roll=TRUE], DT1.copy[DT3[, c(2,1,3), with=FALSE], lapply(.SD, function(x) x * mul), by=.EACHI, .SDcols=c("a", "b"), roll=TRUE]) # multiple keys DT1.copy = copy(DT1) setkey(DT1.copy, x, y) test(1540.17, DT1[DT2, on=c(x="x", y="y")], DT1.copy[DT2]) test(1540.35, DT1[DT2, on=c("x", "y")], DT1.copy[DT2]) test(1540.18, DT1[DT2, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x", y="y")], DT1.copy[DT2, lapply(.SD, function(x) x * mul), by=.EACHI]) test(1540.19, DT1[DT3, on=c(x="x", y="y")], DT1.copy[DT3]) test(1540.20, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x", y="y")], DT1.copy[DT3, lapply(.SD, function(x) x * mul), 
by=.EACHI]) test(1540.21, DT1[DT3, on=c(x="x", y="y"), nomatch=0L], DT1.copy[DT3, nomatch=0L]) test(1540.22, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x", y="y"), nomatch=0L], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, nomatch=0L]) test(1540.23, DT1[DT3, on=c(x="x", y="y"), roll=TRUE], DT1.copy[DT3, roll=TRUE]) test(1540.24, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="x", y="y"), roll=TRUE], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, roll=TRUE]) # multiple keys, non-identical names DT1.copy = copy(DT1) setkey(DT1.copy, x, y) setnames(DT2, c("q", "r", "mul")) setnames(DT3, names(DT2)) test(1540.25, DT1[DT2, on=c(x="q", y="r")], DT1.copy[DT2]) test(1540.26, DT1[DT2, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="q", y="r")], DT1.copy[DT2, lapply(.SD, function(x) x * mul), by=.EACHI]) test(1540.27, DT1[DT3, on=c(x="q", y="r")], DT1.copy[DT3]) test(1540.28, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="q", y="r")], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI]) test(1540.29, DT1[DT3, on=c(x="q", y="r"), nomatch=0L], DT1.copy[DT3, nomatch=0L]) test(1540.30, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="q", y="r"), nomatch=0L], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, nomatch=0L]) test(1540.31, DT1[DT3, on=c(x="q", y="r"), roll=TRUE], DT1.copy[DT3, roll=TRUE]) test(1540.32, DT1[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, on=c(x="q", y="r"), roll=TRUE], DT1.copy[DT3, lapply(.SD, function(x) x * mul), by=.EACHI, roll=TRUE]) # to do: add tests for := # fix for #477, key not being retained on joins on factor columns set.seed(1) dtp <- data.table(pid = gl(3, 3, labels = c("du", "i", "nouana")), year = gl(3, 1, 9, labels = c("2007", "2010", "2012")), val = rnorm(9), key = c("pid", "year")) dtab <- data.table(pid = factor(c("i", "nouana")), year = factor(c("2010", "2000")), abn = sample(1:5, 2, replace = TRUE), key = c("pid", "year")) 
# Test 1541 (#477): the key must be retained when joining on factor columns.
# `dtp` and `dtab` are the keyed tables built immediately before this point.
test(1541, key(dtp[dtab]), c("pid", "year"))

# fix DT[TRUE, :=] using too much working memory for i, #1249
if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
  f = tempfile()
  N = 1000000 # or any large number of rows
  DT = data.table(A=1:N, B=rnorm(N))
  DT[TRUE, B := B * 2] # stabilize with initial dummy update
  Rprofmem(f)
  DT[TRUE, B := B * 2] # or some in-place update
  Rprofmem(NULL)
  # Counts the profile lines containing "000", i.e. the large allocations.
  test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only
  unlink(f)
}

# rest of #1130 - merge doesn't copy, instead uses joins without keys.
# Every case compares merge.data.frame (via setDF) with merge.data.table (via
# setDT) on the same objects; d1 and d2 are converted back and forth by reference.
set.seed(1L)
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), by="A"))
test(1543.1, ans1, ans2)
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), all=TRUE, by="A"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
# Was numbered 1542.2, a typo: this check belongs to the 1543 merge series.
test(1543.2, ans1, ans2)
# test duplicate name cases
setnames(d2, c("A", "Y"), c("B", "A"))
ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B"))
ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B"))
test(1543.3, ans1, ans2)
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
test(1543.4, ans1, ans2)
ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
test(1543.5, ans1, ans2)
# test for sort=FALSE argument, #1282
set.seed(1L)
d1 <- data.table(A = sample(letters[1:10]), X = 1:10, total = TRUE)
d2 <- data.table(A = sample(letters[5:14]), Y = 1:10, total = FALSE)
# Numbered 1543.9 because 1543.7 is also the id of the #1290 test further down;
# a shared id would make a failure report ambiguous.
test(1543.9, merge(setDT(d1), setDT(d2), by="A", sort=FALSE), setDT(merge(setDF(d1), setDF(d2), by="A", sort=FALSE)))

# thinko in merge dupnames handling
dt1 =
data.table(x=1:5, y1=2L, y2=3L) dt2 = data.table(a=4:6, y2=TRUE, y1 = FALSE) test(1543.6, setDF(merge(dt1, dt2, by.x="x", by.y="a")), merge(as.data.frame(dt1), as.data.frame(dt2), by.x="x", by.y="a")) # fix #1290, restore colorder before setting names set.seed(1) dt1 <- data.table(sex = rep(1:2, 5), group = rep(letters[1:5], 2),V1 = sample(1:10)) set.seed(2) dt2 <- data.table(group = rep(letters[1:5], 2),sex = rep(1:2, 5),V2 = sample(1:10)) test(1543.7, setDF(merge(dt1, dt2, by = c("sex", "group"))), merge(as.data.frame(dt1), as.data.frame(dt2), by=c("sex", "group"))) by.x = c("sex.1", "group.1") by.y = c("sex.2", "group.2") setnames(dt1, 1:2, by.x) setnames(dt2, 1:2, rev(by.y)) test(1543.8, setDF(merge(dt1, dt2, by.x=by.x, by.y=by.y)), merge(as.data.frame(dt1), as.data.frame(dt2), by.x=by.x, by.y=by.y)) # fix for #1258 (bug on .shallow - retains keys when it shouldn't) # nice catch and excellent report from @and3k x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L)) y <- data.table(a2 = 1:3) setkey(y, a2) setkey(x1, a1, a2) test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y))) test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2")) # also test shallow here so as to catch future regressions x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), key="a1,a2") test(1545.1, key(.shallow(x1, cols="a2")), NULL) test(1545.2, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.3, key(.shallow(x1, retain.key=TRUE)), key(x1)) test(1545.4, key(.shallow(x1, cols="a1", retain.key=TRUE)), "a1") # test for #1234 df1 = df2 = data.frame(cats = rep(c('', ' ', 'meow'), 5)) df2[grep("^[ ]*$", df2$cats), "cats"] = NA_integer_ test(1546, set(df1, grep("^[ ]*$", df1$cats), 1L, NA_integer_), df2) # Add test for getdots() function (although it doesn't seem to be used anywhere) foo <- function(x, y, ...) 
{ getdots() } test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in windows, #563 f="issue_563_fread.txt" ans1 <- fread(f, sep=",", header=TRUE) ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") # #1167 print.data.table row id in non-scientific notation DT <- data.table(a = rep(1:5,3*1e6), b = rep(letters[1:3],5*1e6)) test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "14999996: 1 b", "14999997: 2 c", "14999998: 3 a", "14999999: 4 b", "15000000: 5 c")) rm(DT) # PR by @dselivanov # fixes #504 - handle nastring while reading (without coercion to character) # Note: this doesn't address cases like na.strings="-999" yet. See https://github.com/Rdatatable/data.table/pull/1236 for those examples. K = 10L nastrings = c('null', 'NULL', 'na', '_NA', 'NA', 'nan', 'Nan', 'NAN', 'NaN') DT = data.table(int = 1:K, char = sample(letters, size = K, replace = T), float = 1:K + 0.1, bool = sample( c(T, F), K, replace = T)) DT_NA = DT for (j in seq_len( ncol(DT) )) { set(x = DT_NA, i = j, j = j, value = NA) } for(k in seq_along(nastrings)) { dt0 = copy(DT) for (j in seq_len( ncol(DT) )) { set(x = dt0, i = NULL, j = j, value = as.character(dt0[[j]])) set(x = dt0, i = j, j = j, value = nastrings[[k]]) } str = do.call(paste, c(dt0, collapse="\n", sep=",")) str = paste(paste(names(dt0), collapse=","), str, sep="\n") DT_fread = fread(str, na.strings = nastrings, verbose = FALSE) test(1550 + k * 0.1, DT_fread, DT_NA) } # FR #568 str = "a,b\n1.5,\"at the 5\" end of the gene.\"" test(1551.1, fread(str), data.table(a = 1.5, b = "\"at the 5\" end of the gene.\"")) #1256 str = "x,y\nx1,\"oops\" y1\n" test(1551.2, fread(str), data.table(x = "x1", y = "\"oops\" y1")) #1077 str = '2,3\n""foo,bar' test(1551.3, fread(str), data.table(V1 = c("2", "\"\"foo"), V2 = c("3", "bar"))) #1079 str 
= 'L1\tsome\tunquoted\tstuff\nL2\tsome\t"half" quoted\tstuff\nL3\tthis\t"should work"\tok thought' test(1551.4, fread(str), data.table(L1 = c("L2", "L3"), some = c("some", "this"), unquoted = c("\"half\" quoted", "should work"), stuff = c("stuff", "ok thought"))) #1095 rhs = read.table("issue_1095_fread.txt", sep=",", comment.char="", stringsAsFactors=FALSE, quote="", strip.white=TRUE) test(1551.5, fread("issue_1095_fread.txt"), setDT(rhs)) # FR #1314 rest of na.strings issue str = "a,b,c,d\n#N/A,+1,5.5,FALSE\n#N/A,5,6.6,TRUE\n#N/A,+1,#N/A,-999\n#N/A,#N/A,-999,FALSE\n#N/A,1,NA,TRUE" read_table = function(str, ...) { setDT(read.table(text=str, stringsAsFactors=FALSE, comment.char="", sep=",", header=TRUE, ...))[] } test(1552.1, fread(str, na.strings="#N/A"), read_table(str, na.strings="#N/A")) test(1552.2, fread(str, na.strings=c("#N/A", "-999")), read_table(str, na.strings=c("#N/A", "-999"))) test(1552.3, fread(str, na.strings=c("#N/A", "-999", "+1")), read_table(str, na.strings=c("#N/A", "-999", "+1"))) test(1552.4, fread(str, na.strings=c("#N/A", "-999", "+1", "1")), read_table(str, na.strings=c("#N/A", "-999", "+1", "1"))) test(1552.5, fread(str, na.strings=c("#N/A", "-999", "FALSE")), read_table(str, na.strings=c("#N/A", "-999", "FALSE"))) # FR #1177: 'quote' option of 'print.data.table' DT1 <- data.table(s1=paste(" ",LETTERS[1:5],sep=""),s2=LETTERS[1:5]) ans1 <- c(" s1 s2","1: \" A\" \"A\"", "2: \" B\" \"B\"","3: \" C\" \"C\"", "4: \" D\" \"D\"","5: \" E\" \"E\"") ans2 <- c(" s1 s2","1: A A","2: B B", "3: C C","4: D D","5: E E") test(1553.1, capture.output(print(DT1, quote = TRUE)), ans1) test(1553.2, capture.output(print(DT1)), ans2) # #826 - subset DT on single integer vector stored as matrix the same way as data.frame dt <- data.table(a=letters[1:10]) idx <- c(2:4,7L,9:10) dim(idx) <- c(6L, 1L) dimnames(idx) <- list(NULL, "Resample1") # as in caret::createDataPartition test(1554.1, dt[idx], data.table(a=letters[idx])) test(1554.2, dt[-idx], 
data.table(a=letters[(1:10)[-idx]])) test(1554.3, dt[!idx], data.table(a=letters[(1:10)[-idx]])) test(1554.4, idx, structure(c(2L, 3L, 4L, 7L, 9L, 10L), .Dim = c(6L, 1L), .Dimnames = list(NULL, "Resample1"))) # strip.white and other enhancements to 'fread()' # bug #1113 ans1 <- fread("issue_1113_fread.txt") ans2 <- setDT(read.table("issue_1113_fread.txt", header=TRUE)) setnames(ans2, names(ans1)) test(1555.1, ans1, ans2) # bug #1035, take care of spaces automatically. Note that the columns are also read in proper types. Also with quotes when sep is not space. str1=" ITERATION THETA1 THETA2 0 3.95527E+01 2.10651E+01" str2=" ITERATION, THETA1, THETA2 0, 3.95527E+01, 2.10651E+01" str3=" ITERATION , THETA1 , THETA2 0 , 3.95527E+01 , 2.10651E+01" str4=" ITERATION , THETA1 , \"THETA2\" 0 , 3.95527E+01 , 2.10651E+01" str5=" ITERATION , THETA1 , THETA2 bla , 3.95527E+01 , 2.10651E+01" test(1555.2, fread(str1), data.table(ITERATION=0L, THETA1=39.5527, THETA2=21.0651)) test(1555.3, fread(str2), data.table(ITERATION=0L, THETA1=39.5527, THETA2=21.0651)) test(1555.4, fread(str3), data.table(ITERATION=0L, THETA1=39.5527, THETA2=21.0651)) test(1555.5, fread(str4), data.table(ITERATION=0L, THETA1=39.5527, `"THETA2"`=21.0651)) test(1555.6, fread(str5), data.table(ITERATION="bla", THETA1=39.5527, THETA2=21.0651)) # without strip.white # header col spaces are dealt properly irrespective of strip.white test(1555.7, fread(str1, strip.white=FALSE), error="Expecting 4 cols, but line 2 contains") test(1555.8, names(fread(str2, strip.white=FALSE)), c("ITERATION","THETA1","THETA2 ")) test(1555.9, names(fread(str3, strip.white=FALSE)), c("ITERATION ","THETA1 ","THETA2 ")) test(1555.10, names(fread(str4, strip.white=FALSE)), c("ITERATION ","THETA1 ","\"THETA2\" ")) # bug #1035, reply to the post from another user str1=" 22 4 6 4\n 34 22 34 5\n 6 2 1 4\n" str2="22 4 6 4\n34 22 34 5\n6 2 1 4\n" test(1555.11, fread(str1), fread(str2)) # bug #785 rhs <- setDT(read.table("issue_785_fread.txt", 
header=TRUE, stringsAsFactors=FALSE, sep="\t", strip.white=TRUE)) test(1555.12, fread("issue_785_fread.txt"), rhs) # bug #529, http://stackoverflow.com/questions/22229109/r-data-table-fread-command-how-to-read-large-files-with-irregular-separators str1=" YYYY MM DD HH mm 19490 40790 1991 10 1 1 0 1.046465E+00 1.568405E+00" str2="YYYY MM DD HH mm 19490 40790 1991 10 1 1 0 1.046465E+00 1.568405E+00" test(1555.13, fread(str1), fread(str2)) # fix for #1330 test(1556.1, fread("issue_1330_fread.txt", nrow=2), data.table(a=1:2, b=1:2)) test(1556.2, fread("issue_1330_fread.txt", nrow=4), data.table(a=1:2, b=1:2), warning="Stopped reading at empty line 4") # FR #768 str="1,2\n3,4\n" test(1557.1, names(fread(str)), c("V1", "V2")) # autonamed test(1557.2, names(fread(str, col.names=letters[1:2])), letters[1:2]) test(1557.3, names(fread(str, col.names=letters[1])), error="Can't assign 1 names to") test(1557.4, names(fread(str, col.names=letters[1:3])), error="Can't assign 3 names to") test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type") # Fix for #773 f = "issue_773_fread.txt" ans = data.table(AAA=as.integer(c(4,7,rep(1,17),31,21)), BBB=as.integer(c(5,8,rep(2,17),32,22)), CCC=as.integer(c(6,9,rep(3,17),33,23))) test(1558, fread(f, nrow=21L), ans) # no warning # FR # 1338 -- check.names argument of setDT ans=data.table(X=1:3,"X.1"=1:3) dt1<-data.table(X=1:3,X=1:3) df1<-data.frame(X=1:3,X=1:3,check.names=FALSE) ls1<-list("X"=1:3,"X"=1:3) test(1559.1, setDT(dt1, check.names=TRUE), ans) test(1559.2, setDT(df1, check.names=TRUE), ans) test(1559.3, setDT(ls1, check.names=TRUE), ans) # Fix #1140 test(1560.1, data.table(x=letters[1:5])[, 0, with=FALSE], null.data.table()) test(1560.2, data.table(x=letters[1:5])[, c(0,FALSE), with=FALSE], null.data.table()) # Fix for #1298 d = data.table(a = 1) q = quote(.(a)) test(1561, d[, 1, by = eval(q)], d[, 1, by = .(a)]) # Fix for #1315 d = as.IDate(seq(as.Date("2015-01-01"), as.Date("2015-01-15"), by='1 day')) 
# Fix for #1315 (continued): `d` is the IDate sequence created just before this
# point. as.list() and sapply() on it must agree with the Date equivalents.
test(1562.1, as.list(d), lapply(as.list(as.Date(d)), as.IDate))
test(1562.2, sapply(d, identity), as.integer(sapply(as.Date(d), identity)))

# Fix for #1216, .SDcols and with=FALSE should evaluate within frame of 'x' only when it's of the form a:b
# `i` and `s` exist both as columns of dt and (for `i`) as a global variable, so
# these tests show which scope each expression form is resolved in.
# NOTE(review): ids 1557.1-1557.4 and 1558 below are also used by the fread
# col.names / nrow tests earlier in this file -- consider renumbering.
dt = data.table(index1=1:10, index2=10:1, index3=1, s=4, i=24)
i = 2L
test(1557.1, dt[, paste0("index", 1:i), with=FALSE], dt[, index1:index2, with=FALSE])
test(1557.2, dt[, paste0("index", 1:i), with=FALSE], dt[, 1:2, with=FALSE])
test(1557.3, dt[, 5:4, with=FALSE], dt[, i:s, with=FALSE])
test(1557.4, dt[, .SD, .SDcols=paste0("index", 1:i)], dt[, .SD, .SDcols=index1:index2])

# fix for #1354
test(1558, as.ITime(NA), setattr(NA_integer_, 'class', 'ITime'))

# Exactly one of the two branches runs, depending on whether xts is attached.
if (!"package:xts" %in% search()) { # #1347, xts issue from Joshua
  x = as.Date(1:5, origin="2015-01-01")
  test(1559.11, last(x), tail(x, 1L))
} else {
  test(1559.12, last(.xts(1:3,1:3)), .xts(1:3, 1:3)[3, ])
}

# fix for #1352
# A named `by` vector passed to merge() must behave like its unnamed values.
dt1 = data.table(a=1:5, b=6:10, c=11:15)
dt2 = data.table(a=3:6, b=8:11, d=1L)
by_cols = c(x="a", y="b")
test(1560, merge(dt1,dt2, by=by_cols, sort=FALSE), dt1[dt2, nomatch=0L, on=unname(by_cols)])

# FR #1353
# rowid()/rowidv() give the running count of each value (or combination of values).
DT = data.table(x=c(20,10,10,30,30,20), y=c("a", "a", "a", "b", "b", "b"), z=1:6)
test(1561.1, rowid(DT$x), as.integer(c(1,1,2,1,2,2)))
test(1561.2, rowidv(DT, cols="x"), as.integer(c(1,1,2,1,2,2)))
test(1561.3, rowid(DT$x, prefix="group"), paste("group", as.integer(c(1,1,2,1,2,2)), sep=""))
test(1561.4, rowid(DT$x, DT$y), as.integer(c(1,1,2,1,2,1)))
test(1561.5, rowidv(DT, cols=c("x","y")), as.integer(c(1,1,2,1,2,1)))
# convenient usage with dcast
test(1561.6, dcast(DT, x ~ rowid(x, prefix="group"), value.var="z"), data.table(x=c(10,20,30), group1=c(2L,1L,4L), group2=c(3L,6L,5L), key="x"))

# Fix for #1346
DT = data.table(id=1:3, g1=4:6, g2=7:9)
test(1562, melt(DT, measure=patterns("^g[12]"), variable.factor=FALSE), data.table(id=1:3, variable=rep(c("g1","g2"),each=3L), value=4:9))

# test 1563 added for melt above, fix for #1359.
# fix for #1341
# Checks truelength() of the result of a .SD query, and that a column can then be
# added by reference with := to a subset produced that way.
dt <- data.table(a = 1:10)
test(1564.1, truelength(dt[, .SD]), 1025L)
test(1564.2, truelength(dt[a==5, .SD]), 1025L)
test(1564.3, dt[a==5, .SD][, b := 1L], data.table(a=5L, b=1L))

# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now
# The previous option values are kept in `old` and put back by options(old) at
# the end of this group.
dt = data.table(a=sample(3,20,TRUE), b=1:10)
old = options(datatable.optimize = Inf)
ans1 = dt[, .N, by=a]
ans2 = capture.output(dt[, .N, by=a, verbose=TRUE])
test(1565.1, length(grep("GForce optimized j to", ans2))>0L, TRUE) # make sure GForce optimisation works
options(datatable.optimize = 1L)
# make sure result is right
test(1565.2, ans1, dt[, .N, by=a])
options(old)

# Fix for #1212
# Column `c` is a list column holding a data.table; 1566.3 asserts that get("c")
# does not return an object at the same address as dt$c.
set.seed(123)
dt <- data.table(a=c("abc", "def", "ghi"), b=runif(3))[, c:=list(list(data.table(d=runif(1), e=runif(1))))]
test(1566.1, dt[, c], dt[, get("c")])
test(1566.2, dt[, .(c=c)], dt[, .(c=get("c"))])
test(1566.3, address(dt$c) == address(dt[, get("c")]), FALSE)

# Fix for #1207
# Grouping a zero-row table that has a list column.
d1 <- data.table(a = character(), b = list())
test(1567.1, d1[, b, by=a], d1)
test(1567.2, d1[, b, keyby=a], data.table(d1, key="a"))

# Fix for #1334
# keyby on I()-wrapped ordered factors; `y` uses a non-alphabetical level order.
dt = data.table(x=ordered(rep(1:3,each=5)),y=ordered(rep(c("B","A","C"),5),levels=c("B","A","C")),z=1:15)
test(1568, dt[, sum(z), keyby=.(I(x), I(y))], data.table(I=I(ordered(rep(1:3,each=3))), I.1=I(ordered(rep(c("B","A","C"),3),levels=c("B","A","C"))),V1=c(5L, 7L, 3L, 17L, 8L, 15L, 13L, 25L, 27L), key=c("I", "I.1")))

# Test 1569 is written under melt above.
# fix for #1378, merge resets class
# Y carries an extra leading class "custom"; both merge orders are expected to
# return an object whose class equals class(X).
X = data.table(a=1:3, b=4:6)
Y = data.table(a=1L, c=5L)
setattr(Y, 'class', c("custom","data.table","data.frame"))
test(1570.1, class(merge(X, Y, all=TRUE, by="a")), class(X))
test(1570.2, class(merge(Y, X, all=TRUE, by="a")), class(X))

# #1379, tstrsplit gains names argument
# Splitting "ABC" and "DEFG" on "" yields four pieces, so exactly four names are
# accepted.
X = data.table(a=c("ABC", "DEFG"))
test(1571.1, names(tstrsplit(X$a, "", fixed=TRUE, names=TRUE)), paste("V", 1:4, sep=""))
test(1571.2, names(tstrsplit(X$a, "", fixed=TRUE, names=letters[1:3])), error="is not equal to ")
test(1571.3, names(tstrsplit(X$a, "", fixed=TRUE, names=letters[1:4])), letters[1:4])
# tstrsplit also gains 'keep' argument
test(1571.4, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,4)), list(c("B", "E"), c(NA, "G")))
test(1571.5, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,7)), error="should contain integer")
# This test previously reused id 1571.5; it now has its own id so that a failure
# report points at exactly one test.
test(1571.6, tstrsplit(X$a, "", fixed=TRUE, keep=c(2,4), names=letters[1:5]), error="is not equal to")

# fix for #1367, quote="" argument in use. Using embedded quotes in the example below reads the
# first two columns as one. I couldn't find a way to avoid introducing quote argument.
test(1572, fread('"abcd efgh." ijkl.\tmnop "qrst uvwx."\t45\n', quote=""), setDT(read.table(text='"abcd efgh." ijkl.\tmnop "qrst uvwx."\t45\n', sep="\t", stringsAsFactors=FALSE, quote="")))

# Fix for #1384, fread with empty new line, initial checks failed due to extra spaces.
# Test for #1384: the input is a header row, one data row and a final line that
# holds only spaces. The line breaks are written as explicit "\n" escapes so the
# literal stays multi-row data regardless of how the file's whitespace is handled.
test(1573, fread('a,b\n1,2\n  '), data.table(a=1L, b=2L))

# Fix for #1375
X = data.table(a=1:3,b=4:6,c=c("foo","bar","baz"))
test(1574.1, X[.(5), on="b"], X[2])
# A partially named `on` vector must behave like the fully named one.
X = data.table(A=1:3,b=4:6,c=c("foo","bar","baz"))
Y = data.table(A=2:4, B=5:7)
test(1574.2, X[Y, on=c("A",b="B")], X[Y, on=c(A="A", b="B")])
test(1574.3, X[Y, on=c(b="B", "A")], X[Y, on=c(b="B", A="A")])
test(1574.4, X["bar", on="c"], X[2L]) # missed previously

# fix for #1376
X = data.table(a=1:3,b=4:6,c=c("foo","bar","baz"))
Y = data.table(A=2:4, B=5:7)
test(1575.1, X[Y, on=c(A="a")], error="not found in x")
test(1575.2, X[Y, on=c(a="a")], error="not found in i")

# work around for issue introduced in v1.9.4, #1396
X = data.table(x=5:1, y=6:10)
setattr(X, 'index', integer(0))
setattr(attr(X, 'index'), 'x', 5:1) # auto indexed attribute as created from v1.9.4
test(1576, X[, z := 1:5, verbose=TRUE], output = "Dropping index 'x' as.*beginning of its name.*very likely created by v1.9.4 of data.table")

# fix for #1408
# Pipe-separated input, one record per "\n"-terminated row; column b holds NA in
# the third data row (first table) or in every row (second table).
X = fread("a|b|c|d\nthis|is|row|1\nthis|is|row|2\nthis|NA|NA|3\nthis|is|row|4", stringsAsFactors = TRUE)
test(1577.1, is.na(X[3, b]), TRUE)
test(1577.2, levels(X$b), "is")
X = fread("a|b|c|d\nthis|NA|row|1\nthis|NA|row|2\nthis|NA|NA|3\nthis|NA|row|4", colClasses="character", stringsAsFactors = TRUE)
test(1577.3, levels(X$b), character(0))

# FR #530, skip blank lines
input = "a,b\n\n1,3\n2,4"
test(1578.1, fread(input), data.table(V1=1:2, V2=3:4))
test(1578.2, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
input = "a,b\n\n\n1,3\n2,4"
test(1578.3, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
input = "a,b\n\n\n1,3\n\n2,4\n\n"
test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
test(1578.5, fread("530_fread.txt", skip=47L), data.table(V1=1:2, V2=3:4))
test(1578.6, fread("530_fread.txt", skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))

# gforce optimisations
# Random input with integer and double columns, with and without missing values;
# the integer64 columns are added only when bit64 is attached.
dt = data.table(x = sample(letters, 300, TRUE), i1 = sample(-10:10, 300, TRUE), i2 = sample(c(-10:10, NA), 300, TRUE), d1 = as.numeric(sample(-10:10, 300, TRUE)), d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE)))
if ('package:bit64' %in% search()) {
  dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))]
  dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))]
}

# make sure gforce is on
# The current setting is saved in `optim`; it is put back after the last 1579 test.
optim = getOption("datatable.optimize")
options(datatable.optimize=2L)

# testing gforce::gmedian
# The right-hand sides wrap median() in an anonymous function.
test(1579.1, dt[, lapply(.SD, median), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x])
test(1579.2, dt[, lapply(.SD, median, na.rm=TRUE), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x])
test(1579.3, dt[, lapply(.SD, median), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x])
test(1579.4, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x])
ans = capture.output(dt[, lapply(.SD, median), by=x, verbose=TRUE])
test(1579.5, any(grepl("GForce optimized", ans)), TRUE)

# testing gforce::ghead and gforce::gtail
# head(.SD, 1) and tail(.SD, 1) optimisation
test(1579.6, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.7, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x])
test(1579.8, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.9, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x])
test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x])
test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])
test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x])
# NOTE(review): the tail() tests reuse ids 1579.6 onwards from the head() tests
# directly above; renumbering has to be coordinated with the tests that follow.
test(1579.6, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.7, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x])
test(1579.8, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
test(1579.9, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x])
# Continuation of the tail(.SD, 1L) checks; they use `dt` and the optimisation
# level set up before this point.
# NOTE(review): ids 1579.10-1579.13 are also used by the head(.SD, 1L) tests
# earlier in this file.
test(1579.10, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.11, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x])
test(1579.12, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])
test(1579.13, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x])

# mysub() does the same subsetting as .SD[n] through a user-defined function and
# supplies the expected values for the .SD[2] / .SD[2L] tests.
mysub <- function(x, n) x[n]
test(1579.14, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x])
test(1579.15, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x])
test(1579.16, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x])
test(1579.17, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x])
test(1579.18, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x])
test(1579.19, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x])
test(1579.20, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
test(1579.21, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x])
ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE])
test(1579.22, any(grepl("GForce optimized", ans)), TRUE)
# Put back the optimisation level that was saved in `optim`.
options(datatable.optimize=optim)

# test for #1419, rleid doesn't remove names attribute
x = c("a"=TRUE, "b"=FALSE)
nx = copy(names(x))
r = rleid(x)
test(1580, nx, names(x))

# FR #971, partly addressed (only subsets in 'i')
# make sure GForce kicks in and the results are identical
# Each trio runs the same query at optimisation level 1 and level 2, checks the
# verbose output for the expected GForce message, then compares the two results.
dt = dt[, .(x, d1, d2)]
old = options(datatable.optimize=1L)
test(1581.1, ans1 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), lapply(.SD, mean, na.rm=TRUE), lapply(.SD, median, na.rm=TRUE) ), by=x, verbose=TRUE], output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.2, ans2 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), lapply(.SD, mean, na.rm=TRUE), lapply(.SD, median, na.rm=TRUE) ), by=x, verbose=TRUE], output = "GForce optimized j")
test(1581.3, ans1, ans2)
# subsets in 'i' for head and tail
options(datatable.optimize=1L)
test(1581.4, ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.5, ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output = "GForce optimized j")
test(1581.6, ans1, ans2)
options(datatable.optimize=1L)
test(1581.7, ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.8, ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output = "GForce optimized j")
test(1581.9, ans1, ans2)
options(datatable.optimize=1L)
test(1581.10, ans1 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output = "(GForce FALSE)")
options(datatable.optimize=2L)
test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output = "GForce optimized j")
test(1581.12, ans1, ans2)
options(old)

# handle NULL value correctly #1429
test(1582, uniqueN(NULL), 0L)

# bug fix #1461
# Groups mix NaN, NA and ordinary values; the expected values come from
# base::min / base::max with their warnings suppressed.
dt = data.table(x=c(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN))
optim = getOption("datatable.optimize")
# make sure gforce is on
options(datatable.optimize=Inf)
ans1 = suppressWarnings(dt[, base::min(y, na.rm=TRUE), by=x])
ans2 = suppressWarnings(dt[, base::max(y, na.rm=TRUE), by=x])
test(1583.1, dt[, min(y, na.rm=TRUE), by=x], ans1, warning="No non-missing values found")
test(1583.2, dt[, max(y, na.rm=TRUE), by=x], ans2, warning="No non-missing values found")
ans3 = suppressWarnings(dt[, base::min(y), by=x])
ans4 = suppressWarnings(dt[, base::max(y), by=x])
test(1583.3, dt[, min(y), by=x], ans3)
test(1583.4, dt[, max(y), by=x], ans4)
# restore optimisation
options(datatable.optimize=optim)

# Fixed a minor bug in fread when blank.lines.skip=TRUE
# f1 reads with fread (returning a data.frame), f2 reads the same file with
# read.table; the tests below require both to agree.
f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE)
f2 <- function(x, f=TRUE, b=FALSE) read.table(x, fill=f, blank.lines.skip=b, sep=",", header=TRUE, stringsAsFactors=FALSE)
test(1584.1, f1("fread_blank.txt", f=FALSE, b=TRUE), f2("fread_blank.txt", f=FALSE, b=TRUE))
test(1584.2, f1("fread_blank2.txt", f=FALSE, b=TRUE), f2("fread_blank2.txt", f=FALSE, b=TRUE))
test(1584.3, f1("fread_blank3.txt", f=FALSE, b=TRUE), f2("fread_blank3.txt", f=FALSE, b=TRUE))

# fread fill=TRUE, #536. Also takes care of #1124.
test(1585.1, f1("536_fread_fill_1.txt"), f2("536_fread_fill_1.txt"))
test(1585.2, f1("536_fread_fill_1.txt", b=TRUE), f2("536_fread_fill_1.txt", b=TRUE))
test(1585.3, f1("536_fread_fill_2.txt"), f2("536_fread_fill_2.txt"))
test(1585.4, f1("536_fread_fill_2.txt", b=TRUE), f2("536_fread_fill_2.txt", b=TRUE))
test(1585.5, f1("536_fread_fill_3_extreme.txt"), f2("536_fread_fill_3_extreme.txt"))
test(1585.6, f1("536_fread_fill_3_extreme.txt", b=TRUE), f2("536_fread_fill_3_extreme.txt", b=TRUE))
# no warning about bumping type. when fill=TRUE, column type detection starts at first non-empty line (which makes sense).
test(1585.7, f1("536_fread_fill_4.txt"), f2("536_fread_fill_4.txt"))
test(1585.8, f1("536_fread_fill_4.txt", b=TRUE), f2("536_fread_fill_4.txt", b=TRUE))
# TODO: add a test when fill=FALSE, but blank.lines.skip=TRUE, when the same effect should happen
# TODO: fix and add test for cases like this:
# a,b,c
# 1,2,3
# 4,5,6
# 7,8,9,6 # extra column, but we've only detected 3 cols
# 1,2,3
# ...
# fix for #721
text="x,y\n1,a\n2,b\n"
test(1586.1, fread(text, colClasses=c("integer", "factor")), data.table(x=1:2, y=factor(letters[1:2])))
test(1586.2, fread(text, colClasses=c(x="factor")), data.table(x=factor(1:2), y=letters[1:2]))

# FR #590
text="x,y\n2,a\n1,q\n3,c\n"
test(1587, fread(text, key="y"), setDT(fread(text), key="y"))

# fix for #1361
dt = data.table(i=1:10, f=as.factor(1:10))
test(1588.1, dt[f %in% 3:4], dt[3:4])
test(1588.2, dt[f == 3], dt[3])

# fix for #1484
# Runs only when xts is attached to the search path.
if ("package:xts" %in% search()) {
  x = xts::as.xts(8, order.by = as.Date("2016-01-03"))
  test(1589, all.equal(as.data.table(x), data.table(index = as.Date("2016-01-03"), V1 = 8), check.attributes=FALSE))
}

# encoding issue in forder
# `x` and `xx` hold the same text marked latin1 and converted to UTF-8; the
# ordering from forderv() must agree with order(). A zero-length forderv() result
# is replaced by seq_along(y).
x <- "fa\xE7ile"
Encoding(x)
Encoding(x) <- "latin1"
xx <- iconv(x, "latin1", "UTF-8")
y = sample(c(x,xx), 10, TRUE)
oy = if (length(oy <- forderv(y))) oy else seq_along(y)
test(1590.4, oy, order(y))
Encoding(xx) = "unknown"
y = sample(c(x,xx), 10, TRUE)
oy = if (length(oy <- forderv(y))) oy else seq_along(y)
test(1590.5, oy, order(y))

# #1432 test
# rbindlist() over a named list of plain lists; idcol carries the list names.
list_1 = list(a = c(44,47), dens = c(2331,1644))
list_2 = list(a=66, dens= 1890)
list_3 = list(a=c(44,46,48,50), dens=c(8000,1452,1596,7521))
mylist = list(list_1, list_2, list_3)
setattr(mylist, 'names', c("ID_1","ID_2","ID_3"))
ans = data.table(id=rep(c("ID_1","ID_2","ID_3"), c(2,1,4)), a=c(44,47,66,44,46,48,50), dens=c(2331,1644,1890,8000,1452,1596,7521))
test(1591, rbindlist(mylist, idcol="id"), ans)

# FR #1443
DT <- data.table(x = 1:3, y = 4:6, z = 7:9)
test(1592.1, setnames(DT, -5, "bla"), error="Items of 'old'")
test(1592.2, names(setnames(DT, -1, c("m", "n"))), c("x", "m", "n"))

# fix for #1513
test(1593, CJ(c(1,2,2), c(1,2,3)), data.table(V1=rep(c(1,2), c(3,6)), V2=c(1,2,3,1,1,2,2,3,3), key=c("V1", "V2")))

# FR #523, var, sd and prod
# ensure gforce is on. The previous option value is saved in `old` and restored
# after 1594.9 so the setting does not leak into the tests that follow.
old = options(datatable.optimize = Inf)
DT = data.table(x=sample(5, 100, TRUE), y1=sample(6, 100, TRUE), y2=sample(c(1:10,NA), 100, TRUE), z1=runif(100), z2=sample(c(runif(10),NA,NaN), 100, TRUE))
test(1594.1, DT[, lapply(.SD, var, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::var, na.rm=FALSE), by=x])
test(1594.2, DT[, lapply(.SD, var, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::var, na.rm=TRUE), by=x])
test(1594.3, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gvar")
test(1594.4, DT[, lapply(.SD, sd, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::sd, na.rm=FALSE), by=x])
test(1594.5, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x])
test(1594.6, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gsd")
test(1594.7, DT[, lapply(.SD, prod, na.rm=FALSE), by=x], DT[, lapply(.SD, base::prod, na.rm=FALSE), by=x])
test(1594.8, DT[, lapply(.SD, prod, na.rm=TRUE), by=x], DT[, lapply(.SD, base::prod, na.rm=TRUE), by=x])
test(1594.9, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gprod")
options(old)

# FR #1517
dt1 = data.table(x=c(1,1,2), y=1:3)
dt2 = data.table(x=c(2,3,4), z=4:6)
test(1595, merge(dt1,dt2), merge(dt1,dt2, by="x"))

# FR 1512, drop argument for dcast.data.table
# `drop` is given as a length-2 logical; the expected values are subsets of the
# drop=FALSE result.
DT <- data.table(v1 = c(1.1, 1.1, 1.1, 2.2, 2.2, 2.2), v2 = factor(c(1L, 1L, 1L, 3L, 3L, 3L), levels=1:3), v3 = factor(c(2L, 3L, 5L, 1L, 2L, 6L), levels=1:6), v4 = c(3L, 2L, 2L, 5L, 4L, 3L))
ans1 <- dcast(DT, v1+v2~v3, value.var="v4", drop=FALSE)
test(1596.1, dcast(DT, v1+v2~v3, value.var="v4", drop=c(FALSE, TRUE)), ans1[, -6, with=FALSE])
test(1596.2, dcast(DT, v1+v2~v3, value.var="v4", drop=c(TRUE, FALSE)), ans1[c(1,6)])

# bug fix #1495
dt = data.table(id=1:30, nn = paste0('A', 1:30))
smp = sample(30, size =10)
lgl = dt$id %in% smp
test(1597, dt[lgl, ], dt[id %in% smp])

# FR #643
vv = sample(letters[1:3], 10, TRUE)
test(1599.1, data.table(x=vv, y=1:10, stringsAsFactors=TRUE)$x, factor(vv))
vv = sample(c(letters[1:3], NA), 10, TRUE)
test(1599.2, data.table(x=vv, y=1:10, stringsAsFactors=TRUE)$x, factor(vv))

# bug #1477 fix
DT <- data.table(a = 0L:1L, b = c(1L, 1L))
test(1600.1, DT[ , lapply(.SD, function(x) if (all(x)) x)], data.table(b=c(1L, 1L)))
# this fix wasn't entirely nice as it introduced another issue.
# it's fixed now, but adding a test for that issue as well to catch it early next time.
set.seed(17022016L)
DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"), z1 = sample(100L, 6L), z2 = sample(letters, 6L))
DT2 = data.table(id1=c("c", "w", "b"), val=50:52)
test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla"))

# warn when merge empty data.table #597
# tryCatch() turns the warning into its message so it can be compared exactly.
test(1601.1, merge(data.table(a=1),data.table(a=1), by="a"), data.table(a=1, key="a"))
test(1601.2, tryCatch(merge(data.table(a=1),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'y' argument is 0 columns data.table.")
test(1601.3, tryCatch(merge(data.table(NULL),data.table(a=1), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' argument is 0 columns data.table.")
test(1601.4, tryCatch(merge(data.table(NULL),data.table(NULL), by="a"), warning = function(w) w$message), "You are trying to join data.tables where 'x' and 'y' arguments are 0 columns data.table.")

# migrate `chron` dependency to Suggests #1558
dd = as.IDate("2016-02-28")
tt = as.ITime("03:04:43")
if(!requireNamespace("chron", quietly = TRUE)){
  # Since chron is recommended package and installed by default, perhaps it's ok doing requireNamespace
  test(1602.1, as.chron.IDate(dd), error = "Install suggested `chron` package to use `as.chron.IDate` function.")
  test(1602.2, as.chron.ITime(tt), error = "Install suggested `chron` package to use `as.chron.ITime` function.")
} else {
  test(1602.3, as.chron.IDate(dd), chron::as.chron(as.Date(dd)))
  test(1602.4, class(as.chron.ITime(tt)), "times")
}

# fix for #1549
# NOTE(review): `x` here is whatever an earlier section left in the workspace
# (the encoding test, or the xts test when xts is attached) -- confirm intended.
d1 <- data.table(v1=1:2,x=x)
d2 <- data.table(v1=3:4)
test(1603.1, rbindlist(list(d2, d1), fill=TRUE), rbindlist(list(d1,d2), fill=TRUE)[c(3:4, 1:2)])

# fix for #1440
DT = data.table(a=1:3, b=4:6)
myCol = "b"
test(1604, DT[,.(myCol),with=F], error="When with=FALSE,")

# fix for segfault #1531
DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9)
test(1605, DT[order(-x, "D")], error="Column 2 is length 1 which differs")

# fix for #1503, fread's fill argument polishing
test(1606, fread("2\n1,a,b", fill=TRUE), data.table(V1=2:1, V2=c("","a"), V3=c("","b")))

# fix for #1476
dt = data.table(resp=c(1:5))
wide = copy(list(metrics = dt))$metrics # copy here copies the list of data.table and therefore didn't overallocate before..
test(1607, wide[, id := .I], data.table(resp = 1:5, id = 1:5))

# better fix for #1462, + improved error message (if this better fix fails)
# no need for quote="" and sep="\t"..
test(1608, dim(fread('issue_1462_fread_quotes.txt', header=FALSE)), c(4L, 224L))

# fix for #1164
test(1609, fread("issue_1164_json.txt"), data.table(json1='{""f1"":""value1"",""f2"":""double quote escaped with a backslash [ \\"" ]""}', string1="string field"))

# set of enhancements to print.data.table for #1523
# dplyr-like column summary
icol = 1L:3L
Dcol = as.Date(paste0("2016-01-0", 1:3))
DT1 = data.table(lcol = list(list(1:3), list(1:3), list(1:3)), icol, ncol = as.numeric(icol), ccol = c("a", "b", "c"), xcol = as.complex(icol), ocol = factor(icol, ordered = TRUE), fcol = factor(icol))
# NOTE(review): the expected print rows below look whitespace-collapsed and the
# second row is a single space -- confirm the literals against the upstream file.
test(1610.1, capture.output(print(DT1, class=TRUE)), c(" lcol icol ncol ccol xcol ocol fcol", " ", "1: 1 1 a 1+0i 1 1", "2: 2 2 b 2+0i 2 2", "3: 3 3 c 3+0i 3 3"))

# fails on travis and appveyor; no idea why.. Passes on my mac and windows machine.
# test(1610.2, capture.output(print(DT2, class=TRUE))
#      c(" Dcol Pcol gcol Icol ucol",
#        " ",
#        "1: 2016-01-01 2016-01-01 01:00:00 TRUE 2016-01-01 1",
#        "2: 2016-01-02 2016-01-02 01:00:00 TRUE 2016-01-02 2",
#        "3: 2016-01-03 2016-01-03 01:00:00 TRUE 2016-01-03 3"))

# fix for #833
# Lists whose second element is a 5x5 matrix; l1 names its first element, l2 does not.
# The expected value is built from as.data.frame() on the same list, with the
# column names spelled out explicitly.
l1 = list(a = 1:5, matrix(1:25, nrow = 5, ncol = 5))
l2 = list(1:5, matrix(1:25, nrow = 5, ncol = 5))
test(1611.1,
     as.data.table(l1),
     setnames(setDT(as.data.frame(l1)), c("a", paste0("V", 1:5))))
test(1611.2,
     as.data.table(l2),
     setnames(setDT(as.data.frame(l2)), c("V1", "V1.1", paste0("V", 2:5))))

# fix for #646
# tz= is explicitly specified otherwise CRAN's solaris (both sparc and x86) fail. It may not be solaris per se
# but something related to the timezone of the two solaris machines. I guess one or the other of as.POSIXct or
# as.POSIXlt create the 'tzone' attribute differently for default tz="", just on solaris. I checked test.data.table
# already uses all.equal(), not identical(). So I don't think it is an accuracy problem. But could be wrong.
ll = list(a=as.POSIXlt("2015-01-01", tz='UTC'), b=1:5) test(1612.1, as.data.table(ll), data.table(a=as.POSIXct("2015-01-01", tz='UTC'), b=1:5), warning="POSIXlt column type detected") dt = data.table(d1="1984-03-17") ans = data.table(d1="1984-03-17", d2=as.POSIXct("1984-03-17", tz='UTC')) test(1612.2, dt[, d2 := strptime(d1, "%Y-%m-%d", tz='UTC')], ans, warning="POSIXlt column type detected and converted") ll = list(a=as.POSIXlt("2015-01-01"), b=2L) test(1612.3, setDT(ll), error="Column 1 is of POSIXlt type") # tests for all.equal.data.table #1106 # diff nrow DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = c(1:4,4L), b = letters[c(1:4,4L)]) test(1613.1, all.equal(DT1, DT2), "Different number of rows") # diff ncol DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 1:4) test(1613.2, all.equal(DT1, DT2), c("Different number of columns", "Different column names")) # diff colnames DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(aa = 1:4, bb = letters[1:4]) test(1613.3, all.equal(DT1, DT2), "Different column names") # diff column order DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(b = letters[1:4], a = 1:4) test(1613.4, all.equal(DT1, DT2), "Different column order") test(1613.5, all.equal(DT1, DT2, ignore.col.order=TRUE), TRUE) # diff row order DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 4:1, b = letters[4:1]) test(1613.6, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 0.8") test(1613.7, all.equal(DT1, DT2, ignore.row.order=TRUE), TRUE) # diff column order and diff row order DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(b = letters[4:1], a = 4:1) test(1613.8, all.equal(DT1, DT2), "Different column order") test(1613.9, all.equal(DT1, DT2, ignore.row.order=TRUE), "Different column order") test(1613.10, all.equal(DT1, DT2, ignore.col.order=TRUE), "Column 'a': Mean relative difference: 0.8") test(1613.11, all.equal(DT1, DT2, ignore.row.order=TRUE, 
ignore.col.order=TRUE), TRUE) # non-overlapping duplicates DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)]) DT2 <- data.table(a = c(1:4,3:4), b = letters[c(1:4,3:4)]) test(1613.12, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 1.333333") test(1613.13, all.equal(DT1, DT2, ignore.row.order=TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity") # overlapping duplicates DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)]) DT2 <- data.table(a = c(1:4,2:1), b = letters[c(1:4,2:1)]) test(1613.14, all.equal(DT1, DT2), "Column 'a': Mean relative difference: 0.6666667") test(1613.15, all.equal(DT1, DT2, ignore.row.order=TRUE), TRUE) # mixed overlapping duplicates DT1 <- data.table(a = c(1:4,1:2), b = letters[c(1:4,1:2)]) DT2 <- data.table(a = c(1:4,2:3), b = letters[c(1:4,2:3)]) test(1613.16, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity") # overlapping duplicates not equal in count DT1 <- data.table(a = c(1:4, rep(1L,3), rep(2L,2)), b = letters[c(1:4, rep(1L,3), rep(2L,2))]) DT2 <- data.table(a = c(1:4, rep(1L,2), rep(2L,3)), b = letters[c(1:4, rep(1L,2), rep(2L,3))]) test(1613.17, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity") # overlapping duplicates equal in count DT1 <- data.table(a = c(1:4, 1L, 2L, 1L, 2L), b = letters[c(1:4, 1L, 2L, 1L, 2L)]) DT2 <- data.table(a = c(2L, 1L, 1L, 2L, 1:4), b = letters[c(2L, 1L, 1L, 2L, 1:4)]) test(1613.18, all.equal(DT1, DT2, ignore.row.order = TRUE), TRUE) # subset with overlapping duplicates DT1 <- data.table(a = c(1:3,3L), b = letters[c(1:3,3L)]) DT2 <- data.table(a = c(1:4), b = letters[c(1:4)]) test(1613.19, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'target' has duplicate rows while 'current' doesn't") # different number of unique rows DT1 <- data.table(a = c(1:3,2:3), b = 
letters[c(1:3,2:3)]) DT2 <- data.table(a = c(1L,1:4), b = letters[c(1L,1:4)]) test(1613.20, all.equal(DT1, DT2, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity") test(1613.21, all.equal(DT2, DT1, ignore.row.order = TRUE), "Dataset 'current' has rows not present in 'target' or present in different quantity") # test attributes: key DT1 <- data.table(a = 1:4, b = letters[1:4], key = "a") DT2 <- data.table(a = 1:4, b = letters[1:4]) test(1613.22, all.equal(DT1, DT2), "Datasets has different keys. 'target': a. 'current' has no key.") test(1613.23, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) test(1613.24, all.equal(DT1, setkeyv(DT2, "a"), check.attributes = TRUE), TRUE) # test attributes: index DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 1:4, b = letters[1:4]) setindexv(DT1, "b") test(1613.25, all.equal(DT1, DT2), "Datasets has different indexes. 'target': b. 'current' has no index.") test(1613.26, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) test(1613.27, all.equal(DT1, setindexv(DT2, "a")), "Datasets has different indexes. 'target': b. 'current': a.") test(1613.28, all.equal(DT1, setindexv(DT2, "b")), "Datasets has different indexes. 'target': b. 
'current': a, b.") test(1613.29, all.equal(DT1, setindexv(setindexv(DT2, NULL), "b")), TRUE) # test custom attribute DT1 <- data.table(a = 1:4, b = letters[1:4]) DT2 <- data.table(a = 1:4, b = letters[1:4]) setattr(DT1, "custom", 1L) test(1613.30, all.equal(DT1, DT2), "Datasets has different number of (non-excluded) attributes: target 3, current 2") test(1613.31, all.equal(DT1, DT2, check.attributes = FALSE), TRUE) setattr(DT2, "custom2", 2L) test(1613.32, all.equal(DT1, DT2), "Datasets has attributes with different names: custom, custom2") setattr(DT1, "custom2", 2L) setattr(DT2, "custom", 0L) if (base::getRversion() > "3.0.0") test(1613.33, all.equal(DT1, DT2), paste0("Attributes: < Component ", dQuote("custom"), ": Mean relative difference: 1 >")) setattr(DT2, "custom", 1L) test(1613.34, all.equal(DT1, DT2), TRUE) # trim.levels dt1 <- data.table(A = factor(letters[1:10])[1:4]) # 10 levels dt2 <- data.table(A = factor(letters[1:5])[1:4]) # 5 levels test(1613.35, all.equal(dt1, dt2)) test(1613.36, !isTRUE(all.equal(dt1, dt2, trim.levels = FALSE))) test(1613.37, !isTRUE(all.equal(dt1, dt2, trim.levels = FALSE, check.attributes = FALSE))) test(1613.38, all.equal(dt1, dt2, trim.levels = FALSE, ignore.row.order = TRUE)) test(1613.39, length(levels(dt1$A)) == 10L && length(levels(dt2$A)) == 5L, TRUE) # dt1 and dt2 not updated by reference # unsupported column types: list dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = lapply(1:4, function(x) new.env())) test(1613.40, all.equal(dt, dt), TRUE) test(1613.41, all.equal(dt, dt, ignore.row.order = TRUE), error = "Datasets to compare with 'ignore.row.order' must not have unsupported column types: list") # unsupported type in set-ops: complex, raw dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = as.complex(1:4), V4 = as.raw(1:4), V5 = lapply(1:4, function(x) NULL)) test(1613.42, all.equal(dt, dt), TRUE) test(1613.43, all.equal(dt, dt, ignore.row.order = TRUE), error = "Datasets to compare with 'ignore.row.order' must not 
have unsupported column types: raw, complex, list") # supported types multi column test dt = data.table( V1 = 1:4, V2 = as.numeric(1:4), V3 = letters[rep(1:2, 2)], V4 = factor(c("a","a","b","b")), V5 = as.POSIXct("2016-03-05 12:00:00", origin="1970-01-01")+(1:4)*3600, V6 = as.Date("2016-03-05", origin="1970-01-01")+(1:4) )[, V7 := as.IDate(V6) ][, V8 := as.ITime(V5)] test(1613.441, all.equal(dt, dt), TRUE) test(1613.442, all.equal(dt, dt, ignore.row.order = TRUE), TRUE) test(1613.443, all.equal(dt[c(1:4,1L)], dt[c(1:4,1L)]), TRUE) test(1613.444, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)]), "Column 'V1': Mean relative difference: 0.6") test(1613.445, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)], ignore.row.order = TRUE), TRUE) test(1613.45, all.equal(dt[c(1:4,1:2)], dt[c(1L,1L,1:4)], ignore.row.order = TRUE), c("Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0", "Dataset 'current' has rows not present in 'target' or present in different quantity")) test(1613.46, all.equal(dt[c(1:2,1:4,1:2)], dt[c(1:2,1:2,1:4)], ignore.row.order = TRUE), TRUE) # supported type all.equal: integer64 if ("package:bit64" %in% search()) { dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = bit64::as.integer64("90000000000")+1:4) test(1613.47, all.equal(dt, dt), TRUE) test(1613.48, all.equal(dt, dt, ignore.row.order = TRUE), TRUE) test(1613.49, all.equal(dt[c(1:4,1L)], dt[c(1:4,1L)]), TRUE) test(1613.50, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)]), "Column 'V1': Mean relative difference: 0.6") test(1613.51, all.equal(dt[c(1:4,1L)], dt[c(1L,1:4)], ignore.row.order = TRUE), TRUE) test(1613.52, all.equal(dt[c(1:4,1:2)], dt[c(1L,1L,1:4)], ignore.row.order = TRUE), c("Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0","Dataset 'current' has rows not present in 'target' or present in different quantity")) test(1613.53, 
all.equal(dt[c(1:2,1:4,1:2)], dt[c(1:2,1:2,1:4)], ignore.row.order = TRUE), TRUE) } # all.equal - new argument 'tolerance' #1737 x = data.table(1) # test numeric after adding 'tolerance' argument y = data.table(2) test(1613.5411, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5412, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(c(1,1)) y = data.table(c(2,2)) test(1613.5421, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5422, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(c(1,2)) y = data.table(c(2,2)) test(1613.5431, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5432, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(as.factor(1)) # test factor adding 'tolerance' argument y = data.table(as.factor(2)) test(1613.5511, !isTRUE(all.equal(x,y))) test(1613.5512, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5513, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(as.factor(c(1,1))) y = data.table(as.factor(c(2,2))) test(1613.5521, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5522, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(as.factor(c(1,2))) y = data.table(as.factor(c(2,2))) test(1613.5531, !isTRUE(all.equal(x, y, ignore.row.order = FALSE))) test(1613.5532, !isTRUE(all.equal(x, y, ignore.row.order = TRUE))) x = data.table(-0.000189921844659375) # tolerance in action y = data.table(-0.000189921844655161) test(1613.561, all(all.equal(x, y, ignore.row.order = FALSE), all.equal(x, y, ignore.row.order = TRUE))) test(1613.562, all(is.character(all.equal(x, y, ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(x, y, ignore.row.order = TRUE, tolerance = 0)))) test(1613.563, all(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE), all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE))) test(1613.564, all(is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE, 
tolerance = 0)), is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE, tolerance = 0)))) test(1613.565, all(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE), is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 test(1613.566, all(all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = FALSE, tolerance = 0), all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = TRUE, tolerance = 0))) test(1613.567, all(is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0)))) test(1613.571, all(all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 test(1613.572, all(all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = FALSE), all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE))) # x to x with factor equality test(1613.573, all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all.equal(x, y, ignore.row.order = TRUE, tolerance = 1))) test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced") if ("package:bit64" %in% search()) { # fix for #1405, handles roll with -ve int64 
values properly dt = data.table(x=as.integer64(c(-1000, 0)), y=c(5,10)) val = c(-1100,-900,100) ans = data.table(x=as.integer64(val)) test(1614.1, dt[.(val), roll=Inf, on="x"], ans[, y:=c(NA,5,10)]) test(1614.2, dt[.(val), roll=Inf, on="x", rollends=TRUE], ans[, y:=c(5,5,10)]) test(1614.3, dt[.(val), roll=-Inf, on="x"], ans[, y:=c(5,10,NA)]) test(1614.4, dt[.(val), roll=-Inf, on="x", rollends=TRUE], ans[, y:=c(5,10,10)]) } # fix for #1571 x = data.table(c(1,1,2,7,2,3,4,4,7), 1:9) y = data.table(c(2,3,4,4,4,5)) test(1615.1, x[!y, on="V1", mult="first"], data.table(V1=c(1,7), V2=INT(c(1,4)))) test(1615.2, x[!y, on="V1", mult="last"], data.table(V1=c(1,7), V2=INT(c(2,9)))) test(1615.3, x[!y, on="V1", mult="all"], data.table(V1=c(1,1,7,7), V2=INT(c(1,2,4,9)))) # fix for #1287 and #1271 set.seed(1L) dt = data.table(a=c(1,1,2), b=sample(10,3), c=sample(10,3)) test(1616.1, dt[.(1:2), if (c-b > 0L) b, on="a", by=.EACHI, mult="first"], data.table(a=c(1,2), V1=c(3L,5L))) test(1616.2, dt[.(1:2), if (c-b > 0L) b, on="a", by=.EACHI, mult="last"], data.table(a=c(2), V1=5L)) test(1616.3, dt[.(1:2), c := if (c-b > 0L) b, by=.EACHI, mult="first", on="a"], data.table(a=dt$a, b=dt$b, c=c(3L,2L,5L)) ) # fix for #1281 x <- 3 > 0 ans = setattr(copy(x), "foo", "bar") if (base::getRversion() > "3.0.0") test(1617, setattr(x, "foo", "bar"), ans, warning = "Input is a length=1 logical that") # fix for #1445 test(1618.1, fread("a,c,b\n1,2,3", select=c("b", "c")), data.table(b=3L, c=2L)) test(1618.2, fread("a,c,b\n1,2,3", select=c("c", "b")), data.table(c=2L, b=3L)) test(1618.3, fread("a,c,b\n1,2,3", select=c(3,2)), data.table(b=3L, c=2L)) test(1618.4, fread("a,c,b\n1,2,3", select=c(2:3)), data.table(c=2L, b=3L)) test(1618.5, fread("a,c,b\n1,2,3", select=c("b", "c"), col.names=c("q", "r")), data.table(q=3L, r=2L)) test(1618.6, fread("a,c,b\n1,2,3", select=c("b", "z")), data.table(b=3L), warning="Column name 'z' not found.*skipping") # fix for #1270. 
Have been problems with R before vs after 3.1.0 here. But now ok in all R versions. DT = data.table(x=1:2, y=5:6) test(1619.1, DT[, .BY, by=x]$BY, as.list(1:2)) test(1619.2, DT[, bycol := .BY, by=x]$bycol, as.list(1:2)) # fix for #473 DT = data.frame(x=1, y=2) setattr(DT, 'class', c('data.table', 'data.frame')) # simulates over-allocation lost scenario if (!truelength(DT)) test(1620, truelength(as.data.table(DT)), 1026L) # fix for #1116, (#1239 and #1201) test(1621.1, fread("issue_1116_fread_few_lines.txt"), setDT(read.delim("issue_1116_fread_few_lines.txt", stringsAsFactors=FALSE, sep=",", check.names=FALSE))) test(1621.2, fread("issue_1116_fread_few_lines_2.txt"), setDT(read.delim("issue_1116_fread_few_lines_2.txt", stringsAsFactors=FALSE, sep=",", check.names=FALSE))) # fix for #1573 ans1 = fread("issue_1573_fill.txt", fill=TRUE, na.strings="") ans2 = setDT(read.table("issue_1573_fill.txt", header=TRUE, fill=TRUE, stringsAsFactors=FALSE, na.strings="")) test(1622.1, ans1, ans2) test(1622.2, ans1, fread("issue_1573_fill.txt", fill=TRUE, sep=" ", na.strings="")) # fix for #989 # error_msg = if (base::getRversion() < "3.4") "can not be a directory name" else "does not exist" # Until R v3.3, file.info("~") returned TRUE for isdir. This seems to return NA in current devel. However, it # correctly identifies that "~" is not a file. So leads to another error message. So removing the error message # so that it errors properly on both versions. This seems fine to me since we just need it to error. Tested. 
test(1623, fread("~"), error="")

# testing print.rownames option, #1097 (part of #1523)
# The option is saved first and restored after the test so later tests see the original value.
old = getOption("datatable.print.rownames")
options(datatable.print.rownames = FALSE)
DT <- data.table(a = 1:3)
test(1624, capture.output(print(DT)), c(" a", " 1", " 2", " 3"))
options(datatable.print.rownames = old)

# fix for #1575
text = "colA: dataA\ncolB: dataB\ncolC: dataC\n\nColA: dataA\nColB: dataB\nColC: dataC"
test(1625.1, fread(text, header=FALSE, sep=":", blank.lines.skip=TRUE, strip.white=FALSE),
     setDT(read.table(text=text, header=FALSE, sep=":", blank.lines.skip=TRUE, stringsAsFactors=FALSE)))
test(1625.2, fread(text, header=FALSE, sep=":", blank.lines.skip=TRUE),
     setDT(read.table(text=text, header=FALSE, sep=":", blank.lines.skip=TRUE, stringsAsFactors=FALSE, strip.white=TRUE)))

# set-operators #547
# Test ids: the first nine are zero-padded (1626.01 ... 1626.09). R parses 1626.10 and 1626.1
# to the same number, so unpadded ids would clash with 1626.10, 1626.20, ... 1626.70 below.
# setops basic check all
x = data.table(c(1,2,2,2,3,4,4))
y = data.table(c(2,3,4,4,4,5))
test(1626.01, fintersect(x, y), data.table(c(2,3,4)))                            # intersect
test(1626.02, fintersect(x, y, all=TRUE), data.table(c(2,3,4,4)))                # intersect all
test(1626.03, fsetdiff(x, y), data.table(c(1)))                                  # setdiff (except)
test(1626.04, fsetdiff(x, y, all=TRUE), data.table(c(1,2,2)))                    # setdiff all (except all)
test(1626.05, funion(x, y), data.table(c(1,2,3,4,5)))                            # union
test(1626.06, funion(x, y, all=TRUE), data.table(c(1,2,2,2,3,4,4,2,3,4,4,4,5)))  # union all
test(1626.07, fsetequal(x, y), FALSE)                                            # setequal
# setops check two cols
x = data.table(c(1,2,2,2,3,4,4), c(1,1,1,3,3,3,3))
y = data.table(c(2,3,4,4,4,5), c(1,1,2,3,3,3))
test(1626.08, fintersect(x, y), data.table(c(2,4), c(1,3)))                      # intersect
test(1626.09, fintersect(x, y, all=TRUE), data.table(c(2,4,4), c(1,3,3)))        # intersect all
test(1626.10, fsetdiff(x, y), data.table(c(1,2,3), c(1,3,3)))                    # setdiff (except)
test(1626.11, fsetdiff(x, y, all=TRUE), data.table(c(1,2,2,3), c(1,1,3,3)))      # setdiff all (except all)
test(1626.12, funion(x, y), data.table(c(1,2,2,3,4,3,4,5), c(1,1,3,3,3,1,2,3)))  # union
test(1626.13, funion(x, y, all=TRUE),
     data.table(c(1,2,2,2,3,4,4,2,3,4,4,4,5), c(1,1,1,3,3,3,3,1,1,2,3,3,3)))     # union all
test(1626.14, fsetequal(x, y), FALSE)                                            # setequal
# setops on unique sets
x = unique(x)
y = unique(y)
test(1626.15, fintersect(x, y), data.table(c(2,4), c(1,3)))                      # intersect
test(1626.16, fintersect(x, y, all=TRUE), data.table(c(2,4), c(1,3)))            # intersect all
test(1626.17, fsetdiff(x, y), data.table(c(1,2,3), c(1,3,3)))                    # setdiff (except)
test(1626.18, fsetdiff(x, y, all=TRUE), data.table(c(1,2,3), c(1,3,3)))          # setdiff all (except all)
test(1626.19, funion(x, y), data.table(c(1,2,2,3,4,3,4,5), c(1,1,3,3,3,1,2,3)))  # union
test(1626.20, funion(x, y, all=TRUE), data.table(c(1,2,2,3,4,2,3,4,4,5), c(1,1,3,3,3,1,1,2,3,3))) # union all
test(1626.21, fsetequal(x, y), FALSE)                                            # setequal
# intersect precise duplicate handling
# x always has 4 copies of the single row; y has 0..5 copies.
dt = data.table(a=1L)
test(1626.22, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,0)])), 0L)
test(1626.23, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,0)], all=TRUE)), 0L)
test(1626.24, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,1)])), 1L)
test(1626.25, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,1)], all=TRUE)), 1L)
test(1626.26, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,2)])), 1L)
test(1626.27, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,2)], all=TRUE)), 2L)
test(1626.28, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,3)])), 1L)
test(1626.29, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,3)], all=TRUE)), 3L)
test(1626.30, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,4)])), 1L)
test(1626.31, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,4)], all=TRUE)), 4L)
test(1626.32, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,5)])), 1L)
test(1626.33, nrow(fintersect(dt[rep(1L,4)], dt[rep(1L,5)], all=TRUE)), 4L)
# setdiff precise duplicate handling
dt = data.table(a=1L)
test(1626.34, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,0)])), 1L)
test(1626.35, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,0)], all=TRUE)), 4L)
test(1626.36, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,1)])), 0L)
test(1626.37, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,1)], all=TRUE)), 3L)
test(1626.38, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,2)])), 0L)
test(1626.39, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,2)], all=TRUE)), 2L)
test(1626.40, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,3)])), 0L)
test(1626.41, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,3)], all=TRUE)), 1L)
test(1626.42, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,4)])), 0L)
test(1626.43, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,4)], all=TRUE)), 0L)
test(1626.44, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,5)])), 0L)
test(1626.45, nrow(fsetdiff(dt[rep(1L,4)], dt[rep(1L,5)], all=TRUE)), 0L)
# unsupported type in set-ops: list (except UNION ALL)
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = lapply(1:4, function(x) new.env()))
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.46, fintersect(x, y), error = "x and y must not have unsupported column types: list")
test(1626.47, fintersect(x, y, all=TRUE), error = "x and y must not have unsupported column types: list")
test(1626.48, fsetdiff(x, y), error = "x and y must not have unsupported column types: list")
test(1626.49, fsetdiff(x, y, all=TRUE), error = "x and y must not have unsupported column types: list")
test(1626.50, funion(x, y), error = "x and y must not have unsupported column types: list")
test(1626.51, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
test(1626.52, fsetequal(x, y), error = "x and y must not have unsupported column types: list")
test(1626.53, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "x and y must not have unsupported column types: list")
# unsupported type in set-ops: complex, raw
dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = as.complex(1:4), V4 = as.raw(1:4), V5 = lapply(1:4, function(x) NULL))
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.54, fintersect(x, y), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.55, fintersect(x, y, all=TRUE), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.56, fsetdiff(x, y), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.57, fsetdiff(x, y, all=TRUE), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.58, funion(x, y), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.59, funion(x, y, all=TRUE), error = "x and y must not have unsupported column types: raw, complex") # no 'list' here which is supported for `all=TRUE`
test(1626.60, fsetequal(x, y), error = "x and y must not have unsupported column types: raw, complex, list")
test(1626.61, fsetequal(dt[c(1:2,2L)], dt[c(1:2,2L)]), error = "x and y must not have unsupported column types: raw, complex, list")
# supported types multi column test
dt = data.table(
  V1 = 1:4,
  V2 = as.numeric(1:4),
  V3 = letters[rep(1:2, 2)],
  V4 = factor(c("a","a","b","b")),
  V5 = as.POSIXct("2016-03-05 12:00:00", origin="1970-01-01")+(1:4)*3600,
  V6 = as.Date("2016-03-05", origin="1970-01-01")+(1:4)
)[, V7 := as.IDate(V6)
  ][, V8 := as.ITime(V5)]
x = dt[c(2:4,2L,2L)]
y = dt[c(1:3,2L)]
test(1626.62, fintersect(x, y), dt[2:3])
# The next two tests were both numbered 1626.63; they now carry distinct sub-ids so that
# each one can be identified on its own. The ids that follow are unchanged.
test(1626.631, fintersect(x, y, all=TRUE), dt[c(2:3,2L)])
test(1626.632, fsetdiff(x, y), dt[4L])
test(1626.64, fsetdiff(x, y, all=TRUE), dt[c(4L,2L)])
test(1626.65, funion(x, y), dt[c(2:4,1L)])
test(1626.66, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
test(1626.67, fsetequal(x, y), FALSE)
test(1626.68, fsetequal(dt[c(2:3,3L)], dt[c(2:3,3L)]), TRUE)
# supported type in set-ops: integer64
# Only run when bit64 is attached; otherwise a note is printed instead.
if ("package:bit64" %in% search()) {
  dt = data.table(V1 = 1:4, V2 = letters[1:4], V3 = bit64::as.integer64("90000000000")+1:4)
  x = dt[c(2:4,2L,2L)]
  y = dt[c(1:3,2L)]
  test(1626.69, fintersect(x, y), dt[2:3])
  test(1626.70, fintersect(x, y, all=TRUE), dt[c(2:3,2L)])
  test(1626.71, fsetdiff(x, y), dt[4L])
  test(1626.72, fsetdiff(x, y, all=TRUE), dt[c(4L,2L)])
  test(1626.73, funion(x, y), dt[c(2:4,1L)])
  test(1626.74, funion(x, y, all=TRUE), dt[c(2:4,2L,2L,1:3,2L)])
  test(1626.75, fsetequal(x, y), FALSE)
  test(1626.76, fsetequal(dt[c(2:3,3L)], dt[c(2:3,3L)]), TRUE)
} else {
  cat("Tests 1626.[69-76] not run. If required call library(bit64) first.\n")
}

# fix for #1087 and #1465
# as.raw(97L) is the byte for "a": the first column name must be exactly that one byte.
test(1627, charToRaw(names(fread("issue_1087_utf8_bom.csv"))[1L]), as.raw(97L))

# uniqueN gains na.rm argument, #1455
set.seed(1L)
dt = data.table(x=sample(c(1:3,NA),25,TRUE), y=sample(c(NA,"a", "b"), 25,TRUE), z=sample(2,25,TRUE))
test(1628.1, uniqueN(dt, by=1:2, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y)])))
test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)])))
test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z])
test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z])

# fix for long standing FR/bug, #495
# most likely I'm missing some tests, but we'll fix/add them as we go along.
dt = data.table(grp=c(2,3,3,1,1,2,3), v1=1:7, v2=7:1, v3=10:16)
test(1629.1, dt[, .SD*v1, .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.2, dt[, lapply(.SD, function(x) x*v1), .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.3, dt[, lapply(.SD, function(x) mean(x)*sum(v1)), .SDcols=v2:v3], data.table(v2=112, v3=364))
test(1629.4, dt[, c(sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(V1=28L, v2=4, v3=13))
test(1629.5, dt[, c(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, v2=4, v3=13))
test(1629.6, dt[, .(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, V2=list(4,13)))
test(1629.7, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3])
# add/update
dt2 = copy(dt)
test(1629.8, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)])
# grouping operations
oldopts = getOption("datatable.optimize") # backup
options(datatable.optimize = 1L) # no gforce
test(1629.9, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL])
ans1 = dt[, sum(v1), by=grp]
ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3]
# Test ids in this group skip 1629.10: R parses 1629.10 and 1629.1 to the same number, so that id
# would be indistinguishable from test 1629.1. The ids therefore continue at 1629.11.
test(1629.11, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)])
test(1629.12, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp],
     dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp])
test(1629.13, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3],
     dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# gforce
options(datatable.optimize = Inf) # Inf
test(1629.14, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3],
     dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# even more complex, shouldn't run any optimisation
dt[, v4 := v1/2]
test(1629.15, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3],
     dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp])
test(1629.16, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4],
     copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp])
# Put back the optimisation level that was saved in 'oldopts' before this group changed it.
options(datatable.optimize = oldopts)
# by=.EACHI and operations with 'i'
test(1629.17, dt[.(2:3), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"],
     dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3])
test(1629.18, dt[.(2:3), c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3, on="grp"][order(V1,v2,v3)],
     dt[grp %in% 2:3, c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3][order(V1,v2,v3)])

# #759, add new cols on :=
# dt1 and dt2 share no 'id' values, so the joins on "id" below match no rows of dt1.
dt1 <- data.table(id = 1:2, x = 3:4)
dt2 <- data.table(id = 3:4, y = c(5,6))
# when updating using :=, nomatch = 0 or NA should make no difference i.e. new columns should always
# be added. Otherwise there's an inconsistent number of columns in result that depends on data.
ans = copy(dt1)[,z:=NA_real_] # NA_real_ because :=2 below is type double test(1630.1, copy(dt1)[id>5, z:=2, nomatch=0L], ans, warning="ignoring nomatch") test(1630.2, copy(dt1)[dt2, z:=2, on="id", nomatch=0L], ans, warning="ignoring nomatch") test(1630.3, copy(dt1)[dt2, z:=y, on="id", nomatch=0L], ans, warning="ignoring nomatch") test(1630.4, copy(dt1)[dt2, z:=y, on="id", by=.EACHI, nomatch=0L], ans, warning="ignoring nomatch") test(1630.5, copy(dt1)[id>5, z:=2, nomatch=NA], ans, warning="ignoring nomatch") test(1630.6, copy(dt1)[dt2, z:=2, on="id", nomatch=NA], ans, warning="ignoring nomatch") test(1630.7, copy(dt1)[dt2, z:=y, on="id", nomatch=NA], ans, warning="ignoring nomatch") test(1630.8, copy(dt1)[dt2, z:=y, on="id", by=.EACHI, nomatch=NA], ans, warning="ignoring nomatch") test(1630.9, copy(dt1)[id>5, z:=2L, nomatch=0L], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch") test(1630.11, copy(dt1)[id>5, z:=2L, nomatch=NA], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch") # fix for #1268, on= retains keys correctly. 
A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key="site,date") B = data.table(x=c(10,20), y=c(100,200), key="x") test(1631, key(A[B, on="x"]), NULL) # fix for #1479, secondary keys are removed when necessary dt = data.table(a = rep(c(F,F,T,F,F,F,F,F,F), 3), b = c("x", "y", "z")) setindex(dt, a) dt[, a := as.logical(sum(a)), by = b] test(1632.1, names(attributes(attr(dt, 'index'))), NULL) dt = data.table(a = rep(c(F,F,T,F,F,F,F,F,F), 3), b = c("x", "y", "z")) setindex(dt, b) dt[, a := as.logical(sum(a)), by = b] test(1632.2, names(attributes(attr(dt, 'index'))), "__b") dt = data.table(a = rep(c(F,F,T,F,F,F,F,F,F), 3), b = c("x", "y", "z")) test(1632.3, copy(dt)[, c := !a, by=b], copy(dt)[, c := c(T,T,F,T,T,T,T,T,T)]) # by accepts colA:colB for interactive scenarios, #1395 dt = data.table(x=rep(1,18), y=rep(1:2, each=9), z=rep(1:3,each=6), a=rep(1:6, each=3))[, b := 6] test(1633.1, dt[, sum(b), by=x:a], dt[, sum(b), by=.(x,y,z,a)]) test(1633.2, dt[, sum(b), by=y:a], dt[, sum(b), by=.(y,z,a)]) test(1633.3, dt[, sum(b), by=a:y], dt[, sum(b), by=.(a,z,y)]) test(1633.4, dt[, .SD, by=1:nrow(dt)], data.table(nrow=1:nrow(dt), dt)) # make sure this works # reuse secondary indices dt = data.table(x=sample(3, 10, TRUE), y=1:10) v1 = capture.output(ans1 <- dt[.(3:2), on="x", verbose=TRUE]) setindex(dt, x) v2 = capture.output(ans2 <- dt[.(3:2), on="x", verbose=TRUE]) test(1634.1, any(grepl("ad hoc", v1)), TRUE) test(1634.2, any(grepl("existing index", v2)), TRUE) # fread's fill argument detects separator better in complex cases as well, #1573 text = "a b c d e f g h i j k l\n1 P P;A;E; Y YW; H(). 1-3 pro\n2 Q9 a;a;a;a; YB YH; M(). 13 pn ba\n1 P3 P; Y Y; R(). 14 p\n53 P P6;B;D;0;5;a;X;a;4R; Y Y; H(). 13 pe e\n1 P P;O;O;a;a;a; HLA-A HLA-A;; H(). HcIha,A-n\n102 P P;O;P;P;P;P;P;P;a;a;a;a;a;a;a;a;a;a; H-A H-A;; H(). 
HcIha,A" test(1635.1, ans1 <- fread(text, fill=TRUE), setDT(read.table(text=text, stringsAsFactors=FALSE, fill=TRUE, sep="\t", header=TRUE))) text = "a b c d e\n1 P P;A;E; Y YW; H(). 1-3 pro\n2 Q9 a;a;a;a; YB YH; M(). 13 pn ba\n1 P3 P; Y Y; R(). 14 p\n53 P P6;B;D;0;5;a;X;a;4R; Y Y; H(). 13 pe e\n1 P P;O;O;a;a;a; HLA-A HLA-A;; H(). HcIha,A-n\n102 P P;O;P;P;P;P;P;P;a;a;a;a;a;a;a;a;a;a; H-A H-A;; H(). HcIha,A" test(1635.2, fread(text, fill=TRUE), setnames(ans1[, 1:7, with=FALSE], c(letters[1:5], paste("V", 6:7, sep="")))) # testing function type in dt, #518 dt = data.table(x=1, y=sum) test(1636.1, class(dt$y), "list") test(1636.2, any(grepl("1: 1 ", capture.output(print(dt)))), TRUE) dt = data.table(x=1:2, y=sum) test(1636.3, class(dt$y), "list") test(1636.4, any(grepl("2: 2 ", capture.output(print(dt)))), TRUE) dt = data.table(x=1:2, y=c(sum, min)) test(1636.5, class(dt$y), "list") test(1636.6, any(grepl("2: 2 ", capture.output(print(dt)))), TRUE) # #484 fix (related to #495 fix above) dt = data.table(a = 1, b = 1) test(1637.1, dt[, data.table(a, .SD), by = cumsum(a)], data.table(cumsum=1, a=1, b=1)) test(1637.2, dt[, data.table(a, .SD), by = cumsum(a), .SDcols=a:b], data.table(cumsum=1, a=1, a=1, b=1)) test(1637.3, dt[, data.table(a, .SD), by = a], data.table(a=1,a=1,b=1)) test(1637.4, dt[, data.table(b, .SD), by = cumsum(a)], data.table(cumsum=1, b=1, b=1)) test(1637.5, dt[, data.table(a, b), by = cumsum(a)], data.table(cumsum=1, a=1, b=1)) # when datatable.optimize<1, no optimisation of j should take place: old = options(datatable.optimize=0L) dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") options(old) #1389 - split.data.table - big chunk of unit tests set.seed(123) dt = data.table(x1 = rep(letters[1:2], 6), x2 = rep(letters[3:5], 4), x3 = rep(letters[5:8], 3), y = rnorm(12)) dt = dt[sample(.N)] df = as.data.frame(dt) # - [x] split by factor the same as `split.data.frame` - `f` 
argument ---- test(1639.1, lapply(split(df, as.factor(1:2)), setDT), split(dt, as.factor(1:2))) # drop=FALSE on same factor test(1639.2, lapply(split(df, as.factor(1:2), drop=TRUE), setDT), split(dt, as.factor(1:2), drop=TRUE)) # drop=TRUE on same factor test(1639.3, lapply(split(df, as.factor(1:4)[3:2]), setDT), split(dt, as.factor(1:4)[3:2])) # drop=FALSE on same factor with empty levels test(1639.4, lapply(split(df, as.factor(1:4)[3:2], drop=TRUE), setDT), split(dt, as.factor(1:4)[3:2], drop=TRUE)) # drop=TRUE on same factor with empty levels test(1639.5, lapply(split(df, as.factor(1:12)), setDT), split(dt, as.factor(1:12))) # drop=FALSE factor length of nrow test(1639.6, lapply(split(df, as.factor(1:12), drop=TRUE), setDT), split(dt, as.factor(1:12), drop=TRUE)) # drop=TRUE factor length of nrow ord = sample(2:13) test(1639.7, lapply(split(df, as.factor(1:14)[ord]), setDT), split(dt, as.factor(1:14)[ord])) # drop=FALSE factor length of nrow with empty levels test(1639.8, lapply(split(df, as.factor(1:14)[ord], drop=TRUE), setDT), split(dt, as.factor(1:14)[ord], drop=TRUE)) # drop=TRUE factor length of nrow with empty levels test(1639.9, lapply(split(df, list(as.factor(1:2), as.factor(3:2))), setDT), split(dt, list(as.factor(1:2), as.factor(3:2)))) # `f` list object drop=FALSE test(1639.10, lapply(split(df, list(as.factor(1:2), as.factor(3:2)), drop=TRUE), setDT), split(dt, list(as.factor(1:2), as.factor(3:2)), drop=TRUE)) # `f` list object drop=TRUE test(1639.11, split(dt, as.factor(integer())), error = "group length is 0 but data nrow > 0") # factor length 0L test(1639.12, split(dt, as.factor(integer()), drop=TRUE), error = "group length is 0 but data nrow > 0") test(1639.13, split(dt, as.factor(1:2)[0L]), error = "group length is 0 but data nrow > 0") # factor length 0L with empty levels test(1639.14, split(dt, as.factor(1:2)[0L], drop=TRUE), error = "group length is 0 but data nrow > 0") # - [x] edge cases for `f` argument ---- if (base::getRversion() > 
"3.0.0") test(1639.15, split(df, as.factor(NA)), split(dt, as.factor(NA))) # factor NA if (base::getRversion() > "3.0.0") test(1639.16, split(df, as.factor(NA), drop=TRUE), split(dt, as.factor(NA), drop=TRUE)) if (base::getRversion() > "3.0.0") test(1639.17, lapply(split(df, as.factor(1:2)[0L][1L]), setDT), split(dt, as.factor(1:2)[0L][1L])) # factor NA with empty levels if (base::getRversion() > "3.0.0") test(1639.18, split(df, as.factor(1:2)[0L][1L], drop=TRUE), split(dt, as.factor(1:2)[0L][1L], drop=TRUE)) test(1639.19, lapply(split(df, as.factor(c(1L,NA,2L))), setDT), split(dt, as.factor(c(1L,NA,2L)))) # factor has NA test(1639.20, lapply(split(df, as.factor(c(1L,NA,2L)), drop=TRUE), setDT), split(dt, as.factor(c(1L,NA,2L)), drop=TRUE)) test(1639.21, lapply(split(df, as.factor(c(1L,NA,2:4))[1:3]), setDT), split(dt, as.factor(c(1L,NA,2:4))[1:3])) # factor has NA with empty levels test(1639.22, lapply(split(df, as.factor(c(1L,NA,2:4))[1:3], drop=TRUE), setDT), split(dt, as.factor(c(1L,NA,2:4))[1:3], drop=TRUE)) test(1639.23, lapply(split(df, letters[c(1L,NA,2L)]), setDT), split(dt, letters[c(1L,NA,2L)])) # character as `f` arg test(1639.24, lapply(split(df, letters[c(1L,NA,2L)], drop=TRUE), setDT), split(dt, letters[c(1L,NA,2L)], drop=TRUE)) test(1639.25, lapply(split(df, "z"), setDT), split(dt, "z")) # character as `f` arg, length 1L test(1639.26, lapply(split(df, "z", drop=TRUE), setDT), split(dt, "z", drop=TRUE)) test(1639.27, lapply(split(df, letters[c(1L,NA)]), setDT), split(dt, letters[c(1L,NA)])) # character as `f` arg, length 1L of non-NA test(1639.28, lapply(split(df, letters[c(1L,NA)], drop=TRUE), setDT), split(dt, letters[c(1L,NA)], drop=TRUE)) test(1639.29, lapply(split(df[0L,], "z"), setDT), split(dt[0L], "z")) # nrow 0, f length 1-2 test(1639.30, lapply(split(df[0L,], c("z1","z2")), setDT), split(dt[0L], c("z1","z2"))) test(1639.31, lapply(split(df[0L,], "z", drop=TRUE), setDT), split(dt[0L], "z", drop=TRUE)) test(1639.32, lapply(split(df[0L,], 
c("z1","z2"), drop=TRUE), setDT), split(dt[0L], c("z1","z2"), drop=TRUE)) test(1639.33, lapply(split(df[1L,], "z"), setDT), split(dt[1L], "z")) # nrow 1, f length 1-2 test(1639.34, lapply(suppressWarnings(split(df[1L,], c("z1","z2"))), setDT), suppressWarnings(split(dt[1L], c("z1","z2")))) test(1639.35, lapply(split(df[1L,], "z", drop=TRUE), setDT), split(dt[1L], "z", drop=TRUE) ) test(1639.36, lapply(suppressWarnings(split(df[1L,], c("z1","z2"), drop=TRUE)), setDT), suppressWarnings(split(dt[1L], c("z1","z2"), drop=TRUE))) if (base::getRversion() > "3.0.0") test(1639.37, lapply(split(df[0L,], as.factor(NA_character_)), setDT), split(dt[0L], as.factor(NA_character_))) # nrow 0, f factor length 1L NA if (base::getRversion() > "3.0.0") test(1639.38, lapply(split(df[0L,], as.factor(NA_character_), drop=TRUE), setDT), split(dt[0L], as.factor(NA_character_), drop=TRUE)) if (base::getRversion() > "3.0.0") test(1639.39, lapply(split(df[0L,], as.factor(1:2)[0L][1L]), setDT), split(dt[0L], as.factor(1:2)[0L][1L])) # nrow 0, f factor length 1L NA with empty levels if (base::getRversion() > "3.0.0") test(1639.40, lapply(split(df[0L,], as.factor(1:2)[0L][1L], drop=TRUE), setDT), split(dt[0L], as.factor(1:2)[0L][1L], drop=TRUE)) test(1639.41, lapply(split(df[0L,], as.factor(integer())), setDT), split(dt[0L], as.factor(integer()))) # nrow 0, f factor length 0L test(1639.42, lapply(split(df[0L,], as.factor(integer()), drop=TRUE), setDT), split(dt[0L], as.factor(integer()), drop=TRUE)) if (base::getRversion() > "3.0.0") test(1639.43, lapply(split(df[0L,], as.factor(1:2)[0L]), setDT), split(dt[0L], as.factor(1:2)[0L])) # nrow 0, f factor length 0L with empty levels if (base::getRversion() > "3.0.0") test(1639.44, lapply(split(df[0L,], as.factor(1:2)[0L], drop=TRUE), setDT), split(dt[0L], as.factor(1:2)[0L], drop=TRUE)) test(1639.45, lapply(split(df[0L,], as.factor(1:3)[c(2L,NA,3L)]), setDT), split(dt[0L], as.factor(1:3)[c(2L,NA,3L)])) # nrow 0, f factor with empty levels and NA 
test(1639.46, lapply(split(df[0L,], as.factor(1:3)[c(2L,NA,3L)], drop=TRUE), setDT), split(dt[0L], as.factor(1:3)[c(2L,NA,3L)], drop=TRUE)) # nrow 0, f character length 1L NA if (base::getRversion() > "3.0.0") test(1639.47, lapply(split(df[0L,], NA_character_), setDT), split(dt[0L], NA_character_)) if (base::getRversion() > "3.0.0") test(1639.48, lapply(split(df[0L,], NA_character_, drop=TRUE), setDT), split(dt[0L], NA_character_, drop=TRUE)) test(1639.49, lapply(split(df[0L,], letters[c(NA,1:3)]), setDT), split(dt[0L], letters[c(NA,1:3)])) # nrow 0, f length > 1L, with NA test(1639.50, lapply(split(df[0L,], letters[c(NA,1:3)], drop=TRUE), setDT), split(dt[0L], letters[c(NA,1:3)], drop=TRUE)) # - [x] split by reference to column names - `by` - for factor column ---- fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3] l = split(fdt, by = "x1", flatten=FALSE) # single col test(1639.51, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L))) l = split(fdt, by = "x2", flatten=FALSE) test(1639.52, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L))) l = split(fdt, by = "x3", flatten=FALSE) test(1639.53, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=4L, f=4L, g=4L, e=4L))) l = split(fdt, by = c("x1","x2"), flatten=FALSE) # multi col test(1639.54, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))), sapply(l, sapply, nrow) == rep(2L, 6), sapply(l, sapply, ncol) == rep(4L, 6) )) l = split(fdt, by = c("x1","x3"), flatten=FALSE) # empty levels appears due subset x3 by x1 groups test(1639.55, TRUE, all( is.list(l), 
identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("h","f","e","g"), a=c("g","e","f","h"))), sapply(l, sapply, nrow) == rep(c(3L,3L,0L,0L), 2), sapply(l, sapply, ncol) == rep(4L, 8) )) l = split(fdt, by = c("x2","x3"), flatten=FALSE) test(1639.56, TRUE, all( is.list(l), identical(names(l), c("d","e","c")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))), sapply(l, sapply, nrow) == rep(1L, 12), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset are expanded test(1639.57, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))), sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), sapply(l, sapply, sapply, ncol) == rep(4L, 24) )) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # multi col rev test(1639.58, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 8) )) l = split(fdt, by = c("x3","x1"), flatten=FALSE) # x1 has empty levels after split on x3 first test(1639.59, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b","a"), f=c("b","a"), g=c("a","b"), e=c("a","b"))), sapply(l, sapply, nrow) == rep(c(3L,0L), 4), sapply(l, sapply, ncol) == rep(4L, 8) )) l = split(fdt, 
by = c("x3","x2","x1"), drop = TRUE, flatten=FALSE) test(1639.60, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(h=list(d=c("b"), e=c("b"), c=c("b")), f=list(e=c("b"), c=c("b"), d=c("b")), g=list(e=c("a"), d=c("a"), c=c("a")), e=list(e=c("a"), d=c("a"), c=c("a")))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(4L, 12) )) sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x3)) # split.data.frame match test(1639.61, unlist(split(fdt, by = c("x1","x3"), sorted = TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x3), drop=TRUE) test(1639.62, unlist(split(fdt, by = c("x1","x3"), sorted = TRUE, drop=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # empty levels in factor and drop=FALSE x2 = as.factor(c("a", as.character(x2)))[-1L], x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)], y = y)] l = split(fdt, by = "x1") test(1639.63, TRUE, all(is.list(l), identical(names(l), c("b","a","c")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L, c=0L), sapply(l, ncol) == c(b=4L, a=4L, c=4L))) l = split(fdt, by = "x2") test(1639.64, TRUE, all(is.list(l), identical(names(l), c("d","e","c","a")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L, a=0L), sapply(l, ncol) == c(d=4L, e=4L, c=4L, a=4L))) l = split(fdt, by = c("x3","x1"), flatten=FALSE) test(1639.65, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e","a","z")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b","a","c"), f=c("b","a","c"), g=c("a","b","c"), e=c("a","b","c"), 
a=c("a","b","c"), z=c("a","b","c"))), sapply(l, sapply, nrow) == c(rep(c(3L,0L,0L), 4), rep(0L, 6)), sapply(l, sapply, ncol) == rep(4L, 18) )) l = split(fdt, by = "x1", drop=TRUE) # empty levels in factor and drop=TRUE test(1639.66, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L))) l = split(fdt, by = "x2", drop=TRUE) test(1639.67, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L))) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) test(1639.68, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x3","x1"), sorted=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=FALSE test(1639.69, TRUE, all( is.list(l), identical(names(l), c("a","e","f","g","h","z")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), setNames(rep(list(c("a","b","c")), 6), c("a","e","f","g","h","z"))), sapply(l, sapply, nrow) == c(0L,0L,0L,3L,0L,0L,0L,3L,0L,3L,0L,0L,0L,3L,0L,0L,0L,0L), sapply(l, sapply, ncol) == rep(4L, 18) )) l = split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=TRUE test(1639.70, TRUE, all( is.list(l), identical(names(l), c("e","f","g","h")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(e=c("a"), f=c("b"), g=c("a"), h=c("b"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) sdf = split(as.data.frame(fdt), list(fdt$x3, fdt$x1)) # split.data.frame match on by = 2L and empty levels, drop=FALSE test(1639.71, unlist(split(fdt, 
by = c("x3","x1"), sorted=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) sdf = split(as.data.frame(fdt), list(fdt$x3, fdt$x1), drop=TRUE) # split.data.frame match on by = 2L and empty levels, drop=TRUE test(1639.72, unlist(split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # - [x] split by reference to column names - `by` - factor and character column ---- fdt = dt[, .(x1 = x1, x2 = x2, x3 = as.factor(x3), y = y)] l = split(fdt, by = c("x2","x3"), flatten=FALSE) test(1639.73, TRUE, all( is.list(l), identical(names(l), c("d","e","c")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))), sapply(l, sapply, nrow) == rep(1L, 12), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x1","x2","x3"), flatten=FALSE) # empty levels in x3 after subset on x1, x2 test(1639.74, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(b=list(d=c("h","f","e","g"), e=c("h","f","e","g"), c=c("f","h","e","g")), a=list(e=c("g","e","f","h"), d=c("e","g","f","h"), c=c("e","g","f","h")))), sapply(l, sapply, sapply, nrow) == rep(c(1L,1L,0L,0L), 6), sapply(l, sapply, sapply, ncol) == rep(4L, 24) )) l = split(fdt, by = c("x1","x2","x3"), drop=TRUE, flatten=FALSE) test(1639.75, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(4L, 12) )) l = split(fdt, by = 
c("x3","x1"), flatten=FALSE) # multi col rev test(1639.76, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x3","x2","x1"), flatten=FALSE) test(1639.77, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(h=list(d=c("b"), e=c("b"), c=c("b")), f=list(e=c("b"), c=c("b"), d=c("b")), g=list(e=c("a"), d=c("a"), c=c("a")), e=list(e=c("a"), d=c("a"), c=c("a")))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(4L, 12) )) fdt = dt[, .(x1 = x1, # empty levels in factor and drop=FALSE x2 = x2, x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)], y = y)] l = split(fdt, by = c("x3","x1"), flatten=FALSE) # empty levels in factor and drop=FALSE test(1639.78, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e","a","z")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"), a=character(), z=character())), identical(lapply(l, lapply, nrow), list(h=list(b=3L), f=list(b=3L), g=list(a=3L), e=list(a=3L), a=structure(list(), .Names = character(0)), z=structure(list(), .Names = character(0)))), identical(lapply(l, lapply, ncol), list(h=list(b=4L), f=list(b=4L), g=list(a=4L), e=list(a=4L), a=structure(list(), .Names = character(0)), z=structure(list(), .Names = character(0)))) )) l = split(fdt, by = c("x3","x1"), drop=TRUE, flatten=FALSE) # empty levels in factor and drop=TRUE test(1639.79, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), 
list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(fdt, by = c("x3","x1"), sorted=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=FALSE test(1639.80, TRUE, all( is.list(l), identical(names(l), c("a","e","f","g","h","z")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(a=character(), e=c("a"), f=c("b"), g=c("a"), h=c("b"), z=character())), identical(lapply(l, lapply, nrow), list(a=structure(list(), .Names = character(0)), e=list(a=3L), f=list(b=3L), g=list(a=3L), h=list(b=3L), z=structure(list(), .Names = character(0)))), identical(lapply(l, lapply, ncol), list(a=structure(list(), .Names = character(0)), e=list(a=4L), f=list(b=4L), g=list(a=4L), h=list(b=4L), z=structure(list(), .Names = character(0)))) )) l = split(fdt, by = c("x3","x1"), sorted=TRUE, drop=TRUE, flatten=FALSE) # test order for empty levels in factor and drop=TRUE test(1639.81, TRUE, all( is.list(l), identical(names(l), c("e","f","g","h")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(e=c("a"), f=c("b"), g=c("a"), h=c("b"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) # - [x] split by reference to column names - `by` - for character column ---- l = split(dt, by = "x1") # single col test(1639.82, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=4L, a=4L))) l = split(dt, by = "x2") test(1639.83, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=4L, e=4L, c=4L))) l = split(dt, by = "x3") test(1639.84, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=4L, f=4L, g=4L, e=4L))) l = 
split(dt, by = c("x1","x2"), flatten=FALSE) # multi col test(1639.85, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))), sapply(l, sapply, nrow) == rep(2L, 6), sapply(l, sapply, ncol) == rep(4L, 6) )) l = split(dt, by = c("x1","x3"), flatten=FALSE) test(1639.86, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("h","f"), a=c("g","e"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(dt, by = c("x2","x3"), flatten=FALSE) test(1639.87, TRUE, all( is.list(l), identical(names(l), c("d","e","c")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))), sapply(l, sapply, nrow) == rep(1L, 12), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(dt, by = c("x1","x2","x3"), flatten=FALSE) test(1639.88, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(4L, 12) )) l = split(dt, by = c("x3","x1"), flatten=FALSE) # multi col rev test(1639.89, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(4L, 4) )) l = split(dt, by = c("x3","x2","x1"), flatten=FALSE) test(1639.90, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) 
!is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(h=list(d="b", e="b", c="b"), f=list(e="b", c="b", d="b"), g=list(e="a", d="a", c="a"), e=list(e="a",d="a",c="a"))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(4L, 12) )) # - [x] allow to keep or drop field on which we split - `keep.by` argument ---- l = split(dt, by = "x1", keep.by = FALSE) test(1639.91, TRUE, all(is.list(l), identical(names(l), c("b","a")), sapply(l, is.data.table), sapply(l, nrow) == c(b=6L, a=6L), sapply(l, ncol) == c(b=3L, a=3L))) l = split(dt, by = "x2", keep.by = FALSE) test(1639.92, TRUE, all(is.list(l), identical(names(l), c("d","e","c")), sapply(l, is.data.table), sapply(l, nrow) == c(d=4L, e=4L, c=4L), sapply(l, ncol) == c(d=3L, e=3L, c=3L))) l = split(dt, by = "x3", keep.by = FALSE) test(1639.93, TRUE, all(is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, is.data.table), sapply(l, nrow) == c(h=3L, f=3L, g=3L, e=3L), sapply(l, ncol) == c(h=3L, f=3L, g=3L, e=3L))) l = split(dt, by = c("x1","x2"), keep.by = FALSE, flatten=FALSE) # multi col test(1639.94, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("d","e","c"), a=c("e","d","c"))), sapply(l, sapply, nrow) == rep(2L, 6), sapply(l, sapply, ncol) == rep(2L, 6) )) l = split(dt, by = c("x1","x3"), keep.by = FALSE, flatten=FALSE) test(1639.95, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(b=c("h","f"), a=c("g","e"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(2L, 4) )) l = split(dt, by = c("x2","x3"), keep.by = FALSE, flatten=FALSE) test(1639.96, TRUE, all( is.list(l), identical(names(l), c("d","e","c")), sapply(l, function(x) !is.data.table(x) && is.list(x)), 
identical(lapply(l, names), list(d=c("h","f","e","g"), e=c("h","f","g","e"), c=c("f","h","e","g"))), sapply(l, sapply, nrow) == rep(1L, 12), sapply(l, sapply, ncol) == rep(2L, 12) )) l = split(dt, by = c("x1","x2","x3"), keep.by = FALSE, flatten=FALSE) test(1639.97, TRUE, all( is.list(l), identical(names(l), c("b","a")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(b=list(d=c("h","f"), e=c("h","f"), c=c("f","h")), a=list(e=c("g","e"), d=c("e","g"), c=c("e","g")))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(1L, 12) )) l = split(dt, by = c("x3","x1"), keep.by = FALSE, flatten=FALSE) # multi col rev test(1639.98, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, names), list(h=c("b"), f=c("b"), g=c("a"), e=c("a"))), sapply(l, sapply, nrow) == rep(3L, 4), sapply(l, sapply, ncol) == rep(2L, 4) )) l = split(dt, by = c("x3","x2","x1"), keep.by = FALSE, flatten=FALSE) test(1639.99, TRUE, all( is.list(l), identical(names(l), c("h","f","g","e")), sapply(l, function(x) !is.data.table(x) && is.list(x)), sapply(l, sapply, function(x) !is.data.table(x) && is.list(x)), identical(lapply(l, lapply, names), list(h=list(d="b", e="b", c="b"), f=list(e="b", c="b", d="b"), g=list(e="a", d="a", c="a"), e=list(e="a",d="a",c="a"))), sapply(l, sapply, sapply, nrow) == rep(1L, 12), sapply(l, sapply, sapply, ncol) == rep(1L, 12) )) # - [x] support recursive split into nested lists for `length(by) > 2L` (default) and `flatten` arg to produce non-nested list of data.table ---- fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3] # factors, flatten consistent to non-flatten length(by)==1L test(1639.100, split(fdt, by = "x1"), split(fdt, by = "x1", flatten = FALSE)) # length(by) == 1L should be same as flatten=FALSE # ref data already 
checked in above test test(1639.101, split(fdt, by = "x2"), split(fdt, by = "x2", flatten = FALSE)) test(1639.102, split(fdt, by = "x3"), split(fdt, by = "x3", flatten = FALSE)) test(1639.103, split(fdt, by = "x1", sorted = TRUE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE)) test(1639.104, split(fdt, by = "x3", sorted = TRUE), split(fdt, by = "x3", flatten = FALSE, sorted = TRUE)) test(1639.105, split(fdt, by = "x1", sorted = TRUE, drop = TRUE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE, drop = TRUE)) test(1639.106, split(fdt, by = "x1", sorted = TRUE, keep.by = FALSE), split(fdt, by = "x1", flatten = FALSE, sorted = TRUE, keep.by = FALSE)) test(1639.107, unlist(split(fdt, by = c("x1","x2"), sorted = TRUE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = TRUE)) # by two variables - match after unlist nested one # sorted=TRUE test(1639.108, unlist(split(fdt, by = c("x1","x2"), sorted = FALSE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = FALSE)) # sorted=FALSE test(1639.109, unlist(split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE, flatten = FALSE), recursive = FALSE), split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE)) # drop.by=TRUE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L # this will dispatch to `interaction(x1, x2)` which results into different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2) test(1639.110, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT))# vs split.data.frame by 2L drop=FALSE test(1639.111, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT))# vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE) test(1639.112, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs 
split.data.frame by 2L drop=TRUE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3)) # vs split.data.frame by 3L test(1639.113, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=FALSE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3), drop=TRUE) test(1639.114, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=TRUE fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # empty levels in factors x2 = as.factor(c("a", as.character(x2)))[-1L], x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)], y = y)] sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L # this will dispatch to `interaction(x1, x2)` which results into different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2) test(1639.115, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE test(1639.116, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE) test(1639.117, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3)) # vs split.data.frame by 3L test(1639.118, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=FALSE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2, fdt$x3), drop=TRUE) test(1639.119, split(fdt, by = c("x1","x2","x3"), flatten = TRUE, sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 3L drop=TRUE 
sdf = split(as.data.frame(fdt[, .SD, .SDcols=c("x3","y")]), f=list(fdt$x1, fdt$x2)) # flatten drop.by and empty lists # this will dispatch to `interaction(x1, x2)` which results into different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2) test(1639.120, split(fdt, by = c("x1","x2"), sorted = TRUE, keep.by = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE test(1639.121, unlist(split(fdt, by = c("x1","x2"), flatten = FALSE, sorted = TRUE, keep.by = FALSE), recursive = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE, flatten=FALSE + unlist sdf = split(as.data.frame(fdt[, .SD, .SDcols=c("x3","y")]), f=list(fdt$x1, fdt$x2), drop=TRUE) test(1639.122, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE, keep.by = FALSE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE # - [x] edge cases for `by` and `sorted`, 0 rows, 1 unique value in cols, drop ---- test(1639.123, length(split(dt[0L], by = "x1")), 0L) # drop=FALSE vs split.data.frame expand list with empty levels won't work on characters, use factor with defined levels, included those unused. 
test(1639.124, length(split(as.data.frame(dt[0L]), df$x1)), 2L) # unlike data.frame because character != factor fdt = dt[, c(lapply(.SD, as.factor), list(y=y)), .SDcols=x1:x3] # factors no empty levels test(1639.125, length(split(fdt[0L], by = "x1")), 2L) test(1639.126, length(split(as.data.frame(fdt[0L]), df$x1)), 2L) # match on factors work test(1639.127, split(fdt[0L], by = "x1"), lapply(split(as.data.frame(fdt[0L]), df$x1), setDT)) # we match also on complete structure fdt = dt[, .(x1 = as.factor(c(as.character(x1), "c"))[-13L], # factors empty levels x2 = as.factor(c("a", as.character(x2)))[-1L], x3 = as.factor(c("a", as.character(x3), "z"))[c(-1L,-14L)], y = y)] sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2)) # vs split.data.frame by 2L# this will dispatch to `interaction(x1, x2)` which results into different order, see: levels(interaction(1:2,1:2)) vs CJ(1:2,1:2) test(1639.128, split(fdt, by = c("x1","x2"), sorted = TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=FALSE sdf = split(as.data.frame(fdt), f=list(fdt$x1, fdt$x2), drop=TRUE) test(1639.129, split(fdt, by = c("x1","x2"), sorted = TRUE, drop=TRUE), lapply(sdf[sort(names(sdf))], setDT)) # vs split.data.frame by 2L drop=TRUE test(1639.130, split(dt[0L], by = "x1"), structure(list(), .Names = character(0))) # 0 nrow character/factor with empty levels # no empty levels test(1639.131, split(fdt[0L], by = "x1"), lapply(c(a=1L,b=2L,c=3L), function(i) data.table(x1=factor(levels = c("a","b","c")),x2=factor(levels = c("a","c","d","e")),x3=factor(levels = c("a","e","f","g","h","z")),y=numeric()))) # expand empty levels test(1639.132, split(dt[0L], by = "x1", sorted = TRUE), structure(list(), .Names = character(0))) test(1639.133, split(fdt[0L], by = "x1", sorted = TRUE), lapply(c(a=1L,b=2L,c=3L), function(i) data.table(x1=factor(levels = c("a","b","c")),x2=factor(levels = c("a","c","d","e")),x3=factor(levels = c("a","e","f","g","h","z")),y=numeric()))) # same as none sorted 
as all appended on the end in sorted order due to lack of data dt2 = copy(dt)[, "l" := lapply(1:12, function(i) i)] # non-atomic type to 'by' should raise error test(1639.134, split(dt2, by = "l"), error = "argument 'by' must refer only to atomic type columns, classes of 'l' columns are not atomic type") # - [x] additional tests for names consistency with data.frame, and current examples in SO df = data.frame(product = c("b", "a", "b", "a"), value = c(sample(1:10,4)), year = c(2001, 2001, 2000, 2000)) tmp = as.data.table(df)[, list(grp=list(.SD)), by=.(product, year), .SDcols=names(df)] # http://stackoverflow.com/a/33068928/2490497 setattr(ans <- tmp$grp, 'names', paste(tmp$product, tmp$year, sep=".")) dt = as.data.table(df) # http://stackoverflow.com/q/33068791/2490497 dt[, grp := .GRP, by = list(product,year)] setkey(dt, grp) o2 = dt[, list(list(.SD)), by = grp]$V1 setattr(o2, 'names', paste(tmp$product, tmp$year, sep=".")) # names reused test(1639.135, o2, ans) lapply(ans, setattr, ".data.table.locked", NULL) sort.by.names = function(x) x[sort(names(x))] test(1639.136, sort.by.names(ans), sort.by.names(split(as.data.table(df), f=list(df$product, df$year)))) test(1639.137, sort.by.names(ans), sort.by.names(unlist(split(setDT(df), by=c("product","year"), flatten = FALSE), recursive = FALSE))) test(1639.138, ans, split(as.data.table(df), by=c("product","year"))) test(1639.139, sort.by.names(ans), sort.by.names(unlist(split(as.data.table(df), by=c("product","year"), flatten=FALSE), recursive = FALSE))) # test if split preallocate columns in results #1908 if (base::getRversion() > "3.0.0") { dt = data.table(x=rexp(100),y=rep(LETTERS[1:10], 10)) dtL = split(dt, by = "y") test(1639.140, dim(dtL[[1]][, x2 := -x]), c(10L,3L)) test(1639.141, all(sapply(dtL, truelength) > 1000)) } # allow x's cols (specifically x's join cols) to be referred to using 'x.' syntax # patch for #1615. 
Note that I specifically have not implemented x[y, aa, on=c(aa="bb")] # to refer to x's join column as well because x[i, col] == x[i][, col] will not be TRUE anymore.. x <- data.table(aa = 1:3, cc = letters[1:3]) y <- data.table(bb = 3:5, dd = 3:1) test(1640.1, x[y, x.aa, on=c(aa="bb")], INT(3,NA,NA)) test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c("c", NA,NA), x.aa=INT(3,NA,NA))) # tests for non-equi joins # function to create a random data.table with all necessary columns # set.seed(45L) # for testing.. set.seed(unclass(Sys.time())) nq_fun = function(n=100L) { i1 = sample(sample(n, 10L), n, TRUE) i2 = sample(-n/2:n/2, n, TRUE) i3 = sample(-1e6:1e6, n, TRUE) i4 = sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE) d1 = sample(rnorm(10L), n, TRUE) d2 = sample(rnorm(50), n, TRUE) d3 = sample(c(Inf, -Inf, NA, NaN, runif(10L)), n, TRUE) d4 = sample(c(NA, NaN, rnorm(10L)), n, TRUE) c1 = sample(letters[1:5], n, TRUE) c2 = sample(LETTERS[1:15], n, TRUE) dt = data.table(i1,i2,i3,i4, d1,d2,d3,d4, c1,c2) if ("package:bit64" %in% search()) { I1 = as.integer64(sample(sample(n, 10L), n, TRUE)) I2 = as.integer64(sample(-n/2:n/2, n, TRUE)) I3 = as.integer64(sample(-1e6:1e6, n, TRUE)) I4 = as.integer64(sample(c(NA_integer_, sample(-n:n, 10L, FALSE)), n, TRUE)) dt = cbind(dt, data.table(I1,I2,I3,I4)) } dt } dt1 = nq_fun(400L) dt2 = nq_fun(50L) x = na.omit(dt1) y = na.omit(dt2) nqjoin_test <- function(x, y, k=1L, test_no, mult="all") { ops = c("==", ">=", "<=", ">", "<") xclass = sapply(x, class) runcmb = combn(names(x), k) runcmb = as.data.table(runcmb[, 1:min(100L, ncol(runcmb)), drop=FALSE]) # max 100 combinations to test runops = lapply(runcmb, function(cols) { thisops = sample(ops, k, TRUE) thisops[grepl("^c", cols)] = "==" thisops }) is_only_na <- function(x) is.na(x) & !is.nan(x) is_int64 <- function(x) "integer64" %in% class(x) construct <- function(cols, vals, ops) { expr = lapply(seq_along(cols), function(i) { if (is_int64(vals[[i]])) 
{ if (is.na.integer64(vals[[i]])) if (ops[i] %in% c(">", "<")) quote(integer(0)) else as.call(list(quote(is.na.integer64), as.name(cols[[i]]))) else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), as.integer(vals[[i]]))) # don't know how to construct a call with int64 -- vals[[i]] gets converted to NAN } else { if (is.nan(vals[[i]])) if (ops[i] %in% c(">", "<")) quote(integer(0)) else as.call(list(quote(is.nan), as.name(cols[[i]]))) else if (is_only_na(vals[[i]])) if (ops[i] %in% c(">", "<")) quote(integer(0)) else as.call(list(quote(is_only_na), as.name(cols[[i]]))) else as.call(list(as.name(ops[[i]]), as.name(cols[[i]]), vals[[i]])) } }) ans = expr[[1L]] lapply(expr[-1L], function(e) ans <<- as.call(list(quote(`&`), ans, e))) ans } check <- function(x, y, cols, ops, mult="all") { expr = lapply(1:nrow(y), function(i) { expr = construct(cols, as.list(y[i, cols, with=FALSE]), ops) }) ans = lapply(expr, function(e) { val = x[eval(e)] if (!nrow(val)) return(val) val = if (mult=="first") val[1L] else if (mult=="last") val[.N] else val }) rbindlist(ans) } nq <- function(x, y, cols, ops, nomatch=0L, mult="all") { sd_cols = c(paste("x.", cols, sep=""), setdiff(names(x), cols)) ans = x[y, mget(sd_cols, as.environment(-1)), on = paste(cols, ops, cols, sep=""), allow=TRUE, nomatch=nomatch, mult=mult] setnames(ans, gsub("^x[.]", "", names(ans))) setcolorder(ans, names(x))[] } for (i in seq_along(runcmb)) { thiscols = runcmb[[i]] thisops = runops[[i]] # cat("k = ", k, "\ti = ", i, "\t thiscols = [", paste(thiscols,collapse=","), "]\t thisops = [", paste(thisops,collapse=","), "]\t ", sep="") ans1 = nq(x, y, thiscols, thisops, 0L, mult=mult) ans2 = check(x, y, thiscols, thisops, mult=mult) test_no = signif(test_no+.001, 7) test(test_no, all.equal(ans1,ans2,ignore.row.order=TRUE), TRUE) # if (identical(all.equal(ans1,ans2,ignore.row.order=TRUE), TRUE)) cat("successful\n") else stop("failed\n") } } if (TRUE) { # turn off to FALSE temporarily if needed to rerun valgrind, as 
very slow # without NAs in x and i nqjoin_test(x, y, 1L, 1641.0, mult="all") nqjoin_test(x, y, 2L, 1642.0, mult="all") nqjoin_test(x, y, 1L, 1643.0, mult="first") nqjoin_test(x, y, 2L, 1644.0, mult="first") nqjoin_test(x, y, 1L, 1645.0, mult="last") nqjoin_test(x, y, 2L, 1646.0, mult="last") # with NAs in x and i nqjoin_test(dt1, dt2, 1L, 1647.0, mult="all") nqjoin_test(dt1, dt2, 2L, 1648.0, mult="all") nqjoin_test(dt1, dt2, 1L, 1649.0, mult="first") nqjoin_test(dt1, dt2, 2L, 1650.0, mult="first") nqjoin_test(dt1, dt2, 1L, 1651.0, mult="last") nqjoin_test(dt1, dt2, 2L, 1652.0, mult="last") } # TODO: add tests for nomatch=NA.. # tested, but takes quite some time.. so commenting for now # nqjoin_test(x, y, 3L,1643.0) # nqjoin_test(dt1,dt2,3L,1652.0) # nqjoin_test( x,dt2,1L,1644.0) # without NA only in x # nqjoin_test( x,dt2,2L,1645.0) # nqjoin_test( x,dt2,3L,1646.0) # nqjoin_test(dt1, y,1L,1647.0) # without NA only in i # nqjoin_test(dt1, y,2L,1648.0) # nqjoin_test(dt1, y,3L,1649.0) # test for the issues Jan spotted... dt = data.table(id="x", a=as.integer(c(3,8,8,15,15,15,16,22,22,25,25)), b=as.integer(c(9,10,25,19,22,25,38,3,9,7,28)), c=as.integer(c(22,33,44,14,49,44,40,25,400,52,77))) set.seed(1L) dt=dt[sample(.N)] test(1653.1, uniqueN(dt[dt, .(x.id, x.a, x.b, x.c, i.id, i.a, i.b, i.c), which=FALSE, on = c("id==id","a>=a","b>=b"), allow.cartesian=TRUE]), 42L) test(1653.2, x[y, .(x.i1, x.i2, x.i3, x.i4, x.d1, x.d2, x.d3, x.d4, x.c1, x.c2, i.i1, i.i2, i.i3, i.i4, i.d1, i.d2, i.d3, i.d4, i.c1, i.c2), on = c("i4==i4", "i1>=i1", "d4<=d4", "i3==i3", "d3>d3", "i2>i2", "d2>=d2", "d1>d1"), allow.cartesian = TRUE], x[y, .(x.i1, x.i2, x.i3, x.i4, x.d1, x.d2, x.d3, x.d4, x.c1, x.c2, i.i1, i.i2, i.i3, i.i4, i.d1, i.d2, i.d3, i.d4, i.c1, i.c2), on = c("i4==i4", "i1>=i1", "d4<=d4", "i3==i3", "d3>d3", "i2>i2", "d2>=d2", "d1>d1"), allow.cartesian = TRUE]) # ensuring there are no warnings here really.. 
# error on any op other than "==" on char type dt1 = data.table(x=sample(letters[1:2], 10, TRUE), y=sample(c(1L,5L,7L), 10, TRUE), z=1:10, k=11:20) dt2 = data.table(x=c("b", "a"), y=c(1L, 9L)) test(1654, dt1[dt2, on="x>x"], error="Only '==' operator") # on= with .() syntax, #1257 dt1 = data.table(x=sample(letters[1:2], 10, TRUE), y=sample(c(1L,5L,7L), 10, TRUE), z=1:10, k=11:20) dt2 = data.table(x=c("b", "a"), y=c(1L, 9L)) test(1655.1, dt1[dt2, on=.(x)], dt1[dt2, on="x"]) test(1655.2, dt1[dt2, on=.(x==x)], dt1[dt2, on=c("x==x")]) test(1655.3, dt1[dt2, on=.(x==x)], dt1[dt2, on=c("x"="x")]) test(1655.4, dt1[dt2, on=.(y>=y)], dt1[dt2, on=c("y>=y")]) test(1655.4, dt1[dt2, on=.(x==x, y>=y)], dt1[dt2, on=c("x==x", "y>=y")]) # Patching another issue spotted by Jan dt = data.table(id="x", a=as.integer(c(3,8,8,15,15,15,16,22,22,25,25)), b=as.integer(c(9,10,25,19,22,25,38,3,9,7,28)), c=as.integer(c(22,33,44,14,49,44,40,25,400,52,77))) set.seed(1L) dt=dt[sample(.N)][, row_id := 1:.N] test(1656, nrow(dt[dt, .(x.id, x.a, x.b, x.c, x.row_id, i.id, i.a, i.b, i.c, i.row_id), on = .(c,b<=b,id,a>=a), allow.cartesian = TRUE]), 12L) # just to check that there's no warning # between is vectorised, #534 set.seed(1L) dt = data.table(x=sample(3,10,TRUE), y=sample(2,10,TRUE), z=sample(5,10,TRUE)) test(1657, dt[x %between% list(y,z)], dt[x>=y & x<=z]) oldverbose = options(datatable.verbose=FALSE) # fwrite tests # without quoting test(1658.1, fwrite(data.table(a=c(NA, 2, 3.01), b=c('foo', NA, 'bar'))), output=paste(c("a,b",",foo","2,","3.01,bar"),collapse="")) # with quoting and qmethod="escape" test(1658.2, fwrite(data.table( a=c(NA, 2, 3.01), `other column`=c('foo bar', NA, 'quote" and \\ bs \n and newline')), quote=TRUE, qmethod="escape"), output='"a","other column","foo bar"2,3.01,"quote\\" and \\\\ bs and newline"') # with quoting and qmethod="double" (default) test(1658.3, fwrite(data.table( a=c(NA, 1.2e-100, 3.01), "other \"column"=c('foo bar', NA, 'quote" and \\ bs')), quote=TRUE, 
qmethod="double"), output='"a","other ""column"\n,"foo bar"\n1.2e-100,\n3.01,"quote"" and \\ bs"\n') # presence of " triggers auto quoting as well, #1925 test(1658.4, fwrite(data.table(a=1:4, b=c('"foo','ba"r','baz"','a "quoted" region'))), output='a,b\n1,"""foo"\n2,"ba""r"\n3,"baz"""\n4,"a ""quoted"" region"') test(1658.5, fwrite(data.table(a=1:4, b=c('"foo','ba"r','baz"','a "quoted" region')), qmethod='escape'), output='a,b\n1,"\\"foo"\n2,"ba\\"r"\n3,"baz\\""\n4,"a \\"quoted\\" region"') # NB: sep2[2] triggering quoting when list columns are present is tested in test 1736 # changing sep DT = data.table(a="foo", b="ba\"r") ans = '"a";"b"\n"foo";"ba""r"\n' test(1658.41, fwrite(DT, sep=";", quote=TRUE, qmethod="double"), output=ans) test(1658.42, write.table(DT, sep=";", qmethod="double", row.names=FALSE), output=ans) ans = '"a";"b"\n"foo";"ba\\"r"\n' test(1658.43, fwrite(DT, sep=";", quote=TRUE, qmethod="escape"), output=ans) test(1658.44, write.table(DT, sep=";", qmethod="escape", row.names=FALSE), output=ans) if (.Platform$OS.type=="unix") { # on linux we can create windows format files if we want test(1658.5, fwrite(data.table(a="foo", b="bar"), eol="\r\n", quote=TRUE), output = '"a","b""foo","bar"') } # changing NA test(1658.6, fwrite(data.table(a=c("foo", NA), b=c(1, NA)), na="NA", quote=TRUE), output='"a","b"\n"foo",1\nNA,NA\n') # no col.names test(1658.7, fwrite(data.table(a="foo", b="bar"), col.names=F, quote=TRUE), output='"foo","bar"\n') test(1658.8, fwrite(data.table(a=c(1:5), b=c(1:5)), quote=TRUE), output='"a","b"\n1,1\n2,2\n3,3\n4,4\n5,5\n') # block size equal to number of rows test(1658.9, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), output='"a","b"\n1,1\n2,2\n3,3\n') # block size one bigger than number of rows test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), output='"a","b"\n1,1\n2,2\n3,3\n') # block size one less than number of rows test(1658.12, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), 
output='"a","b"\n1,1\n2,2\n3,3\n') # writing a data.frame test(1658.13, fwrite(data.frame(a="foo", b="bar"), quote=TRUE), output='"a","b"\n"foo","bar"\n') # single-column data.table test(1658.14, fwrite(data.table(a=c(1,2,3)), quote=TRUE), output='"a"\n1\n2\n3\n') # single-column data.frame test(1658.15, fwrite(data.frame(a=c(1,2,3)), quote=TRUE), output='"a"\n1\n2\n3\n') # different column types test(1658.16, fwrite(data.table( factor1=as.factor(c('foo', 'bar')), factor2=as.factor(c(NA, "baz")), bool=c(TRUE,NA), ints=as.integer(c(NA, 5))), na='na', quote=TRUE), output='"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5\n') # empty data table (headers but no rows) empty_dt <- data.table(a=1, b=2)[0,] test(1658.17, fwrite(empty_dt, quote=TRUE), output='"a","b"\n') # data.table with duplicate column names test(1658.18, fwrite(data.table(a=1, a=2), quote=TRUE), output='"a","a"\n1,2\n') # number of significant digits = 15 test(1658.19, fwrite(data.table(a=1/0.9), quote=TRUE), output='"a"\n1.11111111111111\n') # test append f = tempfile() fwrite(data.table(a=c(1,2), b=c('a', 'b')), f, quote=TRUE) fwrite(data.table(a=c(3,4), b=c('c', 'd')), f, append=TRUE, quote=TRUE) test(1658.21, readLines(f), c('"a","b"','1,"a"','2,"b"','3,"c"','4,"d"')) unlink(f) # simple data table (reference for the error cases below) ok_dt <- data.table(foo="bar") test(1658.22, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"\n') options(oldverbose) # wrong argument types test(1658.23, fwrite(ok_dt, 1), error="is.character(file).*not TRUE") test(1658.24, fwrite(ok_dt, quote=123), error="identical(quote.*auto.*FALSE.*TRUE") test(1658.25, fwrite(ok_dt, sep="..."), error="nchar(sep)") test(1658.26, fwrite(ok_dt, qmethod=c("double", "double")), error="length(qmethod)") test(1658.27, fwrite(ok_dt, col.names="foobar"), error="isLOGICAL(col.names)") # null data table (no columns) test(1658.28, fwrite(data.table(a=1)[NULL,]), error="ncol(x) > 0L is not TRUE") ## End fwrite tests # tests 
for #679, inrange(), FR #707 dt = data.table(a=c(8,3,10,7,-10), val=runif(5)) range = data.table(start = 1:5, end = 6:10) test(1659.1, dt[a %inrange% range], dt[1:4]) test(1659.2, dt[inrange(a, range$start, range$end)], dt[1:4]) test(1659.3, dt[inrange(a, range$start, range$end, incbounds=FALSE)], dt[c(1,2,4)]) range[4, `:=`(start=-12L, end=-4L)] test(1659.4, dt[a %inrange% range], dt) # tests for non-equi joins returning columns correctly when j is missing dt1 = fread('Chr Start End Region chr6 3324 3360 Region1 chr4 2445 2455 Region2 chr1 1034 1090 Region4') dt2 = fread('Site Chr Location Gene Site1 chr4 2447 GeneB Site2 chr9 1153 GeneT Site3 chr6 3350 GeneM Site4 chr1 1034 GeneC Site5 chr1 2000 GeneU Site6 chr6 3359 GeneF Site7 chr7 1158 GeneI Site8 chr4 2451 GeneO Site9 chr6 3367 GeneZ ') test(1660.1, names(dt2[dt1, on=.(Chr, Location>=Start, Location<=End)]), c(names(dt2), "Location.1", "Region")) test(1660.2, names(dt1[dt2, on=.(Chr, Start<=Location, End>=Location)]), c(names(dt1), "Site", "Gene")) # `names<-` should NOT modify by reference #1015 DT = data.table(x=1, y=2) nn = names(DT) test(1661.1, {names(DT) <- c("k", "m"); nn}, c("x","y"), warning=if (base::getRversion()>="3.1.0") NULL else "Please upgrade") test(1661.2, names(DT), c("k","m")) # as.Date.IDate won't change the class if xts package loaded #1500 if ("package:zoo" %in% search()) { x = as.IDate("2016-01-15") require(zoo) test(1662, class(as.Date(x)), "Date") } else { cat("Test 1662 not run. If required call library(zoo) first.\n") } # IDate support in as.xts.data.table #1499 if ("package:xts" %in% search()) { dt <- data.table(date = c(as.IDate("2014-12-31"), as.IDate("2015-12-31"), as.IDate("2016-12-31")), nav = c(100,101,99), key = "date") dt.xts <- as.xts.data.table(dt) test(1663, dt.xts[1L], xts::xts(data.table(nav=100), order.by=as.Date("2014-12-31"))) } else { cat("Test 1663 not run. 
If required call library(xts) first.\n") } # fwrite crash on very large number of columns (say 100k) set.seed(123) m <- matrix(runif(3*100000), nrow = 3) DT <- as.data.table(m) f <- tempfile() system.time(fwrite(DT, f, eol='\n', quote=TRUE)) # eol fixed so size test passes on Windows system.time(fwrite(DT, f, eol='\n', quote=TRUE)) # run again to force seg fault test(1664, abs(file.info(f)$size %/% 100000 - 62) <= 1.5) # file size appears to be 34 bytes bigger on Windows (6288931 vs 6288965) unlink(f) # rbindlist support for complex type dt1 = data.table(x=1L, y=2+3i) dt2 = data.table(x=0:101, y=3+sample(102)*1i) test(1665.1, rbindlist(list(dt1,dt2)), setDT(rbind(as.data.frame(dt1), as.data.frame(dt2)))) # print method now works (when rows > 100 it uses rbind/rbindlist internally) test(1665.2, ans <- capture.output(dt2), ans) # just checking that it doesn't error, really. # Use existing index even when auto index is disabled #1422 d = data.table(k=3:1) # subset - no index options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) test(1666.1, d[k==1L, verbose=TRUE], d[3L], output="Creating new index 'k'") d = data.table(k=3:1) options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE) test(1666.2, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # do not create index d = data.table(k=3:1) options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE) test(1666.3, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) d = data.table(k=3:1) options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE) test(1666.4, grep("Creating new index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) d = data.table(k=3:1) # subset - index setindex(d, k) options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) test(1666.5, d[k==1L, verbose=TRUE], d[3L], output="Using existing index 'k'") options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE) test(1666.6, d[k==1L, verbose=TRUE], 
d[3L], output="Using existing index 'k'") options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE) test(1666.7, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) # not using existing index options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE) test(1666.8, grep("Using existing index", capture.output(d[k==1L, verbose=TRUE])), integer(0)) d1 = data.table(k=3:1) # join - no index d2 = data.table(k=2:4) options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) test(1666.9, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="ad hoc") options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE) test(1666.11, d1[d2, on="k", verbose=TRUE], d1[d2, on="k"], output="ad hoc") options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE) test(1666.12, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0)) # not looking for index options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE) test(1666.13, grep("Looking for existing (secondary) index", capture.output(d1[d2, on="k", verbose=TRUE])), integer(0)) d1 = data.table(k=3:1,v1=10:12) # join - index d2 = data.table(k=2:4,v2=20:22) setindex(d1, k) ans = data.table(k=2:4, v1=c(11L,10L,NA), v2=20:22) options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) test(1666.14, d1[d2, on="k", verbose=TRUE], ans, output="existing index") options("datatable.use.index"=TRUE, "datatable.auto.index"=FALSE) test(1666.15, d1[d2, on="k", verbose=TRUE], ans, output="existing index") options("datatable.use.index"=FALSE, "datatable.auto.index"=FALSE) test(1666.16, d1[d2, on="k", verbose=TRUE], ans, output='ad hoc') options("datatable.use.index"=FALSE, "datatable.auto.index"=TRUE) test(1666.17, d1[d2, on="k", verbose=TRUE], ans, output='ad hoc') # reset defaults options("datatable.use.index"=TRUE, "datatable.auto.index"=TRUE) #testing fix to #1654 (dcast should only error when _using_ duplicated names) DT <- 
data.table(a = 1:4, a = 1:4, id = rep(1:4, 2), V1 = 8:1) test(1667.1, dcast(DT, id ~ rowid(id), value.var = "V1"), output = " id 1 21: 1 8 42: 2 7 33: 3 6 24: 4 5 1") DT <- data.table(a = 1:4, id = 1:4, id = rep(1:4, 2), V1 = 8:1) test(1667.2, dcast(DT, id ~ rowid(id), value.var = "V1"), error = "data.table to cast") # fix for #1672 test(1668, chmatch(c("a","b"), c("a","c"), nomatch = integer()), c(1L, NA_integer_)) # fix for #1650, segfault in rolling joins resulting from fixing #1405. x = data.table(Date = as.Date(c("2015-12-29", "2015-12-29", "2015-12-29", "2015-12-29", "2016-01-30", "2016-01-30", "2016-01-30", "2016-01-30", "2016-02-29", "2016-02-29", "2016-02-29", "2016-02-29", "2016-03-26", "2016-03-26", "2016-03-26", "2016-03-26")), ID = c("A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D", "A", "B", "C", "D"), Value = c("A201512", "B201512", "C201512", "D201512", "A201601", "B201601", "C201601", "D201601", "A201602", "B201602", "C201602", "D201602", "A201603", "B201603", "C201603", "D201603"), key = c('Date', 'ID')) y = CJ(Date = as.Date(c("2015-12-31", "2016-01-31", "2016-02-29", "2016-03-31")), ID = unique(x$ID)) test(1669, x[y, on=c("ID", "Date"), roll=TRUE, which=TRUE], 1:16) # 1680 fix, fread header encoding issue x = "Stra\xdfe" Encoding(x) = "latin1" nm = names(fread("1680-fread-header-encoding.csv", encoding="Latin-1")) test(1670, nm[2], x) # as.data.table must return a copy even if 'x' is a data.table x = data.table(a=1, b=2) test(1670.1, address(x) != address(as.data.table(x)), TRUE) setattr(x, 'class', c('a', class(x))) test(1670.2, class(as.data.table(x)), class(x)[2:3]) # #1676, `:=` with by shouldn't add cols on supported types dt = data.table(x=1, y=2) test(1671, dt[, z := sd, by=x], error="invalid type/length (closure/1)") # 1683 DT <- data.table(V1 = rep(1:2, 3), V2 = 1:6) test(1672.1, DT[ , .(.I[1L], V2[1L]), by = V1], output = " V1 V1 V21: 1 1 12: 2 2 2") #make sure GForce operating test(1672.2, DT[ , .(.I[1L], V2[1L]), by = V1, 
verbose = TRUE], output = "GForce optimized j") #make sure works on .I by itself test(1672.3, DT[ , .I[1L], by = V1], output = " V1 V11: 1 12: 2 2") #make sure GForce here as well test(1672.4, DT[ , .I[1L], by = V1, verbose = TRUE], output = "GForce optimized j") #make sure works with order test(1672.5, DT[order(V1), .I[1L], by = V1], output = " V1 V11: 1 12: 2 2") # should also work with subsetting test(1672.6, DT[1:5, .(.I[1L], V2[1L]), by = V1], output = " V1 V1 V21: 1 1 12: 2 2 2") #tests for #1528 TT <- as.IDate("2016-04-25") test(1673.1, TT + 4L, as.IDate("2016-04-29")) test(1673.2, TT + 4, as.IDate("2016-04-29")) test(1673.3, TT - 3, as.IDate("2016-04-22")) test(1673.4, TT - 3L, as.IDate("2016-04-22")) test(1673.5, as.IDate("2016-04-28") - as.IDate("2016-04-20"), 8L) # test for radix integer order when MAXINT is present AND decreasing=TRUE AND na.last=FALSE # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=16925 # It seems this 'just' fails ASAN, but also results in seg fault under some compilers # https://github.com/rstudio/shiny/issues/1200 test(1674, forderv(c(2147483645L, 2147483646L, 2147483647L, 2147483644L), order=-1L), c(3,2,1,4)) # fix for #1718 # In R-devel somwhere between 12 June 2017 (r72786) and 27 June 2017 (r72859), the behaviour of factor() changed. # Test updated minimally to create the previous representation directly instead of going via factor(). 
A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6)) A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)] B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = structure(c(3L, 3L, 3L, 1L, 2L, NA), .Label=c("Boop","Beep",NA), class="factor")) test(1675.1, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:3,1:2,NA)) A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6)) B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = c(NA, NA, NA, 4, 5, 6)) A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)] B[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)] test(1675.2, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:2,NA,1:2,NA)) # fwrite na arg segfault fix, #1725 dt = data.table(x=1:2, y=c(NA,"a")) f = tempfile() test(1676.1, fwrite(dt, f, na=NULL), error="is not TRUE") fwrite(dt, f, na=NA) test(1676.2, fread(f), data.table(x=1:2, y=c(NA, "a"))) unlink(f) # duplicate names in foverlaps #1730 a = data.table(start = 1:5, end = 2:6, c2 = rnorm(10), c2 = rnorm(10), key=c("start","end")) b = data.table(start = 1:5, end = 2:6, c3 = rnorm(5), key=c("start","end")) test(1677.1, foverlaps(a, b), error="x has some duplicated column") test(1677.2, foverlaps(b, a), error="y has some duplicated column") # na.omit.data.table removes indices #1734 dt = data.table(a=4:1, b=c(letters[c(1L,NA,2:3)])) setindexv(dt, "a") test(1678.1, indices(dt2 <- na.omit(dt, cols="b")), NULL) setindexv(dt2, "a") test(1678.2, indices(na.omit(dt2, cols="b")), "a") # rleid gains `prefix` argument, similar to rowid x = sample(3,10,TRUE) test(1679.1, rleid(x, prefix="id"), paste0("id", rleid(x))) test(1679.2, rleidv(x, prefix="id"), paste0("id", rleidv(x))) # melt.data.table call along with patterns from within a function, #1749 x = data.table(x1=1:2, x2=3:4, y1=5:6, y2=7:8, z1=9:10, z2=11:12) foo <- function(x) { pats = c("^y", "z") melt(x, measure.vars=patterns(pats)) } test(1680.1, foo(x), melt(x, measure.vars=patterns("^y", "^z"))) # 
melt warning prints only first 5 cols, #1752 DT = fread("melt-warning-1752.tsv") ans = suppressWarnings(melt(DT[, names(DT) %like% "(^Id[0-9]*$)|GEOGRAPHIC AREA CODES", with=FALSE], id=1:2)) test(1681, melt(DT[, names(DT) %like% "(^Id[0-9]*$)|GEOGRAPHIC AREA CODES", with=FALSE], id=1:2), ans, warning="are not all of the same type") # non-equi joins with by=.EACHI, not as exhaustive, but given the previous # tests were, this should be fine.. we'll add tests as we go along. set.seed(45L) dt1 = data.table(x=sample(8,20,TRUE), y=sample(8,20,TRUE), z=1:20) dt2 = data.table(c(2,5), c(5,7), c(2,4)) dt3 = data.table(c(12,5), c(15,7), c(2,4)) test(1682.1, dt1[dt2, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, .N, by=.(x,y)]) test(1682.2, dt1[dt2, sum(z), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z), by=.(x,y)]) test(1682.3, dt1[dt2, as.numeric(median(z)), by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt2, on=.(x>=V1, y<=V2)][, median(z), by=.(x,y)]) test(1682.4, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2)], dt1[dt3, on=.(x>=V1, y<=V2)][, .(N=sum(!is.na(z))), by=.(x,y)]) test(1682.5, dt1[dt3, .N, by=.EACHI, on=.(x>=V1, y<=V2), nomatch=0L], dt1[dt3, on=.(x>=V1, y<=V2), nomatch=0L][, .N, by=.(x,y)]) test(1682.6, dt1[dt2, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt2, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x,y)]) test(1682.7, dt1[dt3, on=.(x>=V1, y<=V2), sum(z)*V3, by=.EACHI], dt1[dt3, on=.(x>=V1, y<=V2)][, sum(z)*V3[1L], by=.(x,y)]) # add test for update operation idx = dt1[dt2[1], which=TRUE, on=.(x>=V1, y<=V2)] test(1682.8, copy(dt1)[dt2[1], z := 2L*z, by=.EACHI, on=.(x>=V1, y<=V2)], copy(dt1)[(idx), z := 2L*z]) # test for add by reference test(1682.9, copy(dt1)[dt2[1], foo := z, by=.EACHI, on=.(x>=V1, y<=V2)], copy(dt1)[(idx), foo := z]) # test for nomatch=0L with by=.EACHI fix for non-equi joins dt = data.table(x=c(1,4,7,10), y=c(6,12,18,24), z=4:1) test(1683.1, dt[.(c(2,15), c(100,25)), sum(z), on=.(x>=V1, y<=V2), by=.EACHI], 
data.table(x=c(2,15), y=c(100,25), V1=c(6L, NA))) test(1683.2, dt[.(c(2,15), c(100,25)), sum(z), on=.(x>=V1, y<=V2), by=.EACHI, nomatch=0L], data.table(x=2, y=100, V1=6L)) # unique should remove index #1760 dt <- data.table(a = c("1", "1", "2", "2", "3", "4", "4", "4"), b = letters[1:8], d = c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE)) dt[d == TRUE, `:=`(b = "M")] # create index udt <- unique(dt, by = c("a", "b")) test(1684, nrow(udt[d == TRUE]), 2) # #1758, data.table print issue foo <- function(annot=c("a", "b")) { dt = data.table(x=annot, y=NA) ro = structure(list(dt=dt), class="dtu") suppressWarnings(ro$dt[, flag := TRUE]) ro } old = options(datatable.verbose=FALSE) test(1685, grep("dtu", capture.output(foo())), 7L) options(old) # fix for #1771 test(1686.1, uniqueN(1L), 1L) test(1685.2, uniqueN(1L, na.rm=TRUE), 1L) # fix for #1744 DT = data.table(ID = 1:2, A = 3:4, B = 5:6) test(1686.1, DT[, .(A,B)], DT[, c(mget("A"), .SD), .SDcols="B"]) test(1686.2, DT[, .(V1=A,B)], DT[, c(.(get("A")), .SD), .SDcols="B"]) # tests for first test(1687.1, first(1:5), 1L) test(1687.2, first(data.table(x=1:5, y=6:10)), data.table(x=1L, y=6L)) if ("package:bit64" %in% search()) { # fix for #1385 and part of #1459 x1 = data.table(id=1, value=as.integer64(1)) x2 = data.table(id=c(1,2)) test(1688.1, merge(x2, x1, by="id", all.x=TRUE)$value, as.integer64(c(1,NA))) x1 = data.table(x = c(1),y = integer64(1)) x2 = data.table(x = c(1,2)) test(1688.2, merge(x1, x2, all=TRUE, by="x")$y, as.integer64(c(0, NA))) } test(1689, capture.output(IDateTime(as.POSIXct("2016/01/13 17:00", tz = "America/Los_Angeles"))), c(" idate itime", "1: 2016-01-13 17:00:00")) # fix for #1766 and #1704 A = data.table(i = 1:6, j = rep(1:2, 3), x = letters[1:6], key = "i") B = data.table(j = 1:2, y = letters[1:2], key = "j") test(1690.1, key(A[B, on = "j"]), NULL) test(1690.2, key(A[B, on = "j"]), NULL) dt <- data.table( origin = c("A", "A", "A", "A", "A", "A", "B", "B", "A", "A", "C", "C", "B", "B", "B", 
"B", "B", "C", "C", "B", "A", "C", "C", "C", "C", "C", "A", "A", "C", "C", "B", "B"), destination = c("A", "A", "A", "A", "B", "B", "A", "A", "C", "C", "A", "A", "B", "B", "B", "C", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "C", "C", "B", "B", "C", "C"), points_in_dest = c(5, 5, 5, 5, 4, 4, 5, 5, 3, 3, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 4, 3, 3, 3, 5,5, 3, 3, 4, 4, 3, 3), depart_time = c(7, 8, 16, 18, 7, 8, 16, 18, 7, 8, 16, 18, 7, 8, 16, 7, 8, 16, 18, 8, 16, 7, 8, 18, 7, 8, 16, 18, 7, 8, 16, 18), travel_time = c(0, 0, 0, 0, 70, 10, 70, 10, 10, 10, 70, 70, 0, 0, 0, 70, 10, 10, 70, 70, 10, 0, 0, 0, 10, 70, 10, 70, 10, 70, 70, 10)) dt[ depart_time<=8 & travel_time < 60, condition1 := TRUE] dt[ depart_time>=16 & travel_time < 60, condition2 := TRUE] setkey(dt, origin, destination) res <- unique(dt[(condition1)],by=key(dt))[unique(dt[(condition2)], by=key(dt)), on = c(destination = "origin", origin = "destination"), nomatch = 0L] test(1690.3, res[, .(points = sum(points_in_dest)), keyby = origin], data.table(origin=LETTERS[1:3], points=c(9,7,12), key="origin")) # fix for #1626 (so that rbind plays nicely with non-list inputs, e.g., package # psych creates a list with the input data.frame/data.table and a matrix it # creates...) 
dt = data.table(x=1:5, y=6:10) test(1691, rbind(dt, dt), rbind(dt, as.matrix(dt))) # For #1783 -- subsetting a data.table by an ITime object test(1692, capture.output(as.data.table(structure(57600L, class = "ITime"))), c(" V1", "1: 16:00:00")) # testing all time part extraction routines (subsumes #874) t <- "2016-08-03 01:02:03.45" test(1693.1, second(t), 3L) test(1693.2, minute(t), 2L) test(1693.3, hour(t), 1L) test(1693.4, yday(t), 216L) test(1693.5, wday(t), 4L) test(1693.6, week(t), 31L) test(1693.7, month(t), 8L) test(1693.8, quarter(t), 3L) test(1693.9, year(t), 2016L) # fix for #1740 - sub-assigning NAs for factors dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9) ans = data.table(x = 1:5, y = factor(c(NA,"a","b","a", NA)), z = 5:9) test(1694.0, dt[y=="", y := NA], ans) # more tests for between() x = c(NaN, NA, 1, 5, -Inf, Inf) test(1695.1, x %between% c(3, 7), c(NA, NA, FALSE, TRUE, FALSE, FALSE)) test(1695.2, x %between% c(NA, 7), c(NA, NA, NA, NA, NA, FALSE)) test(1695.3, x %between% c(3, NA), c(NA, NA, FALSE, NA, FALSE, NA)) test(1695.4, x %between% c(NA, NA), rep(NA, 6L)) x = c(NA, 1L, 5L) test(1695.5, x %between% c(3, 7), c(NA, FALSE, TRUE)) test(1695.6, x %between% c(NA, 7), c(NA, NA, NA)) test(1695.7, x %between% c(3, NA), c(NA, FALSE, NA)) test(1695.8, x %between% c(NA, NA), rep(NA, 3L)) x = rep(NA_integer_, 3) test(1695.9, x %between% c(3, 7), rep(NA, 3L)) test(1695.10, x %between% c(NA, 7), rep(NA, 3L)) test(1695.11, x %between% c(3, NA), rep(NA, 3L)) test(1695.12, x %between% c(NA, NA), rep(NA, 3L)) x = integer(0) test(1695.13, x %between% c(3, 7), logical(0)) # test for #1819, verbose message for bmerge old_opt = getOption("datatable.verbose") options(datatable.verbose = TRUE) x = data.table(A = 10:17) test(1696.0, any(grepl("bmerge", capture.output(x[A %inrange% 13:14]))), TRUE) # restore verbosity options(datatable.verbose = old_opt) # adding a test for #1825 (though it is not on timing, but correctness while # joining on 
keyed tables using 'on' argument) x = data.table(a=1:3, b=4:6, key="a") y = data.table(a=2:4, c=7:9) test(1697.1, x[y], x[y, on=key(x)]) y = data.table(m=2:4, c=7:9, key="m") test(1697.2, x[y], x[y, on=c(a="m")]) # #1823, fix for 'on='' on keyed anti-joins loses key x = data.table(id = 1:10, val = letters[1:10], key = "id") y = data.table(id = 3:6, key = "id") test(1698.1, key(x[!y]), key(x[!y, on = "id"])) # minor enhancement to dcast, #1821 dt = data.table(x=c(1,1,1,2,2,2), y=1:6, z=6:1) test(1699.1, dcast(dt, x ~ ., value.var="z", fun=list(sd, mean)), data.table(x=c(1,2), z_sd=1, z_mean=c(5,2), key="x")) # minor enhancement to dcast, #1810 dt = data.table( var1 = c("a","b","c","b","d","e","f"), var2 = c("aa","bb","cc","dd","ee","ee","ff"), subtype = c("1","2","2","2","1","1","2"), type = c("A","A","A","A","B","B","B") ) test(1700.1, dcast(dt, type ~ subtype, value.var = c("var1", "var2"), fun = function(v) paste0(unique(v), collapse = "|")), data.table(type=c("A","B"), var1_1=c("a", "d|e"), var1_2=c("b|c", "f"), var2_1=c("aa", "ee"), var2_2=c("bb|cc|dd","ff"), key="type")) # fixing regression introduced on providing functionality of 'x.' 
prefix in 'j' (for non-equi joins) A = data.table(x=c(1,1,1,2,2), y=1:5, z=5:1) B = data.table(x=c(2,3), val=4) col1 = "y" col2 = "x.y" test(1701.1, A[, .(length(x), length(y)), by=x], data.table(x=c(1,2), V1=1L, V2=c(3:2))) test(1701.2, A[, .(x), by=x], data.table(x=c(1,2), x=c(1,2))) test(1701.3, A[B, x.x, on="x"], c(2,2,NA)) test(1701.4, A[B, x.y, on="x"], c(4:5,NA)) test(1701.5, A[B, .(get("x"), get("x.x")), on="x"], data.table(V1=c(2,2,3), V2=c(2,2,NA))) test(1701.6, A[B, mget(c("x", "x.x")), on="x"], data.table(x=c(2,2,3), x.x=c(2,2,NA))) # 1761 fix as well test(1701.6, A[B, .(x.x, get("x.x"), x.y), on="x", by=.EACHI], data.table(x=c(2,2,3), x.x=c(2,2,NA), V2=c(2,2,NA), x.y=c(4:5,NA))) dt = data.table(a=1L) test(1701.7, dt[dt, .(xa=x.a, ia=i.a), .EACHI, on="a"], data.table(a=1L, xa=1L, ia=1L)) # ISO 8601-consistent week numbering, #1765 # test cases via https://en.wikipedia.org/wiki/ISO_week_date test_cases <- c("2005-01-01", "2005-01-02", "2005-12-31", "2007-01-01", "2007-12-30", "2007-12-31", "2008-01-01", "2008-12-28", "2008-12-29", "2008-12-30", "2008-12-31", "2009-01-01", "2009-12-31", "2010-01-01", "2010-01-02", "2010-01-03") test_values <- c(53L, 53L, 52L, 1L, 52L, 1L, 1L, 52L, 1L, 1L, 1L, 1L, 53L, 53L, 53L, 53L) test(1702, isoweek(test_cases), test_values) # fread, ensure no shell commands #1702 if (.Platform$OS.type=="unix") { cat("a,b\n1,2", file=f<-tempfile()) cmd <- sprintf("cat %s", f) test(1703.1, fread(cmd), data.table(a=1L, b=2L)) test(1703.2, fread(file=cmd), error=sprintf("Provided file '%s' does not exists", cmd)) unlink(f) } # Ensure all.equal respects 'check.attributes' w.r.t. column names. As testthat::check_equivalent relies on this # as used by package popEpi in its tests test(1704, all.equal(data.table( a=1:3, b=4:6 ), data.table( A=1:3, B=4:6 ), check.attributes=FALSE)) if (.Platform$OS.type!="windows") { # On Windows: "'mc.cores' > 1 is not supported on Windows". # parallel package isn't parallel on Windows, but data.table is. 
if ("package:parallel" %in% search()) { #1745 and #1727 test(1705, getDTthreads()<=2) # this was set at the top of tests.Rraw. CRAN's policy is max two. # not 1, to pass on Rdevel clag UBSAN and ASAN without OpenMP lx <- replicate(4, runif(1e5), simplify=FALSE) f <- function(mc.cores = 2, threads = 2) { setDTthreads(threads) invisible(mclapply(lx, function(x) fsort(x), mc.cores = mc.cores)) } f(1, 1) # was always ok f(2, 1) # was always ok f(1, 2) # was always ok f(2, 2) # used to hang. Now should not because data.table auto switches to single threaded # commenting out avoid_openmp_hang_within_fork() confirms this test catches catches the hang test(1706, getDTthreads()<=2) # Tests that it reverts to previous state after use of mclapply } else { cat("Tests 1705 and 1706 not run. If required call library(parallel) first.\n") } } # all.equal.data.table should consider modes equal like base R (detected via Bioc's flowWorkspace tests) # If strict testing is required, then use identical(). # TODO: add strict.numeric (default FALSE) to all.equal.data.table() ? test(1707.1, all.equal( data.frame(a=0L), data.frame(a=0) ) ) test(1707.2, all.equal( data.table(a=0L), data.table(a=0) ) ) test(1708.1, !isTRUE(all.equal( data.frame(a=0L), data.frame(a=FALSE) ))) test(1708.2, all.equal( data.table(a=0L), data.table(a=FALSE) ), "Datasets have different column modes. First 3: a(numeric!=logical)") x = data.frame(a=0L) y = data.frame(a=0) setattr(y[[1]],"class",c("hello","world")) test(1709.1, !isTRUE(all.equal(x,y,check.attributes=TRUE))) # desired test(1709.2, !isTRUE(all.equal(x,y,check.attributes=FALSE))) # not desired x = as.data.table(x) y = as.data.table(y) test(1710.1, mode(x[[1]]) == mode(y[[1]])) test(1710.2, storage.mode(x[[1]]) != storage.mode(y[[1]])) test(1710.3, class(y[[1]]), c("hello","world")) test(1710.4, all.equal(x,y,check.attributes=TRUE), # desired "Datasets have different column classes. 
First 3: a(numeric!=hello;world)") test(1710.5, isTRUE(all.equal(x,y,check.attributes=FALSE))) # desired # Include tests as-is from #1252 (unexpected NA row from logical subsets with 1-row containing NA) DT = data.table(a=1, d=NA) test(1711, DT[!is.na(a) & d == "3"], DT[0]) DT = data.table(a = c(1,2), d = c(NA,3)) test(1712, DT[!is.na(a) & d == "3"], DT[2]) test(1713, DT[d==3], DT[2]) # Test new helpful error message suggested by Jan notAColName = 1 test(1714.1, exists("notAColName")) # use a long column name to be sure it exists and unique test(1714.1, !exists("notInCallingScope")) # use a long column name to be sure it exists and unique DT = data.table(a=1:3, b=4:6) test(1715, DT[,b], 4:6) # old behaviour for sure tested before but here for context test(1716.1, DT[,notAColName], error="column name 'notAColName' is not found") # ensure it doesn't find it calling scope either test(1716.2, DT[, ..notInCallingScope], error="Variable 'notInCallingScope' is not found in calling scope") test(1716.3, DT[, notInCallingScope, with=FALSE], error="Variable 'notInCallingScope' is not found in calling scope") # Test out-of-bounds error on numeric j DT = data.table(a=1:3, b=4:6, c=7:9) test(1717, DT[,4], error="Item 1 of j is 4 which is outside the column number range.*ncol=3") test(1718, DT[,0], null.data.table()) test(1719, DT[,c(2,0,1)], data.table(b=4:6, a=1:3)) test(1720, DT[,c(-2,2)], error="j mixes positives and negatives") test(1721, DT[,c(1,0,5)], error="Item 3 of j is 5 which.*ncol=3") # to check it says Item 3 even though 0 gets removed internally # Tests to ensure auto with=FALSE of ! and - only allow symbols around : (i.e. DT[,!(colB:colE)] and not any other symbol usage inside ! and -. 
Thanks to Mark L #1864 and confirmed by Michael C with both tests added as-is DT = data.table(FieldName = c(1,2,NA,4,NA,6), rowId=1:6, removalIndex=c(2,7,0,5,10,0)) test(1722.1, DT[,!is.na(as.numeric(FieldName))], c(TRUE,TRUE,FALSE,TRUE,FALSE,TRUE)) test(1722.2, DT[,(!is.na(as.numeric(FieldName)))], c(TRUE,TRUE,FALSE,TRUE,FALSE,TRUE)) test(1723.1, DT[removalIndex>0,rowId-(2*removalIndex-1)], c(-2,-11,-5,-14)) test(1723.2, DT[removalIndex>0,(rowId-(2*removalIndex-1))], c(-2,-11,-5,-14)) DT = data.table(FieldName = c("1", "2", "3", "four", "five", "6")) test(1724.1, DT[, is.na(as.numeric(FieldName))], c(FALSE,FALSE,FALSE,TRUE,TRUE,FALSE), warning="NAs introduced by coercion") test(1724.2, DT[, !is.na(as.numeric(FieldName))], c(TRUE,TRUE,TRUE,FALSE,FALSE,TRUE), warning="NAs introduced by coercion") # Ensure NA's are added properly when a new column is added, not all the target rows are joined to, and the number of i # rows is equal or greater than the number of rows in the target table. DT = data.table(a=1:3, key="a") DT[.(4), add0:=1.1][] # didn't break due to 95e438c on 29 Sep 2016 DT[.(c(3,4)), add1:=1.1][] # didn't break DT[.(c(3,3,4)), add2:=1.1][] # did break DT[.(2:4), add3:=1.1][] # did break test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA,1.1), add3=c(NA,1.1,1.1), key="a")) # keyby= runs groups in sorted order, #606. Only relevant when j does something that depends on previous group, perhaps # by using <<-. To run in appearance order use by=. See also #1880. # It wasn't useful to always run groups in appearance order. Now we have the option and it's consistent. 
DT = data.table(grp=rep(3:1,each=3), val=1:9) lastGrp = 0L test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp], data.table(grp=1:3, V1=c(8,12,6), V2=1:3, key="grp") ) test(1726.2, lastGrp, 1L) lastGrp = -1L test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp], data.table(grp=3:1, V1=c(1,6,12), V2=1:3) ) test(1726.4, lastGrp, 7L) rm(lastGrp) # better := verbose messages, #1808 DT = data.table(a = 1:10) test(1727.1, DT[a < 5, a := 5L, verbose=TRUE], output="Assigning to 4 row subset of 10 rows") test(1727.2, DT[a < 5, a := 5L, verbose=TRUE], output="No rows match i.*Assigning to 0 row subset of 10 rows") test(1727.3, DT[0, d:=1, verbose=TRUE], data.table(a=c(rep(5L,5L),6:10), d=NA_real_), output = "Assigning to 0 row subset of 10 rows.*Added 1 new column initialized with all-NA") test(1727.4, DT[.(a=11L), on="a", c("f","g"):=.(1L,"dummy"), verbose=TRUE], data.table(a=c(rep(5L,5L),6:10), d=NA_real_, f=NA_integer_, g=NA_character_), output = "Assigning to 0 row subset of 10 rows.*Added 2 new columns initialized with all-NA") # Add test for working and no problem na.last=NA with subgroup size 2 containing 1 NA # and 2 randomly not working cases with na.last=NA size 2 with 1 NA, due to using uninitialized memory DT = data.table(x=INT(2,2,2,1,1), y=INT(1,NA,3,2,NA)) test(1728.1, DT[order(x,y,na.last=TRUE)], data.table(x=INT(1,1,2,2,2), y=INT(2,NA,1,3,NA))) test(1728.2, DT[order(x,y,na.last=FALSE)], data.table(x=INT(1,1,2,2,2), y=INT(NA,2,NA,1,3))) test(1728.3, DT[order(x,y,na.last=NA)], data.table(x=INT(1,2,2), y=INT(2,1,3))) # 1 row DT = data.table(x=NA_integer_, y=1) test(1728.4, DT[order(x,y,na.last=TRUE)], DT) test(1728.5, DT[order(x,y,na.last=FALSE)], DT) test(1728.6, DT[order(x,y,na.last=NA)], DT[0]) # 2 row with 1 NA DT = data.table(x=as.integer(c(NA,1)), y=2:3) test(1728.7, DT[order(x,y,na.last=TRUE)], DT[c(2,1)]) test(1728.8, DT[order(x,y,na.last=FALSE)], DT) test(1728.9, 
DT[order(x,y,na.last=NA)], DT[2]) # was randomly wrong test(1728.11, DT[order(x,na.last=TRUE)], DT[c(2,1)]) test(1728.12, DT[order(x,na.last=FALSE)], DT) test(1728.13, DT[order(x,na.last=NA)], DT[2]) # was randomly wrong # fwrite wrong and crash on 9.9999999999999982236431605, #1847 oldverbose=options(datatable.verbose=FALSE) test(1729.1, fwrite(data.table(V1=c(1), V2=c(9.9999999999999982236431605997495353221893310546875))), output="V1,V21,10") test(1729.2, fwrite(data.table(V2=c(9.9999999999999982236431605997495353221893310546875), V1=c(1))), output="V2,V110,1") DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9, 0.000000000000000000000999999999999999999999999, 99999999999999999999999999999.999999)) ans = "V19999999999.999.9e-169e-220.999.199.91e-211e+29" test(1729.3, fwrite(DT), output=ans) test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) options(oldverbose) # same decimal/scientific rule (shortest format) as write.csv DT = data.table(V1=c(-00000.00006, -123456789.123456789, seq.int(-1000,1000,17), seq(-1000,1000,pi*87), -1.2345678912345 * 10^(c((-30):30)), +1.2345678912345 * 10^(c((-30):30)), -1.2345 * 10^((-20):20), +1.2345 * 10^((-20):20), -1.7 * 10^((-20):20), +1.7 * 10^((-20):20), -7 * 10^((-20):20), +7 * 10^((-20):20), 0, NA, NaN, Inf, -Inf, 5.123456789e-290, -5.123456789e-290, 5.123456789e-307, -5.123456789e-307, 5.123456789e+307, -5.123456789e+307)) test(1729.5, nrow(DT), 507) oldverbose=options(datatable.verbose=FALSE) # capture.output() exact tests must not be polluted with verbosity x = capture.output(fwrite(DT,na="NA"))[-1] # -1 to remove the column name V1 y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))[-1] # One mismatch that seems to be accuracy in base R's write.csv # tmp = cbind(row=1:length(x), `fwrite`=x, `write.csv`=y) # tmp[x!=y,] # row fwrite write.csv # 177 "-1234567891234500000" "-1234567891234499840" # 238 "1234567891234500000" "1234567891234499840" # 
looking in surrounding rows for the first one shows the switch point : # tmp[175:179,] # row fwrite write.csv # 175 "-12345678912345000" "-12345678912345000" # ok # 176 "-123456789123450000" "-123456789123450000" # ok # 177 "-1234567891234500000" "-1234567891234499840" # e+18 last before switch to scientific # 178 "-1.2345678912345e+19" "-1.2345678912345e+19" # ok # 179 "-1.2345678912345e+20" "-1.2345678912345e+20" # ok test(1729.6, x[c(177,238)], c("-1234567891234500000","1234567891234500000")) x = x[-c(177,238)] y = y[-c(177,238)] test(1729.7, length(x), 505) test(1729.8, x, y) if (!identical(x,y)) print(data.table(row=1:length(x), `fwrite`=x, `write.csv`=y)[x!=y]) DT = data.table(c(5.123456789e+300, -5.123456789e+300, 1e-305,1e+305, 1.2e-305,1.2e+305, 1.23e-305,1.23e+305)) ans = c("V1","5.123456789e+300","-5.123456789e+300", "1e-305","1e+305","1.2e-305","1.2e+305","1.23e-305","1.23e+305") # explicitly check against ans rather than just comparing fwrite to write.csv so that : # i) we can easily see intended results right here in future without needing to run # ii) we don't get a false pass if fwrite and write.csv agree but are both wrong because of # a problem with the test mechanism itself or something else strange or unexpected # Exactly the same binary representation on both linux and windows (so any differences in # output are not because the value itself is stored differently) : # > cat(binary(DT[[1]]),sep="\n") # 0 11111100101 111010011010000100010111101110000100 11110100 00000100 # 1 11111100101 111010011010000100010111101110000100 11110100 00000100 # 0 00000001001 110000010110110001011100010100100101 00110101 01110101 # 0 11111110100 001000111010010100010110111010000010 11011001 10111010 # 0 00000001010 000011011010011101101010100101111100 10111001 10101101 # 0 11111110100 010111011111100101001110101100000011 01101011 10101100 # 0 00000001010 000101000110010100110011101010000110 00111110 01010001 # 0 11111110100 011001101011100100100011110110110000 
01001110 01011101 test(1729.9, fwrite(DT), output=paste(ans,collapse="")) test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse="")) DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")])) # double.eps double.neg.eps double.xmin double.xmax # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 test(1729.12, typeof(DT[[1L]]), "double") test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) if ("package:bit64" %in% search()) { test(1730.1, typeof(-2147483647L), "integer") test(1730.2, as.integer(-2147483648), NA_integer_, warning="coercion") test(1730.3, as.integer("-2147483647"), -2147483647L) test(1730.4, as.integer("-2147483648"), NA_integer_, warning="coercion") test(1730.5, as.integer64("-2147483648"), as.integer64(-2147483648)) if (.devtesting) { # these don't pass UBSAN/USAN because of the overflow being tested here, so just in dev not on CRAN test(1730.6, as.character((as.integer64(2^62)-1)*2+1), "9223372036854775807") test(1730.7, as.character((as.integer64(2^62)-1)*2+2), NA_character_, warning="integer64 overflow") test(1730.8, as.character(-(as.integer64(2^62)-1)*2-1), "-9223372036854775807") test(1730.9, as.character(-(as.integer64(2^62)-1)*2-2), NA_character_, warning="integer64.*flow") } # Currently bit64 truncs to extremes in character coercion. Don't test that in case bit64 changes in future. 
# as.integer64("-9223372036854775808") == NA # as.integer64("-9223372036854775999") == NA # as.integer64("+9223372036854775808") == 9223372036854775807 # as.integer64("+9223372036854775999") == 9223372036854775807 DT = data.table( as.integer64(c( "-9223372036854775807", # integer64 min 2^63-1 "+9223372036854775807", # integer64 max "-9223372036854775806","+9223372036854775806", # 1 below extreme just to check "0","-1","1", "NA",NA, "-2147483646", # 1 below extreme to check "-2147483647", # smallest integer in R "-2147483648", # NA_INTEGER == INT_MIN but valid integer64 "-2147483649", "+2147483646", # positives as well just in case "+2147483647", "+2147483648", "+2147483649" ))) ans = c("V1","-9223372036854775807","9223372036854775807","-9223372036854775806","9223372036854775806", "0","-1","1","__NA__","__NA__", "-2147483646","-2147483647","-2147483648","-2147483649", "2147483646","2147483647","2147483648","2147483649") test(1731.1, class(DT[[1L]]), "integer64") test(1731.2, fwrite(DT,na="__NA__"), output=paste(ans,collapse="")) f = tempfile() test(1731.3, fwrite(DT, f, na="__NA__"), NULL) test(1731.4, readLines(f), ans) unlink(f) test(1731.5, write.csv(DT,na="__NA__",row.names=FALSE,quote=FALSE), output=paste(ans,collapse="")) # write.csv works on integer64 because it calls bit64's as.character method } else { cat("Tests 1730 & 1731 not run. 
If required call library(bit64) first.\n") } # fwrite(,quote='auto' and qmethod) DT = data.table(x=c("fo,o", "foo", 'b"ar', NA, "", "NA"), "ColName,WithComma"=1:6, 'Three\nLine\nColName'=c('bar\n', "noNeedToQuote", 'a\nlong\n"sentence"', "0000", " \n ", ' "\n ')) x = capture.output(fwrite(DT,na="NA",quote=TRUE, qmethod='escape')) y = capture.output(write.table(DT,row.names=FALSE,quote=TRUE,sep=",",qmethod='escape')) test(1732.1, x, y) x = capture.output(fwrite(DT,na="NA",quote=TRUE,qmethod='double')) y = capture.output(write.table(DT,row.names=FALSE,quote=TRUE,sep=",",qmethod='double')) test(1732.2, x, y) x = capture.output(fwrite(DT,na="NA",quote=FALSE)) y = capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)) test(1732.3, x, y) f = tempfile() fwrite(DT,f,quote='auto',qmethod='escape') # write.csv / write.table don't do field-by-field quoting so can't compare to them. ans = c('x,"ColName,WithComma","Three', 'Line', 'ColName"', '"fo,o",1,"bar','"', 'foo,2,noNeedToQuote', '"b\\"ar",3,"a', 'long', "\\\"sentence\\\"\"", ',4,0000', ',5," ',' "', "NA,6,\" \\\"", " \"") test(1732.4, readLines(f), ans) fwrite(DT,f,quote='auto',qmethod='double') ans[7] = '"b""ar",3,"a' ans[9] = "\"\"sentence\"\"\"" ans[13] = "NA,6,\" \"\"" test(1732.5, readLines(f), ans) DT = data.table(A=c("foo","ba,r","baz"), B=c("AA","BB","CC"), C=c("DD","E\nE","FF")) test(1732.6, fwrite(DT, quote='auto'), output='A,B,Cfoo,AA,DD"ba,r",BB,"EE"baz,CC,FF') unlink(f) # dec="," test(1733.1, fwrite(data.table(pi),dec=","), error="dec != sep is not TRUE") test(1733.2, fwrite(data.table(c(1.2,-8.0,pi,67.99),1:4),dec=",",sep=";"), output="V1;V21,2;1-8;23,14159265358979;367,99;4") # fwrite implied and actual row.names DT = data.table(foo=1:3,bar=c(1.2,9.8,-6.0)) test(1734.1, capture.output(fwrite(DT,row.names=TRUE,quote=FALSE)), capture.output(write.csv(DT,quote=FALSE))) test(1734.2, capture.output(fwrite(DT,row.names=TRUE,quote=TRUE)), capture.output(write.csv(DT))) test(1734.3, 
fwrite(DT,row.names=TRUE,quote='auto'), # same other than 'foo' and 'bar' column names not quoted output="\"\",foo,bar\"1\",1,1.2\"2\",2,9.8\"3\",3,-6") DF = as.data.frame(DT) test(1734.4, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), capture.output(write.csv(DF,quote=FALSE))) test(1734.5, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), capture.output(write.csv(DF))) rownames(DF)[2] = "someName" rownames(DF)[3] = "another" test(1734.6, capture.output(fwrite(DF,row.names=TRUE,quote=FALSE)), capture.output(write.csv(DF,quote=FALSE))) test(1734.7, capture.output(fwrite(DF,row.names=TRUE,quote=TRUE)), capture.output(write.csv(DF))) # fwrite showProgress test 1735. Turned off as too long/big for CRAN. if (FALSE) { N = 6e8 # apx 6GB DT = data.table(C1=sample(100000,N,replace=TRUE), C2=sample(paste0(LETTERS,LETTERS,LETTERS), N, replace=TRUE)) gc() d = "/dev/shm/" # and d = "/tmp/" f = paste0(d,"test.txt") system.time(fwrite(DT, f, nThread=1)) file.info(f)$size/1024^3 unlink(f) # ensure progress meter itself isn't taking time; e.g. 
too many calls to time() or clock() system.time(fwrite(DT, f, showProgress=FALSE, nThread=1)) system.time(fwrite(DT, f, nThread=2)) system.time(fwrite(DT, f, nThread=4)) system.time(fwrite(DT, f, verbose=TRUE)) f2 = paste0(d,"test2.txt") system.time(fwrite(DT, f2, verbose=TRUE)) # test 'No space left on device' unlink(f) unlink(f2) system.time(fwrite(DT, f2)) # try again, should work now space free'd file.info(f2)$size/1024^3 unlink(f2) } # list columns and sep2 set.seed(1) DT = data.table(A=1:4, B=list(1:10,15:18,7,9:10), C=list(letters[19:23],c(1.2,2.3,3.4,pi,-9),c("foo","bar"),c(TRUE,TRUE,FALSE))) test(1736.1, capture.output(fwrite(DT)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w", "2,15|16|17|18,1.2|2.3|3.4|3.14159265358979|-9", "3,7,foo|bar", "4,9|10,TRUE|TRUE|FALSE")) test(1736.2, fwrite(DT, sep2=","), error="length(sep2)") test(1736.3, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must all be different") test(1736.4, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2") test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"))), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", "2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)")) test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)), c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}")) DT = data.table(A=c("foo","ba|r","baz")) test(1736.7, capture.output(fwrite(DT)), c("A","foo","ba|r","baz")) # no list column so no need to quote DT = data.table(A=c("foo","ba|r","baz"), B=list(1:3,1:4,c("fo|o","ba,r","baz"))) # now list column and need to quote test(1736.8, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz")) test(1736.9, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\"")) # any 
list of same length vector input test(1737.1, fwrite(list()), NULL, warning="fwrite was passed an empty list of no columns") test(1737.2, fwrite(list(1.2)), output="1.2") test(1737.3, fwrite(list(1.2,B="foo")), output=",B1.2,foo") test(1737.4, fwrite(list("A,Name"=1.2,B="fo,o")), output="\"A,Name\",B1.2,\"fo,o\"") test(1737.5, fwrite(list(1.2,B=c("foo","bar"))), error="Column 2's length (2) is not the same as column 1's length (1)") # fwrite ITime, Date, IDate DT = data.table(A=as.ITime(c("23:59:58","23:59:59","12:00:00","00:00:01",NA,"00:00:00"))) test(1738.1, capture.output(fwrite(DT)), c("A","23:59:58","23:59:59","12:00:00","00:00:01","","00:00:00")) test(1738.2, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na=""))) dts = c("1901-05-17","1907-10-22","1929-10-24","1962-05-28","1987-10-19","2008-09-15", "1968-12-30","1968-12-31","1969-01-01","1969-01-02") DT = data.table(A=as.Date(dts), B=as.IDate(dts)) test(1738.3, sapply(DT,typeof), c(A="double",B="integer")) test(1738.4, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) test(1738.5, as.integer(as.Date(c("0000-03-01","9999-12-31"))), c(-719468L,2932896L)) if (FALSE) { # Full range takes too long for CRAN. 
dts = seq.Date(as.Date("0000-03-01"),as.Date("9999-12-31"),by="day") dtsCh = as.character(dts) # 36s dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000 test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31"))) } else { # test on CRAN a reduced but important range dts = seq.Date(as.Date("1899-12-31"),as.Date("2100-01-01"),by="day") dtsCh = as.character(dts) test(1739.1, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01"))) } DT = data.table(A=dts, B=as.IDate(dts)) test(1739.2, sapply(DT,typeof), c(A="double",B="integer")) test(1739.3, typeof(dts), "double") f = tempfile() g = tempfile() # Full range fwrite(DT,f) # 0.092s write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s test(1739.4, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=","))) test(1739.5, readLines(f), readLines(g)) unlink(f) unlink(g) # dateTimeAs DT = data.table( A = as.Date(d<-c("1907-10-21","1907-10-22","1907-10-22","1969-12-31","1970-01-01","1970-01-01", "1972-02-29","1999-12-31","2000-02-29","2016-09-12")), B = as.IDate(d), C = as.ITime(t<-c("23:59:59","00:00:00","00:00:01", "23:59:58", "00:00:00","00:00:01", "12:00:00", "01:23:45", "23:59:59","01:30:30")), D = as.POSIXct(dt<-paste(d,t), tz="UTC"), E = as.POSIXct(paste0(dt,c(".999",".0",".5",".111112",".123456",".023",".0",".999999",".99",".0009")), tz="UTC")) test(1740.1, fwrite(DT,dateTimeAs="iso"), error="dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") test(1740.2, capture.output(fwrite(DT,dateTimeAs="ISO")), c( "A,B,C,D,E", "1907-10-21,1907-10-21,23:59:59,1907-10-21T23:59:59Z,1907-10-21T23:59:59.999Z", "1907-10-22,1907-10-22,00:00:00,1907-10-22T00:00:00Z,1907-10-22T00:00:00Z", "1907-10-22,1907-10-22,00:00:01,1907-10-22T00:00:01Z,1907-10-22T00:00:01.500Z", "1969-12-31,1969-12-31,23:59:58,1969-12-31T23:59:58Z,1969-12-31T23:59:58.111112Z", "1970-01-01,1970-01-01,00:00:00,1970-01-01T00:00:00Z,1970-01-01T00:00:00.123456Z", 
"1970-01-01,1970-01-01,00:00:01,1970-01-01T00:00:01Z,1970-01-01T00:00:01.023Z", "1972-02-29,1972-02-29,12:00:00,1972-02-29T12:00:00Z,1972-02-29T12:00:00Z", "1999-12-31,1999-12-31,01:23:45,1999-12-31T01:23:45Z,1999-12-31T01:23:45.999999Z", "2000-02-29,2000-02-29,23:59:59,2000-02-29T23:59:59Z,2000-02-29T23:59:59.990Z", "2016-09-12,2016-09-12,01:30:30,2016-09-12T01:30:30Z,2016-09-12T01:30:30.000900Z")) test(1740.3, capture.output(fwrite(DT,dateTimeAs="squash")), c( "A,B,C,D,E", "19071021,19071021,235959,19071021235959000,19071021235959999", "19071022,19071022,000000,19071022000000000,19071022000000000", "19071022,19071022,000001,19071022000001000,19071022000001500", "19691231,19691231,235958,19691231235958000,19691231235958111", "19700101,19700101,000000,19700101000000000,19700101000000123", "19700101,19700101,000001,19700101000001000,19700101000001023", "19720229,19720229,120000,19720229120000000,19720229120000000", "19991231,19991231,012345,19991231012345000,19991231012345999", "20000229,20000229,235959,20000229235959000,20000229235959990", "20160912,20160912,013030,20160912013030000,20160912013030000")) test(1740.4, capture.output(fwrite(DT,dateTimeAs="epoch")), c( "A,B,C,D,E", "-22718,-22718,86399,-1962748801,-1962748800.001", "-22717,-22717,0,-1962748800,-1962748800", "-22717,-22717,1,-1962748799,-1962748798.5", "-1,-1,86398,-2,-1.888888", "0,0,0,0,0.123456", "0,0,1,1,1.023", "789,789,43200,68212800,68212800", "10956,10956,5025,946603425,946603425.999999", "11016,11016,86399,951868799,951868799.99", "17056,17056,5430,1473643830,1473643830.0009")) test(1741.1, attr(DT[[4]],"tzone"), "UTC") test(1741.2, attr(DT[[5]],"tzone"), "UTC") # Remove tzone attribute to make write.csv write in local time. # That local time will vary on the boxes this test runs on, so we just compare to # write.csv rather than fixed strings as above. 
setattr(DT[[4]], "tzone", NULL) setattr(DT[[5]], "tzone", NULL) if (base::getRversion() >= "3.0.2") { # "format() now supports digits = 0, to display nsmall decimal places." old = options(digits.secs=0) test(1741.3, x1<-capture.output(fwrite(DT,dateTimeAs="write.csv")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=3) test(1741.4, x2<-capture.output(fwrite(DT,dateTimeAs="write.csv")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(digits.secs=6) test(1741.5, x3<-capture.output(fwrite(DT,dateTimeAs="write.csv")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) options(old) # check that extra digits made it into output test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3))) } # > 1e6 columns (there used to be VLAs at C level that caused stack overflow), #1903 set.seed(1) L = lapply(1:1e6, sample, x=100, size=2) x = capture.output(fwrite(L)) test(1742.1, nchar(x), c(2919861L, 2919774L)) # tests 2 very long lines, too test(1742.2, substring(x,1,10), c("27,58,21,9","38,91,90,6")) test(1742.3, L[[1L]], c(27L,38L)) test(1742.4, L[[1000000L]], c(76L, 40L)) test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40")) options(oldverbose) # last capture.output(fwrite()) has happened now. TODO: tidy up and remove. 
# fread should properly handle NA in colClasses argument #1910 test(1743.1, sapply(fread("a,b\n1,a", colClasses=c(NA, "factor"), verbose=TRUE), class), c(a="integer", b="factor"), output="Argument colClasses is ignored as requested by provided NA value") test(1743.2, sapply(fread("a,b\n1,a", colClasses=c(NA, NA), verbose=TRUE), class), c(a="integer", b="character"), output="Argument colClasses is ignored as requested by provided NA values") test(1743.3, fread("a,b\n1,a", colClasses=c(NA, TRUE)), error="when colClasses is logical it must be all NA") # also unknown issue in mixed character/factor output and colClasses vector test(1743.4, sapply(fread("a,b\n1,a", colClasses=c("character", "factor")), class), c(a="character", b="factor")) # rolling join stopped working for double with fractions, #1904 DT = data.table(A=c(1999.917,2000.417,2000.917,2001.417,2001.917)) setkey(DT,A) x = c(2000.167,2000.417,2000.667,2000.917,2001.167) test(1744.1, DT[.(x),roll=FALSE,which=TRUE], INT(NA,2,NA,3,NA)) test(1744.2, DT[.(x),roll=TRUE, which=TRUE], INT(1,2,2,3,3)) test(1744.3, DT[.(x),roll=1/12, which=TRUE], INT(NA,2,NA,3,NA)) # 0's at the end of a non-empty subset of empty DT, #1937 test(1745.1, data.table(a=character(0))[c(1,0)], data.table(a=NA_character_)) test(1745.2, data.table(a=numeric(0))[c(1,0)], data.table(a=NA_real_)) test(1745.3, data.table(a=integer(0))[c(1,0)], data.table(a=NA_integer_)) # Long standing crash when by=.EACHI, nomatch=0, the first item in i has no match # AND j has function call that is passed a key column, #1933. 
DT = data.table(A=letters[1:5],B=1:5,key="A") ids = c("p","q","r","c","s","d") test(1746.1, DT[ids, A, by=.EACHI, nomatch=0], data.table(A=c("c","d"),A=c("c","d"))) # was always ok test(1746.2, DT[ids, print(A), by=.EACHI, nomatch=0], # reliable crash in v1.9.6 and v1.9.8 data.table(A=character(0)), output="\"c\".*\"d\"") test(1746.3, DT[ids, {print(A);A}, by=.EACHI, nomatch=0], # reliable crash in v1.9.6 and v1.9.8 data.table(A=c("c","d"),V1=c("c","d")), output="\"c\".*\"d\"") # combining on= with by= and keyby=, #1943 freshDT = data.table(x = rep(c("a", "b"), each = 4), y = 1:0, z = c(3L, 6L, 8L, 5L, 4L, 1L, 2L, 7L)) DT = copy(freshDT) test(1747.1, DT["b", max(z), by = y, on = "x"], ans1<-data.table(y=1:0, V1=c(4L,7L))) test(1747.2, DT["b", max(z), keyby = y, on = "x"], ans2<-data.table(y=0:1, V1=c(7L,4L), key="y")) test(1747.3, DT[x=="b", max(z), by = y], ans1) test(1747.4, DT[x=="b", max(z), keyby = y], ans2) DT = copy(freshDT) # to clear any auto indexes test(1747.5, DT[x=="b", max(z), by = y], ans1) test(1747.6, DT[x=="b", max(z), keyby = y], ans2) setkey(DT, x) test(1747.7, DT["b", max(z), by = y], ans1) test(1747.8, DT["b", max(z), keyby = y], ans2) DT = copy(freshDT) # and agin without the == having run before the setkey setkey(DT, x) test(1747.9, DT["b", max(z), by = y], ans1) test(1747.11, DT["b", max(z), keyby = y], ans2) DT = as.data.table(mtcars[mtcars$cyl %in% c(6, 8), c("am", "vs", "hp")]) test(1748.1, DT[.(0), max(hp), by = vs, on = "am"], ans1<-data.table(vs=c(1,0), V1=c(123,245))) test(1748.2, DT[.(0), max(hp), keyby = vs, on = "am"], ans2<-data.table(vs=c(0,1), V1=c(245,123), key="vs")) DT = as.data.table(mtcars[mtcars$cyl %in% c(6, 8), c("am", "vs", "hp")]) test(1748.3, DT[am==0, max(hp), by=vs], ans1) test(1748.4, DT[am==0, max(hp), keyby=vs], ans2) # indices() can return list of vectors, #1589 DT = data.table(A=5:1,B=letters[5:1]) setindex(DT) setindex(DT, A) setindex(DT, B) indices(DT, vectors = TRUE) test(1749.1, indices(DT), 
c("A__B","A","B")) test(1749.2, indices(DT, vectors = TRUE), list(c("A","B"),"A","B")) # for completeness, added test for NA problem to close #1837. Fixed long ago before release to CRAN. test(1750, capture.output(fwrite(data.table(x=NA_integer_),verbose=FALSE)), c("x","")) if ("package:nanotime" %in% search()) { DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", "2016-09-29T23:59:00.000000001Z", "2016-09-29T23:59:00.000000999Z", "1970-01-01T00:01:01.000001000Z", "1970-01-01T00:00:00.000000000Z", "1969-12-31T23:59:59.999999999Z", "1969-12-31T23:59:59.000000089Z", "1969-12-31T12:13:14.000000000Z", "1969-12-31T12:13:14.999999999Z", "1969-12-31T12:13:14.000000001Z", "1967-03-15T00:00:00.300000002Z", "1967-03-15T23:59:59.300000002Z"))) test(1751, capture.output(fwrite(DT, verbose=FALSE))[-1], tt) } else { cat("Test 1751 not run. If required call library(nanotime) first.\n") } ########################## # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time. # TO DO: Add test for fixed bug #5519 - dcast returned error when a package imported data.table, but dint happen when "depends" on data.table. This is fixed (commit 1263 v1.9.3), but not sure how to add test. # TO DO: test and highlight in docs that negatives are fine and fast in forderv (ref R wish #15644) # TO DO: tests of freading classes like Date and the verbose messages there. # TO DO: Test mid read bump of logical T/F to character, collapse back to T and F. # TO DO: add examples of multiple LHS (name and position) and multiple RHS to example(":=") # TO DO: tests on double in add hoc by # TO DO: test on -i that retain key e.g. DT[-4] and DT[-4,sum(v),by=b] should both retain key # test on out of bound i subsets e.g. 6:10 when DT has 7 rows, and mixed negative and positive i integer is error. 
# test that ordered subsets when i is unkeyed now retain x's key (using is.sorted(f__)) # TO DO: add FAQ that eval() is evaled in calling frame so don't need a, then update SO question of 14 March. See the test using variable name same as column name. Actually, is that true? Need "..J". # TO DO: why did SO answer using eval twice in j need .SD in lapply(f,eval,.SD) on 19 Apr # TO DO: change all 1 to 1L internally (done in data.table.R, other .R to do) # TO DO: check the "j is named list could be inefficient" message from verbose than Chris N showed recently to 15 May # TO DO: !make sure explicitly that unnamed lists are being executed by dogroups! # TO DO: Add to warning about a previous copy that class<-, levels<- can also copy whole vector. *Any* fun<- form basically. # TO DO: use looped := vs set test in example(":=") or example(setnames) to test overhead in [.data.table is tested to stay low in future. # TO DO: add tests on smaller examples with NAs for 'frankv', even though can't compare to base::rank. ## See test-* for more tests ########################## options(warn=0) setDTthreads(0) options(oldalloccol) # set at top of this file plat = paste("endian==",.Platform$endian,", sizeof(long double)==",.Machine$sizeof.longdouble, ", sizeof(pointer)==",.Machine$sizeof.pointer, sep="") if (nfail > 0) { if (nfail>1) {s1="s";s2="s: "} else {s1="";s2=" "} cat("\r") stop(nfail," error",s1," out of ",ntest, " (lastID=",lastnum,", ",plat, ") in inst/tests/tests.Rraw on ",date(),". 
Search tests.Rraw for test number",s2,paste(whichfail,collapse=", "),".") # important to stop() here, so that 'R CMD check' fails } cat("\n",plat,"\n\nAll ",ntest," tests in inst/tests/tests.Rraw completed ok in ",timetaken(started.at)," on ",date(),"\n",sep="") # date() is included so we can tell when CRAN checks were run (in particular if they have been rerun since # an update to Rdevel itself; data.table doesn't have any other dependency) since there appears to be no other # way to see the timestamp that CRAN checks were run. Some CRAN machines lag by several days. data.table/inst/tests/russellCRLF.csv0000644000175100001440000000227213172210047017263 0ustar hornikusers"Index Name","Date","Value Without Dividends","Value With Dividends" "Russell Microcap® Value Index","06/30/2000",395.77,356.90 "Russell Microcap® Value Index","07/03/2000",397.94,359.39 "Russell Microcap® Value Index","07/05/2000",396.27,357.91 "Russell Microcap® Value Index","07/06/2000",398.75,360.17 "Russell Microcap® Value Index","07/07/2000",399.20,360.58 "Russell Microcap® Value Index","07/10/2000",400.47,361.73 "Russell Microcap® Value Index","07/11/2000",400.67,361.91 "Russell Microcap® Value Index","07/12/2000",404.90,365.79 "Russell Microcap® Value Index","07/13/2000",406.70,367.43 "Russell Microcap® Value Index","07/14/2000",407.68,368.32 "Russell Microcap® Value Index","07/17/2000",408.67,369.22 "Russell Microcap® Value Index","07/18/2000",407.84,368.48 "Russell Microcap® Value Index","07/19/2000",404.81,365.75 "Russell Microcap® Value Index","07/20/2000",406.55,367.35 "Russell Microcap® Value Index","07/21/2000",402.66,363.85 "Russell Microcap® Value Index","07/24/2000",398.75,360.32 "Russell Microcap® Value Index","07/25/2000",399.36,360.88 "Russell Microcap® Value Index","07/26/2000",400.21,361.66 "Russell Microcap® Value Index","07/27/2000",396.02,357.97 data.table/inst/tests/536_fread_fill_4.txt0000644000175100001440000000022013172210047020045 0ustar hornikusersa,b,c 1,2,qq 1,2,qq 
1,2,qq 1,2,qq 4,5 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1 1 1,2,qq 1,2,qq 1,2,qq 1,2,er data.table/inst/tests/2008head.csv0000644000175100001440000013521213172210047016337 0ustar hornikusersYear,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay 2008,1,3,4,2003,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,754,735,1002,1000,WN,3231,N772SW,128,145,113,2,19,IAD,TPA,810,5,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,628,620,804,750,WN,448,N428WN,96,90,76,14,8,IND,BWI,515,3,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,926,930,1054,1100,WN,1746,N612SW,88,90,78,-6,-4,IND,BWI,515,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1829,1755,1959,1925,WN,3920,N464WN,90,90,77,34,34,IND,BWI,515,3,10,0,,0,2,0,0,0,32 2008,1,3,4,1940,1915,2121,2110,WN,378,N726SW,101,115,87,11,25,IND,JAX,688,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1937,1830,2037,1940,WN,509,N763SW,240,250,230,57,67,IND,LAS,1591,3,7,0,,0,10,0,0,0,47 2008,1,3,4,1039,1040,1132,1150,WN,535,N428WN,233,250,219,-18,-1,IND,LAS,1591,7,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,617,615,652,650,WN,11,N689SW,95,95,70,2,2,IND,MCI,451,6,19,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1620,1620,1639,1655,WN,810,N648SW,79,95,70,-16,0,IND,MCI,451,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,706,700,916,915,WN,100,N690SW,130,135,106,1,6,IND,MCO,828,5,19,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1644,1510,1845,1725,WN,1333,N334SW,121,135,107,80,94,IND,MCO,828,6,8,0,,0,8,0,0,0,72 2008,1,3,4,1426,1430,1426,1425,WN,829,N476WN,60,55,39,1,-4,IND,MDW,162,9,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,715,715,720,710,WN,1016,N765SW,65,55,37,10,0,IND,MDW,162,7,21,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1702,1700,1651,1655,WN,1827,N420WN,49,55,35,-4,2,IND,MDW,162,4,10,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1029,1020,1021,1010,WN,2272,N263WN,52,50,37,11,9,IND,MDW,162,6,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1452,1425,1640,1625,WN,675,N286WN,228,240,213,15,27,IND,PHX,1489,7,8,0,,0,3,0,0,0,12 2008,1,3,4,754,745,940,955,WN,1144,N778SW,226,250,205,-15,9,IND,PHX,1489,5,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1323,1255,1526,1510,WN,4,N674AA,123,135,110,16,28,IND,TPA,838,4,9,0,,0,0,0,0,0,16 2008,1,3,4,1416,1325,1512,1435,WN,54,N643SW,56,70,49,37,51,ISP,BWI,220,2,5,0,,0,12,0,0,0,25 2008,1,3,4,706,705,807,810,WN,68,N497WN,61,65,51,-3,1,ISP,BWI,220,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1657,1625,1754,1735,WN,623,N724SW,57,70,47,19,32,ISP,BWI,220,5,5,0,,0,7,0,0,0,12 2008,1,3,4,1900,1840,1956,1950,WN,717,N786SW,56,70,49,6,20,ISP,BWI,220,2,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1039,1030,1133,1140,WN,1244,N714CB,54,70,47,-7,9,ISP,BWI,220,2,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,801,800,902,910,WN,2101,N222WN,61,70,53,-8,1,ISP,BWI,220,3,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1520,1455,1619,1605,WN,2553,N394SW,59,70,50,14,25,ISP,BWI,220,2,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1422,1255,1657,1610,WN,188,N215WN,155,195,143,47,87,ISP,FLL,1093,6,6,0,,0,40,0,0,0,7 2008,1,3,4,1954,1925,2239,2235,WN,1754,N243WN,165,190,155,4,29,ISP,FLL,1093,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,636,635,921,945,WN,2275,N454WN,165,190,147,-24,1,ISP,FLL,1093,5,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,734,730,958,1020,WN,550,N712SW,324,350,314,-22,4,ISP,LAS,2283,2,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2107,1945,2334,2230,WN,362,N798SW,147,165,134,64,82,ISP,MCO,972,6,7,0,,0,5,0,0,0,59 2008,1,3,4,1008,1005,1234,1255,WN,543,N736SA,146,170,135,-21,3,ISP,MCO,972,5,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,712,710,953,1000,WN,1112,N795SW,161,170,142,-7,2,ISP,MCO,972,5,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1312,1300,1546,1550,WN,1397,N247WN,154,170,140,-4,12,ISP,MCO,972,7,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1449,1430,1715,1720,WN,3398,N707SA,146,170,134,-5,19,ISP,MCO,972,6,6,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1634,1555,1859,1845,WN,3480,N443WN,145,170,134,14,39,ISP,MCO,972,5,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,831,830,935,955,WN,300,N753SW,124,145,112,-20,1,ISP,MDW,765,5,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1812,1650,1927,1815,WN,422,N779SW,135,145,118,72,82,ISP,MDW,765,6,11,0,,0,3,0,0,0,69 2008,1,3,4,1127,1105,1235,1230,WN,1837,N704SW,128,145,114,5,22,ISP,MDW,765,9,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1424,1355,1531,1520,WN,2871,N709SW,127,145,113,11,29,ISP,MDW,765,8,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1326,1230,1559,1530,WN,1056,N459WN,153,180,143,29,56,ISP,PBI,1052,5,5,0,,0,0,0,0,0,29 2008,1,3,4,1749,1725,2019,2030,WN,2175,N621SW,150,185,138,-11,24,ISP,PBI,1052,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,726,720,958,1020,WN,3319,N206WN,152,180,140,-22,6,ISP,PBI,1052,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,646,640,929,955,WN,3667,N280WN,163,195,151,-26,6,ISP,RSW,1101,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1153,1140,1428,1440,WN,2006,N241WN,155,180,143,-12,13,ISP,TPA,1034,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1528,1510,1802,1810,WN,3858,N200WN,154,180,144,-8,18,ISP,TPA,1034,4,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,634,635,907,935,WN,3928,N459WN,153,180,142,-28,-1,ISP,TPA,1034,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,831,830,1148,1140,WN,534,N286WN,137,130,123,8,1,JAN,BWI,888,5,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1450,1435,1806,1745,WN,3244,N475WN,136,130,121,21,15,JAN,BWI,888,7,8,0,,0,0,0,6,0,15 2008,1,3,4,2245,1730,2354,1850,WN,186,N792SW,69,80,59,304,315,JAN,HOU,359,3,7,0,,0,282,0,0,0,22 2008,1,3,4,615,615,724,735,WN,971,N202WN,69,80,60,-11,0,JAN,HOU,359,2,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1150,1145,1303,1305,WN,2124,N646SW,73,80,63,-2,5,JAN,HOU,359,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2025,1940,2135,2100,WN,3154,N252WN,70,80,60,35,45,JAN,HOU,359,3,7,0,,0,26,0,0,0,9 2008,1,3,4,1038,945,1314,1225,WN,1035,N346SW,96,100,81,49,53,JAN,MCO,587,8,7,0,,0,7,0,0,0,42 2008,1,3,4,1900,1850,2123,2045,WN,205,N299WN,143,115,97,38,10,JAN,MDW,666,40,6,0,,0,1,0,28,0,9 
2008,1,3,4,700,700,851,900,WN,449,N528SW,111,120,99,-9,0,JAN,MDW,666,6,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,948,925,959,940,WN,3430,N487WN,71,75,59,19,23,JAX,BHM,365,3,9,0,,0,0,0,0,0,19 2008,1,3,4,646,620,725,655,WN,1580,N243WN,99,95,77,30,26,JAX,BNA,484,6,16,0,,0,26,0,4,0,0 2008,1,3,4,1110,1040,1136,1110,WN,2195,N479WN,86,90,72,26,30,JAX,BNA,484,5,9,0,,0,0,0,0,10,16 2008,1,3,4,1535,1535,1603,1610,WN,2804,N255WN,88,95,74,-7,0,JAX,BNA,484,5,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1919,1915,1942,1950,WN,3428,N215WN,83,95,71,-8,4,JAX,BNA,484,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1053,1055,1245,1240,WN,433,N264LV,112,105,96,5,-2,JAX,BWI,663,7,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1433,1440,1623,1625,WN,1331,N714CB,110,105,95,-2,-7,JAX,BWI,663,2,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2015,2010,2158,2155,WN,3504,N436WN,103,105,91,3,5,JAX,BWI,663,5,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2139,2130,2244,2240,WN,378,N726SW,65,70,54,4,9,JAX,FLL,318,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1500,1500,1602,1615,WN,640,N399WN,62,75,49,-13,0,JAX,FLL,318,5,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,850,850,1000,1000,WN,1396,N387SW,70,70,51,0,0,JAX,FLL,318,10,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,646,645,752,755,WN,2189,N405WN,66,70,46,-3,1,JAX,FLL,318,13,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1221,1220,1328,1330,WN,3312,N685SW,67,70,54,-2,1,JAX,FLL,318,5,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1738,1730,1841,1840,WN,3948,N467WN,63,70,49,1,8,JAX,FLL,318,6,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1813,1735,1936,1905,WN,54,N643SW,143,150,125,31,38,JAX,HOU,816,6,12,0,,0,11,0,0,0,20 2008,1,3,4,802,750,1001,955,WN,2272,N263WN,119,125,104,6,12,JAX,IND,688,7,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1820,1825,1946,1955,WN,549,N363SW,86,90,75,-9,-5,JAX,ORF,543,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,821,820,953,945,WN,3604,N257WN,92,85,80,8,1,JAX,ORF,543,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1734,1650,1941,1905,WN,23,N521SW,127,135,113,36,44,JAX,PHL,742,4,10,0,,0,3,0,0,0,33 2008,1,3,4,712,700,926,915,WN,1232,N663SW,134,135,120,11,12,JAX,PHL,742,5,9,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1318,1310,1410,1400,WN,977,N376SW,52,50,39,10,8,JAX,TPA,180,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,958,900,1052,950,WN,1574,N791SW,54,50,36,62,58,JAX,TPA,180,4,14,0,,0,0,0,4,0,58 2008,1,3,4,1859,1850,1950,1945,WN,2019,N392SW,51,55,38,5,9,JAX,TPA,180,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1538,1445,1753,1710,WN,500,N799SW,75,85,60,43,53,LAS,ABQ,487,4,11,0,,0,15,0,0,0,28 2008,1,3,4,933,935,1151,1200,WN,778,N607SW,78,85,63,-9,-2,LAS,ABQ,487,5,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2248,2125,102,2345,WN,890,N618WN,74,80,60,77,83,LAS,ABQ,487,4,10,0,,0,7,0,0,0,70 2008,1,3,4,1327,1230,1550,1500,WN,1171,N682SW,83,90,65,50,57,LAS,ABQ,487,3,15,0,,0,50,0,0,0,0 2008,1,3,4,624,625,846,850,WN,1320,N456WN,82,85,61,-4,-1,LAS,ABQ,487,4,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1614,1600,1833,1825,WN,1925,N509SW,79,85,60,8,14,LAS,ABQ,487,4,15,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1917,1915,2136,2140,WN,2457,N293,79,85,60,-4,2,LAS,ABQ,487,5,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1832,1655,148,30,WN,302,N473WN,256,275,243,78,97,LAS,ALB,2237,3,10,0,,0,8,0,0,0,70 2008,1,3,4,1229,1155,1633,1555,WN,1079,N351SW,124,120,91,38,34,LAS,AMA,758,5,28,0,,0,3,0,4,0,31 2008,1,3,4,1256,1240,1724,1720,WN,155,N238WN,148,160,131,4,16,LAS,AUS,1090,3,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2118,2015,144,50,WN,2021,N499WN,146,155,127,54,63,LAS,AUS,1090,5,14,0,,0,23,0,0,0,31 2008,1,3,4,905,850,1334,1330,WN,3222,N309SW,149,160,135,4,15,LAS,AUS,1090,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1739,1640,114,25,WN,1018,N245WN,275,285,253,49,59,LAS,BDL,2298,9,13,0,,0,19,0,0,0,30 2008,1,3,4,906,905,1426,1430,WN,3948,N467WN,200,205,183,-4,1,LAS,BHM,1618,2,15,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,816,815,1339,1340,WN,249,N256WN,203,205,185,-1,1,LAS,BNA,1588,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1325,1240,1841,1810,WN,419,N275WN,196,210,178,31,45,LAS,BNA,1588,6,12,0,,0,2,0,0,0,29 2008,1,3,4,1506,1440,2030,2010,WN,2032,N271WN,204,210,183,20,26,LAS,BNA,1588,6,15,0,,0,18,0,0,0,2 
2008,1,3,4,2039,1930,155,55,WN,3940,N434WN,196,205,177,60,69,LAS,BNA,1588,5,14,0,,0,0,0,22,0,38 2008,1,3,4,924,920,1209,1210,WN,71,N312SW,105,110,84,-1,4,LAS,BOI,520,3,18,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1611,1535,1849,1825,WN,538,N619SW,98,110,73,24,36,LAS,BOI,520,3,22,0,,0,9,0,0,0,15 2008,1,3,4,1824,1715,117,25,WN,2383,N290WN,233,250,221,52,69,LAS,BUF,1987,2,10,0,,0,48,0,0,0,4 2008,1,3,4,826,825,930,925,WN,136,N493WN,64,60,46,5,1,LAS,BUR,223,2,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2118,2015,2224,2115,WN,219,N383SW,66,60,46,69,63,LAS,BUR,223,3,17,0,,0,17,0,6,0,46 2008,1,3,4,1818,1740,1916,1840,WN,391,N608SW,58,60,46,36,38,LAS,BUR,223,2,10,0,,0,20,0,0,0,16 2008,1,3,4,650,650,748,750,WN,670,N777QC,58,60,47,-2,0,LAS,BUR,223,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2146,2055,2250,2155,WN,815,N626SW,64,60,48,55,51,LAS,BUR,223,3,13,0,,0,4,0,4,0,47 2008,1,3,4,2241,1910,2340,2010,WN,1072,N369SW,59,60,43,210,211,LAS,BUR,223,3,13,0,,0,114,0,0,0,96 2008,1,3,4,1409,1355,1513,1500,WN,1328,N396SW,64,65,50,13,14,LAS,BUR,223,2,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1100,1050,1157,1155,WN,1586,N293,57,65,46,2,10,LAS,BUR,223,2,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1306,1250,1406,1355,WN,1838,N509SW,60,65,47,11,16,LAS,BUR,223,2,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1726,1630,1832,1740,WN,2284,N409WN,66,70,46,52,56,LAS,BUR,223,2,18,0,,0,1,0,0,0,51 2008,1,3,4,915,915,1017,1020,WN,2699,N278WN,62,65,46,-3,0,LAS,BUR,223,2,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1229,1220,1342,1325,WN,2874,N695SW,73,65,47,17,9,LAS,BUR,223,2,24,0,,0,9,0,8,0,0 2008,1,3,4,1459,1455,1556,1555,WN,3237,N284WN,57,60,42,1,4,LAS,BUR,223,2,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,908,845,1628,1610,WN,1774,N220WN,260,265,244,18,23,LAS,BWI,2106,6,10,0,,0,18,0,0,0,0 2008,1,3,4,1817,1730,122,50,WN,2632,N784SW,245,260,230,32,47,LAS,BWI,2106,5,10,0,,0,7,0,0,0,25 2008,1,3,4,1248,1245,2009,2015,WN,3759,N463WN,261,270,243,-6,3,LAS,BWI,2106,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,956,945,1658,1645,WN,818,N244WN,242,240,213,13,11,LAS,CLE,1825,8,21,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1849,1740,121,30,WN,564,N455WN,212,230,195,51,69,LAS,CMH,1772,2,15,0,,0,10,0,0,0,41 2008,1,3,4,1210,1200,1905,1850,WN,991,N297WN,235,230,201,15,10,LAS,CMH,1772,6,28,0,,0,10,0,5,0,0 2008,1,3,4,701,700,941,955,WN,85,N688SW,100,115,84,-14,1,LAS,DEN,629,9,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1047,1030,1328,1320,WN,157,N481WN,101,110,79,8,17,LAS,DEN,629,9,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2232,2115,108,5,WN,632,N203WN,96,110,77,63,77,LAS,DEN,629,6,13,0,,0,0,0,7,0,56 2008,1,3,4,1512,1315,1802,1610,WN,706,N491WN,110,115,88,112,117,LAS,DEN,629,8,14,0,,0,0,0,0,0,112 2008,1,3,4,2025,1955,2301,2245,WN,908,N480WN,96,110,78,16,30,LAS,DEN,629,6,12,0,,0,7,0,0,0,9 2008,1,3,4,1439,1425,1720,1720,WN,1582,N318SW,101,115,85,0,14,LAS,DEN,629,7,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1627,1600,1916,1900,WN,2907,N647SW,109,120,80,16,27,LAS,DEN,629,8,21,0,,0,2,0,0,0,14 2008,1,3,4,1745,1710,2017,1945,WN,2192,N313SW,92,95,78,32,35,LAS,ELP,584,2,12,0,,0,10,0,0,0,22 2008,1,3,4,1049,1040,1320,1320,WN,2280,N352SW,91,100,73,0,9,LAS,ELP,584,3,15,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1117,1050,1325,1325,WN,1636,N321SW,128,155,116,0,27,LAS,GEG,806,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1725,1620,1940,1850,WN,2803,N660SW,135,150,116,50,65,LAS,GEG,806,4,15,0,,0,50,0,0,0,0 2008,1,3,4,1824,1735,2303,2230,WN,719,N414WN,159,175,143,33,49,LAS,HOU,1235,3,13,0,,0,12,0,0,0,21 2008,1,3,4,1150,1140,1644,1640,WN,1776,N408WN,174,180,147,4,10,LAS,HOU,1235,3,24,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,749,740,1227,1235,WN,3244,N475WN,158,175,146,-8,9,LAS,HOU,1235,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2051,2010,134,55,WN,3746,N321SW,163,165,150,39,41,LAS,HOU,1235,4,9,0,,0,17,0,0,0,22 2008,1,3,4,1555,1525,2307,2245,WN,1027,N230WN,252,260,232,22,30,LAS,IAD,2066,7,13,0,,0,6,0,0,0,16 2008,1,3,4,2255,1820,509,55,WN,1924,N761RR,194,215,176,254,275,LAS,IND,1591,9,9,0,,0,0,0,0,8,246 2008,1,3,4,1129,1050,1757,1725,WN,3920,N464WN,208,215,179,32,39,LAS,IND,1591,8,21,0,,0,32,0,0,0,0 
2008,1,3,4,1602,1510,2357,2255,WN,3144,N272WN,295,285,269,62,52,LAS,ISP,2283,4,22,0,,0,13,0,10,0,39 2008,1,3,4,1738,1715,1838,1820,WN,82,N499WN,60,65,42,18,23,LAS,LAX,236,6,12,0,,0,0,0,0,12,6 2008,1,3,4,2207,2150,2306,2255,WN,135,N244WN,59,65,40,11,17,LAS,LAX,236,7,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1851,1825,2004,1930,WN,317,N335SW,73,65,41,34,26,LAS,LAX,236,16,16,0,,0,18,0,8,0,8 2008,1,3,4,701,645,802,745,WN,337,N488WN,61,60,42,17,16,LAS,LAX,236,7,12,0,,0,16,0,1,0,0 2008,1,3,4,1556,1455,1704,1600,WN,354,N496WN,68,65,41,64,61,LAS,LAX,236,9,18,0,,0,61,0,3,0,0 2008,1,3,4,1254,1250,1404,1355,WN,784,N401WN,70,65,43,9,4,LAS,LAX,236,7,20,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,840,840,946,945,WN,790,N351SW,66,65,43,1,0,LAS,LAX,236,11,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1540,1525,1648,1630,WN,1061,N422WN,68,65,47,18,15,LAS,LAX,236,5,16,0,,0,10,0,3,0,5 2008,1,3,4,750,750,901,855,WN,1653,N288WN,71,65,52,6,0,LAS,LAX,236,10,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1228,1200,1340,1310,WN,1928,N792SW,72,70,43,30,28,LAS,LAX,236,7,22,0,,0,0,0,2,0,28 2008,1,3,4,2047,2005,2209,2115,WN,2367,N352SW,82,70,45,54,42,LAS,LAX,236,7,30,0,,0,15,0,12,0,27 2008,1,3,4,2158,2120,2303,2225,WN,2896,N335SW,65,65,44,38,38,LAS,LAX,236,11,10,0,,0,13,0,0,0,25 2008,1,3,4,1028,1025,1132,1135,WN,3655,N465WN,64,70,45,-3,3,LAS,LAX,236,8,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1423,1420,1530,1525,WN,3917,N609SW,67,65,44,5,3,LAS,LAX,236,11,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1147,1100,1541,1500,WN,2382,N404WN,114,120,90,41,47,LAS,LBB,775,4,20,0,,0,25,0,0,0,16 2008,1,3,4,859,850,1343,1345,WN,565,N787SA,164,175,148,-2,9,LAS,LIT,1295,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1140,1110,1537,1510,WN,43,N617SW,117,120,95,27,30,LAS,MAF,796,2,20,0,,0,27,0,0,0,0 2008,1,3,4,1226,1210,1721,1700,WN,213,N302SW,175,170,139,21,16,LAS,MCI,1140,4,32,0,,0,1,0,5,0,15 2008,1,3,4,2118,2005,154,50,WN,411,N763SW,156,165,131,64,73,LAS,MCI,1140,7,18,0,,0,14,0,0,0,50 2008,1,3,4,2003,1905,26,2350,WN,1004,N676SW,143,165,130,36,58,LAS,MCI,1140,5,8,0,,0,4,0,0,0,32 
2008,1,3,4,1622,1510,2059,2000,WN,1405,N775SW,157,170,130,59,72,LAS,MCI,1140,4,23,0,,0,59,0,0,0,0 2008,1,3,4,648,650,1123,1135,WN,2972,N634SW,155,165,137,-12,-2,LAS,MCI,1140,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1003,940,1441,1430,WN,3068,N449WN,158,170,136,11,23,LAS,MCI,1140,5,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1319,1300,2033,2025,WN,380,N444WN,254,265,234,8,19,LAS,MCO,2039,6,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1506,1500,2122,2030,WN,74,N764SW,256,210,177,52,6,LAS,MDW,1521,62,17,0,,0,6,0,46,0,0 2008,1,3,4,648,645,1205,1210,WN,227,N723SW,197,205,182,-5,3,LAS,MDW,1521,5,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1052,1050,1603,1620,WN,335,N712SW,191,210,175,-17,2,LAS,MDW,1521,6,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1424,1415,1941,1945,WN,396,N407WN,197,210,177,-4,9,LAS,MDW,1521,8,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1211,1145,1750,1720,WN,693,N232WN,219,215,175,30,26,LAS,MDW,1521,23,21,0,,0,0,0,4,0,26 2008,1,3,4,814,755,1334,1320,WN,823,N725SW,200,205,181,14,19,LAS,MDW,1521,6,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,934,935,1453,1505,WN,830,N436WN,199,210,177,-12,-1,LAS,MDW,1521,6,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1708,1700,2214,2230,WN,1865,N745SW,186,210,169,-16,8,LAS,MDW,1521,6,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1919,1845,31,5,WN,2025,N740SW,192,200,171,26,34,LAS,MDW,1521,8,13,0,,0,4,0,0,0,22 2008,1,3,4,1945,1815,52,2335,WN,2450,N405WN,187,200,168,77,90,LAS,MDW,1521,8,11,0,,0,13,0,0,0,64 2008,1,3,4,2057,1930,202,50,WN,2794,N468WN,185,200,171,72,87,LAS,MDW,1521,7,7,0,,0,60,0,0,0,12 2008,1,3,4,1354,1310,1907,1845,WN,3232,N201LV,193,215,177,22,44,LAS,MDW,1521,6,10,0,,0,4,0,0,0,18 2008,1,3,4,1703,1645,33,35,WN,767,N222WN,270,290,257,-2,18,LAS,MHT,2356,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1030,1030,1545,1545,WN,762,N295WN,195,195,174,0,0,LAS,MSY,1501,3,18,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1903,1825,5,2340,WN,3327,N282WN,182,195,166,25,38,LAS,MSY,1501,4,12,0,,0,3,0,0,0,22 2008,1,3,4,1108,1110,1244,1240,WN,125,N782SA,96,90,80,4,-2,LAS,OAK,407,4,12,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,NA,700,NA,830,WN,126,,NA,90,NA,NA,NA,LAS,OAK,407,NA,NA,1,A,0,NA,NA,NA,NA,NA 2008,1,3,4,1354,1345,1525,1520,WN,127,N784SW,91,95,74,5,9,LAS,OAK,407,4,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1944,1740,2110,1915,WN,223,N223WN,86,95,72,115,124,LAS,OAK,407,5,9,0,,0,21,0,0,0,94 2008,1,3,4,1251,1205,1425,1345,WN,237,N281WN,94,100,76,40,46,LAS,OAK,407,3,15,0,,0,2,0,0,0,38 2008,1,3,4,1548,1505,1728,1635,WN,280,N460WN,100,90,78,53,43,LAS,OAK,407,4,18,0,,0,14,0,10,0,29 2008,1,3,4,821,820,955,950,WN,864,N640SW,94,90,72,5,1,LAS,OAK,407,3,19,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2216,2125,2348,2255,WN,900,N679AA,92,90,78,53,51,LAS,OAK,407,4,10,0,,0,11,0,2,0,40 2008,1,3,4,2010,1900,2203,2030,WN,962,N378SW,113,90,77,93,70,LAS,OAK,407,28,8,0,,0,45,0,23,0,25 2008,1,3,4,1433,1420,1608,1555,WN,1011,N248WN,95,95,79,13,13,LAS,OAK,407,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,935,925,1106,1055,WN,1258,N774SW,91,90,71,11,10,LAS,OAK,407,3,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2156,2155,2334,2325,WN,1950,N408WN,98,90,83,9,1,LAS,OAK,407,5,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2035,2020,2209,2150,WN,3135,N695SW,94,90,81,19,15,LAS,OAK,407,3,10,0,,0,15,0,4,0,0 2008,1,3,4,602,600,733,730,WN,3233,N354SW,91,90,76,3,2,LAS,OAK,407,4,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1152,1145,1616,1605,WN,86,N627SW,144,140,116,11,7,LAS,OKC,987,4,24,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1435,1315,1859,1755,WN,479,N622SW,144,160,132,64,80,LAS,OMA,1099,3,9,0,,0,38,0,0,0,26 2008,1,3,4,2049,1920,106,2355,WN,875,N217JC,137,155,125,71,89,LAS,OMA,1099,2,10,0,,0,32,0,0,0,39 2008,1,3,4,913,910,1342,1350,WN,2111,N673AA,149,160,131,-8,3,LAS,OMA,1099,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1613,1610,1721,1705,WN,187,N690SW,68,55,38,16,3,LAS,ONT,197,4,26,0,,0,0,0,16,0,0 2008,1,3,4,2015,1945,2101,2040,WN,725,N259WN,46,55,34,21,30,LAS,ONT,197,4,8,0,,0,3,0,0,0,18 2008,1,3,4,655,655,742,750,WN,731,N244WN,47,55,35,-8,0,LAS,ONT,197,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1438,1435,1533,1530,WN,872,N788SA,55,55,33,3,3,LAS,ONT,197,6,16,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1102,1030,1152,1125,WN,906,N301SW,50,55,36,27,32,LAS,ONT,197,3,11,0,,0,27,0,0,0,0 2008,1,3,4,2218,2120,2308,2210,WN,1493,N356SW,50,50,36,58,58,LAS,ONT,197,5,9,0,,0,0,0,20,0,38 2008,1,3,4,851,835,938,930,WN,1670,N742SW,47,55,34,8,16,LAS,ONT,197,5,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1230,1215,1331,1310,WN,2018,N313SW,61,55,36,21,15,LAS,ONT,197,4,21,0,,0,0,0,6,0,15 2008,1,3,4,1838,1800,1930,1855,WN,2730,N273WN,52,55,32,35,38,LAS,ONT,197,3,17,0,,0,0,0,0,0,35 2008,1,3,4,1358,1330,1449,1425,WN,3070,N660SW,51,55,37,24,28,LAS,ONT,197,4,10,0,,0,4,0,0,0,20 2008,1,3,4,1757,1655,108,25,WN,156,N466WN,251,270,237,43,62,LAS,ORF,2155,4,10,0,,0,13,0,0,0,30 2008,1,3,4,1816,1755,2022,2015,WN,700,N678AA,126,140,112,7,21,LAS,PDX,762,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,755,755,958,1020,WN,1669,N484WN,123,145,108,-22,0,LAS,PDX,762,6,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1328,1255,1549,1520,WN,2354,N258WN,141,145,106,29,33,LAS,PDX,762,22,13,0,,0,24,0,0,0,5 2008,1,3,4,2134,2040,2343,2300,WN,2550,N608SW,129,140,112,43,54,LAS,PDX,762,3,14,0,,0,12,0,0,0,31 2008,1,3,4,1127,1045,1856,1835,WN,1285,N435WN,269,290,245,21,42,LAS,PHL,2176,5,19,0,,0,18,0,0,0,3 2008,1,3,4,1720,1700,41,45,WN,1843,N484WN,261,285,244,-4,20,LAS,PHL,2176,4,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,559,600,800,805,WN,102,N639SW,61,65,43,-5,-1,LAS,PHX,256,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,712,705,932,915,WN,243,N228WN,80,70,49,17,7,LAS,PHX,256,7,24,0,,0,0,0,10,0,7 2008,1,3,4,1327,1320,1533,1530,WN,287,N747SA,66,70,50,3,7,LAS,PHX,256,5,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1817,1810,2016,2015,WN,504,N338SW,59,65,47,1,7,LAS,PHX,256,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1807,1740,2011,1950,WN,559,N603SW,64,70,49,21,27,LAS,PHX,256,5,10,0,,0,2,0,0,0,19 2008,1,3,4,1712,1545,1914,1750,WN,668,N221WN,62,65,48,84,87,LAS,PHX,256,4,10,0,,0,6,0,0,0,78 2008,1,3,4,1206,1205,1418,1415,WN,752,N680AA,72,70,47,3,1,LAS,PHX,256,4,21,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1134,1110,1343,1315,WN,781,N378SW,69,65,46,28,24,LAS,PHX,256,8,15,0,,0,9,0,4,0,15 
2008,1,3,4,1003,1005,1208,1215,WN,871,N327SW,65,70,49,-7,-2,LAS,PHX,256,5,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,815,820,1034,1025,WN,1034,N420WN,79,65,49,9,-5,LAS,PHX,256,4,26,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2156,2110,2353,2310,WN,1403,N258WN,57,60,43,43,46,LAS,PHX,256,5,9,0,,0,2,0,0,0,41 2008,1,3,4,2140,2040,2334,2240,WN,1682,N281WN,54,60,40,54,60,LAS,PHX,256,3,11,0,,0,2,0,0,0,52 2008,1,3,4,631,635,832,840,WN,2103,N291WN,61,65,44,-8,-4,LAS,PHX,256,5,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1521,1450,1725,1655,WN,2726,N777QC,64,65,46,30,31,LAS,PHX,256,5,13,0,,0,16,0,0,0,14 2008,1,3,4,2100,1945,2255,2150,WN,2991,N752SW,55,65,43,65,75,LAS,PHX,256,4,8,0,,0,48,0,0,0,17 2008,1,3,4,1657,1630,1905,1845,WN,3517,N487WN,68,75,49,20,27,LAS,PHX,256,5,14,0,,0,2,0,0,0,18 2008,1,3,4,926,915,1129,1120,WN,3598,N790SW,63,65,44,9,11,LAS,PHX,256,5,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1413,1415,1612,1620,WN,3849,N269WN,59,65,44,-8,-2,LAS,PHX,256,4,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1825,1745,114,50,WN,512,N794SW,229,245,210,24,40,LAS,PIT,1910,6,13,0,,0,10,0,0,0,14 2008,1,3,4,927,915,1631,1620,WN,594,N715SW,244,245,218,11,12,LAS,PIT,1910,7,19,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1630,1600,23,2350,WN,416,N401WN,293,290,266,33,30,LAS,PVD,2363,4,23,0,,0,6,0,3,0,24 2008,1,3,4,1136,1120,1852,1840,WN,170,N448WN,256,260,231,12,16,LAS,RDU,2027,6,19,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1824,1725,121,45,WN,1509,N403WN,237,260,222,36,59,LAS,RDU,2027,2,13,0,,0,27,0,0,0,9 2008,1,3,4,1716,1630,1830,1755,WN,239,N267WN,74,85,60,35,46,LAS,RNO,345,4,10,0,,0,11,0,0,0,24 2008,1,3,4,1928,1855,2034,2015,WN,298,N450WN,66,80,54,19,33,LAS,RNO,345,4,8,0,,0,9,0,0,0,10 2008,1,3,4,709,710,816,830,WN,376,N342SW,67,80,55,-14,-1,LAS,RNO,345,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1741,1730,1905,1850,WN,675,N286WN,84,80,57,15,11,LAS,RNO,345,14,13,0,,0,0,0,4,0,11 2008,1,3,4,1609,1540,1732,1700,WN,709,N428WN,83,80,55,32,29,LAS,RNO,345,7,21,0,,0,26,0,3,0,3 2008,1,3,4,913,840,1029,955,WN,891,N400WN,76,75,61,34,33,LAS,RNO,345,3,12,0,,0,8,0,1,0,25 
2008,1,3,4,2215,2200,2334,2315,WN,1341,N442WN,79,75,57,19,15,LAS,RNO,345,4,18,0,,0,6,0,4,0,9 2008,1,3,4,2137,2040,2251,2155,WN,2273,N286WN,74,75,56,56,57,LAS,RNO,345,4,14,0,,0,0,0,0,0,56 2008,1,3,4,624,625,729,740,WN,2538,N230WN,65,75,52,-11,-1,LAS,RNO,345,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1100,1020,1210,1145,WN,2719,N466WN,70,85,54,25,40,LAS,RNO,345,4,12,0,,0,0,0,1,0,24 2008,1,3,4,1313,1255,1437,1415,WN,3430,N487WN,84,80,57,22,18,LAS,RNO,345,4,23,0,,0,8,0,4,0,10 2008,1,3,4,1415,1350,1527,1510,WN,3835,N610WN,72,80,56,17,25,LAS,RNO,345,4,12,0,,0,9,0,0,0,8 2008,1,3,4,1108,1110,1221,1230,WN,3947,N277WN,73,80,56,-9,-2,LAS,RNO,345,3,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1348,1330,1455,1435,WN,341,N315SW,67,65,51,20,18,LAS,SAN,258,2,14,0,,0,8,0,2,0,10 2008,1,3,4,1221,1220,1329,1325,WN,535,N428WN,68,65,48,4,1,LAS,SAN,258,4,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2140,2050,2240,2150,WN,708,N692SW,60,60,45,50,50,LAS,SAN,258,3,12,0,,0,21,0,0,0,29 2008,1,3,4,805,805,909,905,WN,711,N207WN,64,60,45,4,0,LAS,SAN,258,3,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1643,1625,1751,1735,WN,715,N902WN,68,70,46,16,18,LAS,SAN,258,4,18,0,,0,4,0,0,0,12 2008,1,3,4,946,940,1045,1045,WN,1037,N734SA,59,65,46,0,6,LAS,SAN,258,2,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1915,1905,2016,2005,WN,1254,N490WN,61,60,44,11,10,LAS,SAN,258,4,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1839,1740,1945,1840,WN,2186,N692SW,66,60,44,65,59,LAS,SAN,258,4,18,0,,0,5,0,6,0,54 2008,1,3,4,2207,2125,2307,2225,WN,2211,N646SW,60,60,49,42,42,LAS,SAN,258,3,8,0,,0,17,0,0,0,25 2008,1,3,4,1503,1425,1604,1530,WN,2357,N414WN,61,65,48,34,38,LAS,SAN,258,3,10,0,,0,9,0,0,0,25 2008,1,3,4,1613,1515,1727,1615,WN,2363,N405WN,74,60,47,72,58,LAS,SAN,258,3,24,0,,0,0,0,19,0,53 2008,1,3,4,2248,2155,2346,2255,WN,2449,N454WN,58,60,45,51,53,LAS,SAN,258,3,10,0,,0,12,0,0,0,39 2008,1,3,4,612,615,713,715,WN,2712,N253WN,61,60,46,-2,-3,LAS,SAN,258,3,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1118,1110,1225,1215,WN,3423,N298WN,67,65,50,10,8,LAS,SAN,258,3,14,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1456,1445,1920,1920,WN,120,N277WN,144,155,126,0,11,LAS,SAT,1069,4,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2022,1935,36,5,WN,802,N902WN,134,150,122,31,47,LAS,SAT,1069,2,10,0,,0,23,0,0,0,8 2008,1,3,4,1049,1040,1509,1510,WN,2113,N350SW,140,150,127,-1,9,LAS,SAT,1069,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,753,750,1215,1220,WN,2906,N794SW,142,150,125,-5,3,LAS,SAT,1069,2,15,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1831,1810,110,45,WN,495,N768SW,219,215,192,25,21,LAS,SDF,1624,9,18,0,,0,18,0,4,0,3 2008,1,3,4,1704,1645,1929,1930,WN,340,N634SW,145,165,129,-1,19,LAS,SEA,866,7,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2017,2005,2241,2245,WN,563,N285WN,144,160,129,-4,12,LAS,SEA,866,5,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,639,640,859,925,WN,1769,N731SA,140,165,124,-26,-1,LAS,SEA,866,5,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1117,1050,1330,1330,WN,2465,N431WN,133,160,119,0,27,LAS,SEA,866,7,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1426,1355,1605,1530,WN,488,N615SW,99,95,75,35,31,LAS,SFO,414,3,21,0,,0,0,0,22,0,13 2008,1,3,4,1009,910,1148,1045,WN,619,N491WN,99,95,76,63,59,LAS,SFO,414,3,20,0,,0,0,0,54,0,9 2008,1,3,4,2021,1700,2303,1835,WN,2005,N302SW,162,95,73,268,201,LAS,SFO,414,4,85,0,,0,192,0,67,0,9 2008,1,3,4,2025,1905,2208,2040,WN,2788,N602SW,103,95,88,88,80,LAS,SFO,414,4,11,0,,0,0,0,63,0,25 2008,1,3,4,603,605,729,740,WN,2886,N246LV,86,95,70,-11,-2,LAS,SFO,414,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2301,2105,59,2240,WN,3061,N447WN,118,95,98,139,116,LAS,SFO,414,4,16,0,,0,0,0,131,0,8 2008,1,3,4,1518,1215,1646,1350,WN,3575,N492WN,88,95,72,176,183,LAS,SFO,414,4,12,0,,0,0,0,176,0,0 2008,1,3,4,1307,1235,1432,1400,WN,12,N361SW,85,85,71,32,32,LAS,SJC,386,4,10,0,,0,4,0,0,0,28 2008,1,3,4,615,610,746,730,WN,276,N472WN,91,80,74,16,5,LAS,SJC,386,4,13,0,,0,0,0,16,0,0 2008,1,3,4,2017,1945,2144,2105,WN,400,N429WN,87,80,70,39,32,LAS,SJC,386,4,13,0,,0,2,0,7,0,30 2008,1,3,4,1855,1855,2028,2020,WN,484,N673AA,93,85,67,8,0,LAS,SJC,386,14,12,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,2114,2040,2241,2200,WN,774,N437WN,87,80,69,41,34,LAS,SJC,386,8,10,0,,0,34,0,7,0,0 2008,1,3,4,1605,1520,1739,1645,WN,783,N306SW,94,85,68,54,45,LAS,SJC,386,5,21,0,,0,10,0,9,0,35 2008,1,3,4,956,1000,1134,1125,WN,1945,N271WN,98,85,70,9,-4,LAS,SJC,386,15,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,731,730,848,855,WN,2387,N776WN,77,85,66,-7,1,LAS,SJC,386,4,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1734,1655,1857,1820,WN,3844,N446WN,83,85,66,37,39,LAS,SJC,386,5,12,0,,0,9,0,0,0,28 2008,1,3,4,1139,1100,1408,1320,WN,161,N455WN,89,80,54,48,39,LAS,SLC,368,6,29,0,,0,39,0,9,0,0 2008,1,3,4,2104,1905,2314,2120,WN,415,N395SW,70,75,58,114,119,LAS,SLC,368,2,10,0,,0,25,0,0,0,89 2008,1,3,4,919,830,1132,1050,WN,643,N756SA,73,80,55,42,49,LAS,SLC,368,4,14,0,,0,3,0,0,0,39 2008,1,3,4,2113,2050,2328,2305,WN,1166,N351SW,75,75,58,23,23,LAS,SLC,368,3,14,0,,0,4,0,0,0,19 2008,1,3,4,1712,1555,1935,1820,WN,1585,N766SW,83,85,62,75,77,LAS,SLC,368,4,17,0,,0,0,0,6,0,69 2008,1,3,4,1419,1355,1627,1615,WN,1660,N613SW,68,80,55,12,24,LAS,SLC,368,2,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2242,2140,54,2355,WN,2533,N385SW,72,75,55,59,62,LAS,SLC,368,4,13,0,,0,0,0,10,0,49 2008,1,3,4,611,615,819,835,WN,3147,N296WN,68,80,55,-16,-4,LAS,SLC,368,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1949,1910,2110,2035,WN,94,N715SW,81,85,66,35,39,LAS,SMF,397,5,10,0,,0,25,0,0,0,10 2008,1,3,4,1409,1350,1539,1520,WN,584,N324SW,90,90,74,19,19,LAS,SMF,397,6,10,0,,0,19,0,0,0,0 2008,1,3,4,637,640,759,805,WN,624,N336SW,82,85,65,-6,-3,LAS,SMF,397,5,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1821,1730,1946,1855,WN,641,N626SW,85,85,67,51,51,LAS,SMF,397,6,12,0,,0,9,0,0,0,42 2008,1,3,4,916,905,1046,1035,WN,2121,N632SW,90,90,68,11,11,LAS,SMF,397,4,18,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1134,1115,1302,1250,WN,2622,N445WN,88,95,66,12,19,LAS,SMF,397,5,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1527,1515,1648,1645,WN,3053,N418WN,81,90,68,3,12,LAS,SMF,397,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2115,2100,2236,2225,WN,3862,N640SW,81,85,64,11,15,LAS,SMF,397,4,13,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1934,1935,2054,2035,WN,201,N251WN,80,60,43,19,-1,LAS,SNA,226,29,8,0,,0,0,0,19,0,0 2008,1,3,4,1305,1305,1358,1410,WN,300,N753SW,53,65,42,-12,0,LAS,SNA,226,4,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1917,1820,2014,1925,WN,433,N264LV,57,65,40,49,57,LAS,SNA,226,7,10,0,,0,16,0,0,0,33 2008,1,3,4,1125,1010,1224,1110,WN,1016,N765SW,59,60,45,74,75,LAS,SNA,226,5,9,0,,0,9,0,0,0,65 2008,1,3,4,1505,1500,1605,1605,WN,1479,N498WN,60,65,43,0,5,LAS,SNA,226,4,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,747,750,843,850,WN,3250,N245WN,56,60,42,-7,-3,LAS,SNA,226,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1143,1050,1641,1600,WN,773,N472WN,178,190,151,41,53,LAS,STL,1372,3,24,0,,0,2,0,0,0,39 2008,1,3,4,1558,1530,2049,2040,WN,1050,N212WN,171,190,150,9,28,LAS,STL,1372,4,17,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1940,1855,29,2355,WN,1583,N600WN,169,180,157,34,45,LAS,STL,1372,3,9,0,,0,22,0,0,0,12 2008,1,3,4,1820,1725,112,35,WN,3403,N737JW,232,250,217,37,55,LAS,TPA,1984,5,10,0,,0,29,0,0,0,8 2008,1,3,4,1143,1035,1621,1505,WN,574,N314SW,158,150,126,76,68,LAS,TUL,1076,3,29,0,,0,0,0,14,0,62 2008,1,3,4,2211,2115,17,2330,WN,486,N264LV,66,75,53,47,56,LAS,TUS,365,4,9,0,,0,5,0,0,0,42 2008,1,3,4,1821,1725,2032,1940,WN,637,N342SW,71,75,56,52,56,LAS,TUS,365,4,11,0,,0,24,0,0,0,28 2008,1,3,4,715,720,926,935,WN,1059,N637SW,71,75,55,-9,-5,LAS,TUS,365,3,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1341,1320,1556,1535,WN,1918,N267WN,75,75,55,21,21,LAS,TUS,365,3,17,0,,0,8,0,0,0,13 2008,1,3,4,2108,2030,2313,2240,WN,3142,N446WN,65,70,53,33,38,LAS,TUS,365,4,8,0,,0,8,0,0,0,25 2008,1,3,4,1118,1050,1333,1305,WN,3572,N489WN,75,75,54,28,28,LAS,TUS,365,3,18,0,,0,0,0,0,0,28 2008,1,3,4,1735,1710,2010,1955,WN,230,N364SW,95,105,81,15,25,LAX,ABQ,677,4,10,0,,0,0,0,4,0,11 2008,1,3,4,2105,2025,2342,2310,WN,361,N690SW,97,105,82,32,40,LAX,ABQ,677,6,9,0,,0,0,0,11,0,21 2008,1,3,4,1328,1315,1611,1600,WN,397,N424WN,103,105,80,11,13,LAX,ABQ,677,18,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,944,940,1218,1225,WN,3245,N780SW,94,105,82,-7,4,LAX,ABQ,677,4,8,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1024,1015,1504,1505,WN,304,N326SW,160,170,148,-1,9,LAX,AUS,1242,5,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,820,815,1357,1400,WN,133,N749SW,217,225,200,-3,5,LAX,BNA,1797,6,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1632,1540,2207,2125,WN,1107,N708SA,215,225,203,42,52,LAX,BNA,1797,4,8,0,,0,0,0,18,0,24 2008,1,3,4,1327,1255,1902,1840,WN,1324,N249WN,215,225,200,22,32,LAX,BNA,1797,4,11,0,,0,10,0,0,0,12 2008,1,3,4,1330,1320,1609,1615,WN,330,N236WN,99,115,86,-6,10,LAX,ELP,714,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1925,1735,2201,2025,WN,1671,N684WN,96,110,85,96,110,LAX,ELP,714,3,8,0,,0,5,0,0,0,91 2008,1,3,4,1005,945,1250,1235,WN,2635,N335SW,105,110,88,15,20,LAX,ELP,714,3,14,0,,0,7,0,0,0,8 2008,1,3,4,1149,1145,1639,1650,WN,205,N299WN,170,185,156,-11,4,LAX,HOU,1390,3,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1742,1625,2231,2125,WN,528,N496WN,169,180,157,66,77,LAX,HOU,1390,4,8,0,,0,11,0,0,0,55 2008,1,3,4,807,805,1301,1310,WN,1513,N305SW,174,185,163,-9,2,LAX,HOU,1390,5,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2020,1935,109,35,WN,3394,N667SW,169,180,159,34,45,LAX,HOU,1390,3,7,0,,0,6,0,0,0,28 2008,1,3,4,1501,1425,1554,1530,WN,416,N401WN,53,65,41,24,36,LAX,LAS,236,4,8,0,,0,18,0,0,0,6 2008,1,3,4,1205,1135,1313,1240,WN,479,N622SW,68,65,47,33,30,LAX,LAS,236,3,18,0,,0,16,0,3,0,14 2008,1,3,4,1652,1640,1748,1745,WN,495,N768SW,56,65,41,3,12,LAX,LAS,236,8,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,922,855,1102,1000,WN,574,N314SW,100,65,44,62,27,LAX,LAS,236,4,52,0,,0,13,0,35,0,14 2008,1,3,4,1835,1740,1931,1840,WN,1004,N676SW,56,60,42,51,55,LAX,LAS,236,6,8,0,,0,0,0,4,0,47 2008,1,3,4,1027,1015,1151,1120,WN,1079,N351SW,84,65,43,31,12,LAX,LAS,236,22,19,0,,0,11,0,19,0,1 2008,1,3,4,626,625,724,730,WN,1669,N484WN,58,65,45,-6,1,LAX,LAS,236,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1533,1530,1631,1635,WN,1843,N484WN,58,65,46,-4,3,LAX,LAS,236,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1910,1845,2026,1950,WN,2021,N499WN,76,65,42,36,25,LAX,LAS,236,25,9,0,,0,7,0,11,0,18 
2008,1,3,4,744,740,838,840,WN,2111,N673AA,54,60,46,-2,4,LAX,LAS,236,3,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2109,2100,2212,2205,WN,2202,N758SW,63,65,41,7,9,LAX,LAS,236,4,18,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2237,2140,2335,2245,WN,2454,N352SW,58,65,47,50,57,LAX,LAS,236,3,8,0,,0,3,0,0,0,47 2008,1,3,4,2025,1955,2120,2055,WN,2809,N335SW,55,60,43,25,30,LAX,LAS,236,4,8,0,,0,0,0,0,0,25 2008,1,3,4,1423,1335,1519,1440,WN,3144,N272WN,56,65,42,39,48,LAX,LAS,236,5,9,0,,0,11,0,0,0,28 2008,1,3,4,1337,1310,1825,1825,WN,321,N742SW,168,195,154,0,27,LAX,MCI,1363,5,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1142,1125,1626,1640,WN,1668,N262WN,164,195,156,-14,17,LAX,MCI,1363,3,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1928,1835,22,2345,WN,2208,N632SW,174,190,159,37,53,LAX,MCI,1363,6,9,0,,0,17,0,0,0,20 2008,1,3,4,634,630,1133,1140,WN,3495,N615SW,179,190,169,-7,4,LAX,MCI,1363,4,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1713,1625,2242,2215,WN,108,N550WN,209,230,192,27,48,LAX,MDW,1750,7,10,0,,0,5,0,0,0,22 2008,1,3,4,853,850,1431,1440,WN,224,N279WN,218,230,205,-9,3,LAX,MDW,1750,6,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1255,1240,1825,1835,WN,245,N730SW,210,235,201,-10,15,LAX,MDW,1750,4,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1453,1445,2130,2040,WN,771,N486WN,277,235,202,50,8,LAX,MDW,1750,64,11,0,,0,0,0,42,0,8 2008,1,3,4,1001,945,1550,1540,WN,1014,N494WN,229,235,201,10,16,LAX,MDW,1750,21,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,630,625,1209,1220,WN,1930,N724SW,219,235,206,-11,5,LAX,MDW,1750,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1831,1755,1,2350,WN,3588,N732SW,210,235,195,11,36,LAX,MDW,1750,6,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1030,1000,1142,1115,WN,184,N473WN,72,75,62,27,30,LAX,OAK,337,3,7,0,,0,0,0,0,0,27 2008,1,3,4,2004,1900,2127,2015,WN,303,N683SW,83,75,66,72,64,LAX,OAK,337,4,13,0,,0,0,0,13,0,59 2008,1,3,4,923,900,1114,1015,WN,418,N631SW,111,75,63,59,23,LAX,OAK,337,4,44,0,,0,0,0,36,0,23 2008,1,3,4,1419,1400,1534,1515,WN,420,N268WN,75,75,65,19,19,LAX,OAK,337,4,6,0,,0,12,0,0,0,7 
2008,1,3,4,1522,1500,1638,1615,WN,430,N752SW,76,75,65,23,22,LAX,OAK,337,5,6,0,,0,7,0,1,0,15 2008,1,3,4,1803,1730,1933,1845,WN,440,N720WN,90,75,67,48,33,LAX,OAK,337,5,18,0,,0,6,0,15,0,27 2008,1,3,4,2146,2100,2304,2215,WN,724,N342SW,78,75,65,49,46,LAX,OAK,337,4,9,0,,0,0,0,3,0,46 2008,1,3,4,2001,1930,2118,2045,WN,810,N648SW,77,75,64,33,31,LAX,OAK,337,4,9,0,,0,0,0,2,0,31 2008,1,3,4,1827,1800,1945,1915,WN,829,N476WN,78,75,63,30,27,LAX,OAK,337,4,11,0,,0,0,0,3,0,27 2008,1,3,4,1904,1830,2019,1945,WN,1026,N224WN,75,75,64,34,34,LAX,OAK,337,6,5,0,,0,0,0,0,0,34 2008,1,3,4,1450,1430,1607,1545,WN,1032,N489WN,77,75,64,22,20,LAX,OAK,337,5,8,0,,0,0,0,2,0,20 2008,1,3,4,1722,1700,1836,1815,WN,1061,N422WN,74,75,63,21,22,LAX,OAK,337,4,7,0,,0,4,0,0,0,17 2008,1,3,4,1820,1630,1943,1745,WN,1135,N304SW,83,75,71,118,110,LAX,OAK,337,4,8,0,,0,0,0,8,3,107 2008,1,3,4,NA,1100,NA,1215,WN,1146,,NA,75,NA,NA,NA,LAX,OAK,337,NA,NA,1,A,0,NA,NA,NA,NA,NA 2008,1,3,4,1623,1600,1753,1715,WN,1168,N361SW,90,75,70,38,23,LAX,OAK,337,13,7,0,,0,0,0,15,0,23 2008,1,3,4,2008,2000,2136,2115,WN,1414,N402WN,88,75,67,21,8,LAX,OAK,337,5,16,0,,0,5,0,13,0,3 2008,1,3,4,1301,1300,1422,1415,WN,1655,N669SW,81,75,65,7,1,LAX,OAK,337,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,601,600,711,720,WN,2104,N236WN,70,80,60,-9,1,LAX,OAK,337,2,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,808,800,921,915,WN,2639,N693SW,73,75,62,6,8,LAX,OAK,337,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1206,1200,1328,1315,WN,3655,N465WN,82,75,63,13,6,LAX,OAK,337,4,15,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,700,700,816,815,WN,3756,N302SW,76,75,62,1,0,LAX,OAK,337,5,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1609,1600,1819,1815,WN,90,N609SW,70,75,56,4,9,LAX,PHX,370,5,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1811,1725,2016,1940,WN,109,N269WN,65,75,52,36,46,LAX,PHX,370,6,7,0,,0,5,0,0,0,31 2008,1,3,4,815,815,1034,1030,WN,236,N735SA,79,75,56,4,0,LAX,PHX,370,7,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,600,600,805,815,WN,277,N619SW,65,75,51,-10,0,LAX,PHX,370,4,10,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1851,1830,2059,2045,WN,314,N229WN,68,75,59,14,21,LAX,PHX,370,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2020,1905,2225,2115,WN,386,N718SW,65,70,56,70,75,LAX,PHX,370,4,5,0,,0,0,0,0,0,70 2008,1,3,4,1204,1150,1407,1405,WN,463,N342SW,63,75,53,2,14,LAX,PHX,370,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2116,2040,2318,2250,WN,520,N283WN,62,70,53,28,36,LAX,PHX,370,3,6,0,,0,5,0,0,0,23 2008,1,3,4,1556,1440,1804,1655,WN,746,N637SW,68,75,52,69,76,LAX,PHX,370,4,12,0,,0,69,0,0,0,0 2008,1,3,4,1040,1005,1241,1220,WN,1058,N202WN,61,75,50,21,35,LAX,PHX,370,4,7,0,,0,15,0,0,0,6 2008,1,3,4,940,920,1149,1135,WN,1155,N288WN,69,75,52,14,20,LAX,PHX,370,6,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1419,1335,1624,1550,WN,2015,N792SW,65,75,53,34,44,LAX,PHX,370,3,9,0,,0,11,0,0,0,23 2008,1,3,4,635,635,840,850,WN,2452,N746SW,65,75,53,-10,0,LAX,PHX,370,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1738,1720,1848,1835,WN,225,N283WN,70,75,58,13,18,LAX,RNO,390,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,853,845,1000,1005,WN,1844,N313SW,67,80,56,-5,8,LAX,RNO,390,5,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1422,1400,1532,1520,WN,2360,N618WN,70,80,60,12,22,LAX,RNO,390,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1159,1150,1635,1635,WN,2624,N693SW,156,165,141,0,9,LAX,SAT,1210,3,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,708,710,829,835,WN,457,N738CB,81,85,61,-6,-2,LAX,SFO,337,4,16,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,NA,905,NA,1025,WN,469,,NA,80,NA,NA,NA,LAX,SFO,337,NA,NA,1,A,0,NA,NA,NA,NA,NA 2008,1,3,4,2321,1955,38,2115,WN,593,N901WN,77,80,65,203,206,LAX,SFO,337,6,6,0,,0,0,0,203,0,0 2008,1,3,4,NA,1620,NA,1740,WN,618,,NA,80,NA,NA,NA,LAX,SFO,337,NA,NA,1,C,0,NA,NA,NA,NA,NA 2008,1,3,4,2008,1805,2139,1930,WN,646,N738CB,91,85,70,129,123,LAX,SFO,337,5,16,0,,0,0,24,6,0,99 2008,1,3,4,1625,1430,1748,1550,WN,656,N738CB,83,80,67,118,115,LAX,SFO,337,3,13,0,,0,0,0,5,0,113 2008,1,3,4,1305,1050,1421,1210,WN,680,N738CB,76,80,63,131,135,LAX,SFO,337,5,8,0,,0,0,0,131,0,0 2008,1,3,4,1558,1245,1709,1405,WN,776,N901WN,71,80,60,184,193,LAX,SFO,337,7,4,0,,0,184,0,0,0,0 
2008,1,3,4,1600,1455,1714,1600,WN,173,N331SW,74,65,54,74,65,LAX,SJC,308,5,15,0,,0,14,0,9,0,51 2008,1,3,4,1556,1525,1705,1630,WN,232,N740SW,69,65,53,35,31,LAX,SJC,308,3,13,0,,0,0,0,18,0,17 2008,1,3,4,2056,2000,2203,2105,WN,343,N687SW,67,65,56,58,56,LAX,SJC,308,5,6,0,,0,21,0,2,0,35 2008,1,3,4,616,615,720,720,WN,712,N671SW,64,65,51,0,1,LAX,SJC,308,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1159,1130,1259,1235,WN,769,N608SW,60,65,48,24,29,LAX,SJC,308,5,7,0,,0,0,0,9,0,15 2008,1,3,4,2140,2035,2249,2140,WN,1113,N731SA,69,65,54,69,65,LAX,SJC,308,6,9,0,,0,0,12,4,0,53 2008,1,3,4,1455,1425,1557,1530,WN,1609,N676SW,62,65,49,27,30,LAX,SJC,308,5,8,0,,0,23,0,0,0,4 2008,1,3,4,908,910,1005,1015,WN,1764,N740SW,57,65,48,-10,-2,LAX,SJC,308,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1925,1825,2024,1930,WN,2359,N385SW,59,65,49,54,60,LAX,SJC,308,3,7,0,,0,0,0,6,0,48 2008,1,3,4,NA,1930,NA,2035,WN,2528,,NA,65,NA,NA,NA,LAX,SJC,308,NA,NA,1,A,0,NA,NA,NA,NA,NA 2008,1,3,4,740,740,838,850,WN,2561,N773SA,58,70,49,-12,0,LAX,SJC,308,4,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1311,1240,1414,1350,WN,2820,N647SW,63,70,53,24,31,LAX,SJC,308,4,6,0,,0,7,0,0,0,17 2008,1,3,4,1014,1000,1120,1105,WN,2969,N642WN,66,65,51,15,14,LAX,SJC,308,4,11,0,,0,5,0,1,0,9 2008,1,3,4,1709,1700,1815,1805,WN,3149,N415WN,66,65,55,10,9,LAX,SJC,308,4,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1357,1330,1626,1615,WN,554,N626SW,89,105,79,11,27,LAX,SLC,590,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1114,1045,1347,1330,WN,963,N311SW,93,105,80,17,29,LAX,SLC,590,3,10,0,,0,4,0,0,0,13 2008,1,3,4,652,650,926,940,WN,973,N617SW,94,110,81,-14,2,LAX,SLC,590,6,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2117,2010,2355,2255,WN,1485,N605SW,98,105,84,60,67,LAX,SLC,590,4,10,0,,0,0,0,1,0,59 2008,1,3,4,1742,1650,2021,1935,WN,2899,N671SW,99,105,82,46,52,LAX,SLC,590,4,13,0,,0,8,0,0,0,38 2008,1,3,4,1108,1100,1226,1220,WN,92,N226WN,78,80,59,6,8,LAX,SMF,373,5,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,821,810,936,930,WN,337,N488WN,75,80,59,6,11,LAX,SMF,373,5,11,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,2013,1910,2121,2025,WN,1015,N384SW,68,75,58,56,63,LAX,SMF,373,3,7,0,,0,0,0,0,0,56 2008,1,3,4,609,610,722,735,WN,1234,N318SW,73,85,55,-13,-1,LAX,SMF,373,5,13,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,949,900,1103,1020,WN,1486,N613SW,74,80,61,43,49,LAX,SMF,373,5,8,0,,0,40,0,0,0,3 2008,1,3,4,2215,2125,2321,2245,WN,2437,N225WN,66,80,58,36,50,LAX,SMF,373,3,5,0,,0,1,0,0,0,35 2008,1,3,4,1524,1505,1634,1620,WN,2720,N629SW,70,75,59,14,19,LAX,SMF,373,4,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1441,1350,1549,1510,WN,2985,N727SW,68,80,56,39,51,LAX,SMF,373,4,8,0,,0,8,0,0,0,31 2008,1,3,4,1743,1715,1853,1830,WN,3155,N449WN,70,75,59,23,28,LAX,SMF,373,4,7,0,,0,8,0,0,0,15 2008,1,3,4,2049,2035,2204,2155,WN,3584,N416WN,75,80,62,9,14,LAX,SMF,373,6,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1725,1655,2238,2225,WN,71,N312SW,193,210,179,13,30,LAX,STL,1593,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1356,1345,1608,1600,WN,75,N283WN,72,75,60,8,11,LAX,TUS,451,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2005,1850,2219,2105,WN,1044,N306SW,74,75,61,74,75,LAX,TUS,451,3,10,0,,0,0,0,0,0,74 2008,1,3,4,725,725,931,940,WN,2378,N431WN,66,75,59,-9,0,LAX,TUS,451,2,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1658,1655,1908,1910,WN,3048,N695SW,70,75,60,-2,3,LAX,TUS,451,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1052,1040,1301,1300,WN,3831,N451WN,69,80,60,1,12,LAX,TUS,451,4,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2110,2040,2326,2300,WN,3939,N654SW,76,80,61,26,30,LAX,TUS,451,5,10,0,,0,0,0,0,0,26 2008,1,3,4,1910,1910,1936,1915,WN,160,N783SW,86,65,48,21,0,LBB,ABQ,289,29,9,0,,0,0,0,21,0,0 2008,1,3,4,1825,1745,1921,1850,WN,555,N506SW,56,65,47,31,40,LBB,AUS,341,4,5,0,,0,4,0,0,0,27 2008,1,3,4,652,655,752,800,WN,2974,N447WN,60,65,47,-8,-3,LBB,AUS,341,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,840,840,940,940,WN,15,N341SW,60,60,49,0,0,LBB,DAL,293,3,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1016,1010,1119,1110,WN,21,N501SW,63,60,49,9,6,LBB,DAL,293,3,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1402,1340,1501,1440,WN,35,N520SW,59,60,50,21,22,LBB,DAL,293,2,7,0,,0,3,0,0,0,18 
2008,1,3,4,1652,1640,1748,1740,WN,141,N370SW,56,60,48,8,12,LBB,DAL,293,3,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,611,615,710,715,WN,906,N301SW,59,60,47,-5,-4,LBB,DAL,293,3,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1825,1815,1923,1915,WN,1312,N501SW,58,60,47,8,10,LBB,DAL,293,5,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1606,1525,1705,1625,WN,2382,N404WN,59,60,50,40,41,LBB,DAL,293,3,6,0,,0,0,0,0,0,40 2008,1,3,4,2007,2010,2102,2110,WN,2542,N736SA,55,60,41,-8,-3,LBB,DAL,293,3,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,915,915,910,915,WN,1028,N214WN,55,60,45,-5,0,LBB,ELP,295,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1152,1155,1201,1210,WN,573,N463WN,129,135,116,-9,-3,LBB,LAS,775,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1654,1550,2006,1900,WN,438,N309SW,132,130,121,66,64,LIT,BWI,912,3,8,0,,0,6,0,2,0,58 2008,1,3,4,703,705,808,810,WN,7,N365SW,65,65,48,-2,-2,LIT,DAL,296,3,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1516,1500,1614,1605,WN,41,N240WN,58,65,49,9,16,LIT,DAL,296,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1752,1705,1854,1810,WN,49,N502SW,62,65,49,44,47,LIT,DAL,296,4,9,0,,0,0,0,0,0,44 2008,1,3,4,1251,1235,1348,1340,WN,700,N678AA,57,65,48,8,16,LIT,DAL,296,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1108,1100,1206,1205,WN,2177,N522SW,58,65,49,1,8,LIT,DAL,296,3,6,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2029,1950,2127,2055,WN,2263,N514SW,58,65,49,32,39,LIT,DAL,296,3,6,0,,0,0,0,0,0,32 2008,1,3,4,806,805,909,910,WN,3748,N610WN,63,65,49,-1,1,LIT,DAL,296,6,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1417,1405,1535,1530,WN,231,N787SA,78,85,71,5,12,LIT,HOU,393,2,5,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1319,1310,1440,1440,WN,3053,N418WN,201,210,188,0,9,LIT,LAS,1295,5,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1051,1035,1246,1215,WN,8,N339SW,115,100,82,31,16,LIT,MDW,544,25,8,0,,0,16,0,15,0,0 2008,1,3,4,730,730,933,945,WN,250,N519SW,183,195,167,-12,0,LIT,PHX,1136,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1846,1815,1951,1920,WN,3168,N513SW,65,65,53,31,31,LIT,STL,296,4,8,0,,0,4,0,0,0,27 2008,1,3,4,642,640,752,745,WN,3757,N446WN,70,65,51,7,2,LIT,STL,296,4,15,0,,0,NA,NA,NA,NA,NA 
2008,1,3,4,1615,1130,1623,1135,WN,10,N617SW,68,65,56,288,285,MAF,ABQ,332,4,8,0,,0,285,0,3,0,0 2008,1,3,4,730,735,822,835,WN,1194,N664WN,52,60,42,-13,-5,MAF,AUS,294,3,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1852,1830,1942,1925,WN,2374,N379SW,50,55,40,17,22,MAF,AUS,294,3,7,0,,0,0,0,0,0,17 2008,1,3,4,1602,1535,1701,1635,WN,43,N503SW,59,60,46,26,27,MAF,DAL,319,3,10,0,,0,0,0,0,0,26 2008,1,3,4,1836,1805,1930,1905,WN,53,N527SW,54,60,46,25,31,MAF,DAL,319,3,5,0,,0,0,0,0,0,25 2008,1,3,4,1956,1945,2056,2045,WN,271,N512SW,60,60,46,11,11,MAF,DAL,319,2,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,944,915,1046,1015,WN,914,N692SW,62,60,53,31,29,MAF,DAL,319,2,7,0,,0,0,0,2,0,29 2008,1,3,4,625,630,723,730,WN,974,N410WN,58,60,45,-7,-5,MAF,DAL,319,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1306,1305,1404,1405,WN,1430,N523SW,58,60,47,-1,1,MAF,DAL,319,4,7,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,702,705,814,820,WN,539,N417WN,72,75,61,-6,-3,MAF,HOU,441,2,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1625,1450,1731,1605,WN,3828,N333SW,66,75,57,86,95,MAF,HOU,441,2,7,0,,0,85,0,0,0,1 2008,1,3,4,1739,1730,1758,1750,WN,317,N335SW,139,140,123,8,9,MAF,LAS,796,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1905,1850,2010,2000,WN,1493,N356SW,125,130,106,10,15,MCI,ABQ,718,5,14,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,856,850,950,1000,WN,1655,N669SW,114,130,102,-10,6,MCI,ABQ,718,4,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,811,810,932,935,WN,112,N527SW,81,85,68,-3,1,MCI,BNA,491,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1159,1150,1325,1310,WN,290,N224WN,86,80,69,15,9,MCI,BNA,491,5,12,0,,0,6,0,6,0,3 2008,1,3,4,2130,2115,2300,2235,WN,1568,N227WN,90,80,65,25,15,MCI,BNA,491,11,14,0,,0,0,0,10,0,15 2008,1,3,4,1708,1710,1829,1835,WN,1668,N262WN,81,85,68,-6,-2,MCI,BNA,491,5,8,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,646,645,1012,1005,WN,3062,N415WN,146,140,128,7,1,MCI,BWI,967,7,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1404,1350,1726,1710,WN,3320,N354SW,142,140,125,16,14,MCI,BWI,967,9,8,0,,0,10,0,2,0,4 2008,1,3,4,1958,1930,2310,2250,WN,3829,N465WN,132,140,118,20,28,MCI,BWI,967,5,9,0,,0,0,0,0,0,20 
2008,1,3,4,726,725,850,900,WN,11,N689SW,84,95,71,-10,1,MCI,DAL,461,4,9,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,937,910,1102,1040,WN,19,N511SW,85,90,71,22,27,MCI,DAL,461,4,10,0,,0,2,0,0,0,20 2008,1,3,4,1251,1235,1415,1405,WN,33,N525SW,84,90,70,10,16,MCI,DAL,461,4,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2009,1910,2132,2040,WN,59,N628SW,83,90,70,52,59,MCI,DAL,461,2,11,0,,0,0,0,0,0,52 2008,1,3,4,1602,1605,1724,1735,WN,2010,N210WN,82,90,69,-11,-3,MCI,DAL,461,3,10,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,2102,2100,2225,2230,WN,3239,N485WN,83,90,67,-5,2,MCI,DAL,461,4,12,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1101,1055,1142,1145,WN,454,N603SW,101,110,84,-3,6,MCI,DEN,533,6,11,0,,0,NA,NA,NA,NA,NA 2008,1,3,4,1917,1855,1959,1940,WN,977,N376SW,102,105,88,19,22,MCI,DEN,533,6,8,0,,0,3,0,0,0,16 2008,1,3,4,1502,1440,1540,1525,WN,1151,N386SW,98,105,83,15,22,MCI,DEN,533,5,10,0,,0,12,0,0,0,3 data.table/inst/tests/issue_1113_fread.txt0000644000175100001440000001621513172210047020107 0ustar hornikusersITERATION THETA1 THETA2 THETA3 THETA4 THETA5 THETA6 THETA7 THETA8 THETA9 THETA10 THETA11 THETA12 THETA13 THETA14 THETA15 THETA16 THETA17 SIGMA(1,1) OMEGA(1,1) OMEGA(2,1) OMEGA(2,2) OMEGA(3,1) OMEGA(3,2) OMEGA(3,3) OMEGA(4,1) OMEGA(4,2) OMEGA(4,3) OMEGA(4,4) OMEGA(5,1) OMEGA(5,2) OMEGA(5,3) OMEGA(5,4) OMEGA(5,5) OMEGA(6,1) OMEGA(6,2) OMEGA(6,3) OMEGA(6,4) OMEGA(6,5) OMEGA(6,6) OMEGA(7,1) OMEGA(7,2) OMEGA(7,3) OMEGA(7,4) OMEGA(7,5) OMEGA(7,6) OMEGA(7,7) OMEGA(8,1) OMEGA(8,2) OMEGA(8,3) OMEGA(8,4) OMEGA(8,5) OMEGA(8,6) OMEGA(8,7) OMEGA(8,8) OMEGA(9,1) OMEGA(9,2) OMEGA(9,3) OMEGA(9,4) OMEGA(9,5) OMEGA(9,6) OMEGA(9,7) OMEGA(9,8) OMEGA(9,9) OMEGA(10,1) OMEGA(10,2) OMEGA(10,3) OMEGA(10,4) OMEGA(10,5) OMEGA(10,6) OMEGA(10,7) OMEGA(10,8) OMEGA(10,9) OMEGA(10,10) OMEGA(11,1) OMEGA(11,2) OMEGA(11,3) OMEGA(11,4) OMEGA(11,5) OMEGA(11,6) OMEGA(11,7) OMEGA(11,8) OMEGA(11,9) OMEGA(11,10) OMEGA(11,11) OMEGA(12,1) OMEGA(12,2) OMEGA(12,3) OMEGA(12,4) OMEGA(12,5) OMEGA(12,6) OMEGA(12,7) OMEGA(12,8) OMEGA(12,9) OMEGA(12,10) OMEGA(12,11) 
OMEGA(12,12) OMEGA(13,1) OMEGA(13,2) OMEGA(13,3) OMEGA(13,4) OMEGA(13,5) OMEGA(13,6) OMEGA(13,7) OMEGA(13,8) OMEGA(13,9) OMEGA(13,10) OMEGA(13,11) OMEGA(13,12) OMEGA(13,13) OMEGA(14,1) OMEGA(14,2) OMEGA(14,3) OMEGA(14,4) OMEGA(14,5) OMEGA(14,6) OMEGA(14,7) OMEGA(14,8) OMEGA(14,9) OMEGA(14,10) OMEGA(14,11) OMEGA(14,12) OMEGA(14,13) OMEGA(14,14) OMEGA(15,1) OMEGA(15,2) OMEGA(15,3) OMEGA(15,4) OMEGA(15,5) OMEGA(15,6) OMEGA(15,7) OMEGA(15,8) OMEGA(15,9) OMEGA(15,10) OMEGA(15,11) OMEGA(15,12) OMEGA(15,13) OMEGA(15,14) OMEGA(15,15) MCMCOBJ -25000 -2.50000E+00 2.30000E+00 6.20000E-01 5.30000E-01 -3.00000E+00 -8.10000E-01 8.30000E+00 4.10000E+00 -3.50000E+00 -3.40000E+00 -9.90000E-01 -2.30000E-02 6.80000E-01 9.10000E-01 2.30000E+00 1.00000E-01 1.00000E-01 1.33669E-04 4.00000E-01 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 4.00000E-01 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 1.00000E-02 
1.00000E-02 1.00000E-02 4.00000E-01 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 1.00000E-01 120924.21730058071 -24999 -2.49472E+00 2.15887E+00 6.23667E-01 2.86888E-01 -2.60190E+00 -4.83346E-01 8.65879E+00 3.82465E+00 -3.69026E+00 -3.55164E+00 -9.91289E-01 -2.35327E-02 8.94482E-01 1.04624E+00 2.30139E+00 -3.68905E-01 4.83838E-03 1.24727E-02 6.69962E-01 -1.82743E-01 8.87432E-01 -1.35321E-01 2.17601E-01 2.36157E-01 -8.46622E-01 2.63270E-01 1.65191E-01 1.24854E+00 1.17863E-01 -2.82260E-01 -1.82098E-01 -1.48054E-01 4.15367E-01 4.96165E-01 -8.13818E-01 -1.68830E-01 -7.29928E-01 2.21669E-01 1.33956E+00 -1.48864E-01 -1.94915E-01 -7.53074E-02 2.53751E-01 1.89197E-01 -7.45796E-02 3.93262E-01 -5.40804E-01 5.56132E-01 1.82852E-01 7.71652E-01 -2.07298E-01 -9.08063E-01 9.73558E-02 8.67754E-01 2.11909E-01 -1.42553E-01 -2.65172E-02 -2.99459E-01 -1.44428E-02 2.26481E-01 -6.58063E-02 -2.38070E-01 1.87605E-01 -9.53940E-02 1.73642E-02 2.40828E-02 1.15392E-01 -9.60601E-02 -1.28952E-01 6.09570E-03 1.35560E-01 3.99417E-03 2.19721E-01 -3.23673E-01 2.81581E-01 9.73522E-02 4.50066E-01 -1.96140E-01 -4.24467E-01 -1.24865E-02 3.45742E-01 -8.10472E-02 8.80072E-02 4.08855E-01 -5.17385E-01 4.63403E-01 1.61036E-01 6.95697E-01 -1.73728E-01 -6.39965E-01 -8.19083E-03 5.72855E-01 -1.87910E-01 7.46275E-02 3.63738E-01 7.19679E-01 3.91232E-02 4.39350E-02 1.59526E-01 -1.40054E-01 -1.06957E-01 1.70758E-01 -2.11898E-01 -1.73081E-01 1.14536E-01 -4.40678E-02 3.79919E-02 5.98738E-02 6.00977E-01 4.26254E-01 -1.10011E-01 -1.27825E-01 -5.33123E-01 8.27641E-02 2.77281E-01 -7.16001E-02 -2.76509E-01 1.10513E-01 -3.72377E-02 -2.81194E-01 -3.32530E-01 -1.31069E-01 4.76796E-01 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 1.76212E-01 -4787.0714136013094 -24998 -2.47806E+00 
2.16679E+00 5.44640E-01 1.40853E-01 -2.97418E+00 -8.07030E-01 8.59191E+00 4.05605E+00 -3.34552E+00 -3.14465E+00 -1.29517E+00 -3.53758E-01 6.90913E-01 1.10941E+00 2.65561E+00 1.81350E-01 -7.13636E-02 6.39860E-03 7.58939E-01 -7.41282E-02 4.50564E-01 -1.56544E-01 5.50040E-02 4.00618E-01 -8.38686E-01 1.99437E-01 3.24735E-01 1.13226E+00 -4.37298E-02 -9.98388E-02 -5.41804E-03 1.95687E-02 1.24519E-01 3.99408E-01 -3.52834E-01 -1.41210E-01 -5.96706E-01 7.43882E-02 6.21663E-01 -3.49265E-01 6.11820E-02 1.06495E-01 4.17807E-01 5.24262E-03 -2.02151E-01 3.19864E-01 -4.69009E-01 1.83375E-01 1.02173E-01 6.00857E-01 -1.83947E-03 -3.97512E-01 2.31033E-01 4.48351E-01 1.06819E-01 -5.79161E-02 7.86940E-03 -1.27090E-01 8.37271E-03 9.19823E-02 -6.28913E-02 -8.98966E-02 1.14526E-01 -2.35081E-01 1.50964E-02 6.74098E-02 3.09622E-01 1.77301E-02 -1.47651E-01 1.52674E-01 1.63312E-01 -3.81844E-02 2.62263E-01 -5.78659E-01 2.00738E-01 8.33776E-02 7.28190E-01 1.39433E-02 -5.48396E-01 2.93647E-01 4.67792E-01 -1.14388E-01 2.68342E-01 8.09273E-01 -4.13419E-01 2.51892E-01 1.64555E-01 5.53619E-01 -6.07903E-02 -4.76312E-01 1.87828E-01 3.62802E-01 -5.54501E-02 1.73611E-01 4.49881E-01 5.55499E-01 -1.64640E-01 -8.08169E-02 -1.32878E-01 1.19849E-01 4.16466E-02 4.80779E-03 1.91021E-02 8.57646E-02 -7.56957E-02 2.08879E-02 1.51795E-01 -5.06103E-02 4.31028E-01 5.68674E-01 1.82220E-02 -1.06964E-01 -6.01272E-01 -7.45867E-02 1.91128E-01 -2.86643E-01 -3.07017E-01 6.83705E-02 -2.12983E-01 -4.43121E-01 -1.64361E-01 -2.31855E-01 7.73541E-01 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 0.00000E+00 6.55898E-02 -5326.7487129923438 data.table/inst/tests/1680-fread-header-encoding.csv0000644000175100001440000000027713172210047021615 0ustar hornikusersOrt;Straße;Bezeichnung Vienna;Testgasse 1;"Ministerium ""Pestalozzi""" Graz;Teststraße 3;HS Salzburg;Beispielstraße 9;"NMS ""Die Schlauen""" Vienna;Wolfgang-Straße 
7;"Wirtshaus ""Wien III""" data.table/inst/tests/fread_blank3.txt0000644000175100001440000000005213172210047017454 0ustar hornikusersa,b,c 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 data.table/inst/tests/issue_1116_fread_few_lines_2.txt0000644000175100001440000000151013172210047022356 0ustar hornikusersx,y 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" 1,"a ,,,,,,,, _b" data.table/inst/tests/1206FUT.txt0000644000175100001440000011335413172210047016122 0ustar hornikusersDATE COM COM_MM COM_YY OPEN_1 OPEN_I1 OPEN_2 OPEN_I2 HIGH HIGH_I LOW LOW_I CLSE_1 CLSE_I1 CLSE_2 CLSE_I2 SETTLE VOLUME OINT DEL RECTYP 20121206 AP 000879.17 000879.99 000876.41 000878.69 20121206 EX 002598.11 002617.83 002590.51 002603.41 20121206 CN 007203.54 007235.40 007153.29 007202.63 20121206 HK 012232.40 012278.00 012200.81 012215.93 20121206 ID 005093.97 005106.74 005066.37 005072.28 20121206 IN 005926.30 005942.55 005838.90 005930.90 20121206 NK 009535.69 009565.43 009503.31 009545.16 20121206 SG 000350.41 000350.59 000348.32 000350.03 20121206 ST 003081.25 003082.88 003065.79 003078.20 20121206 TW 000276.26 000276.85 000274.85 000275.15 20121206 AH 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000210400 000000 000002 00000 A 20121206 AH 01 2013 000000000 000000000 
000000000 000000000 000000000 000000000 000208300 000000 000021 00000 A 20121206 AH 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000209500 000000 000000 00000 A 20121206 AH 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000210400 000000 000000 00000 A 20121206 AH 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000211100 000000 000000 00000 A 20121206 AH 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000211850 000000 000000 00000 A 20121206 AH 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000212650 000000 000000 00000 A 20121206 AH 07 2013 000000000 000000000 000000000 000000000 000000000 000000000 000213350 000000 000000 00000 A 20121206 AH 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000214250 000000 000000 00000 A 20121206 AH 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000215000 000000 000000 00000 A 20121206 AH 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000215750 000000 000000 00000 A 20121206 AH 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000216600 000000 000000 00000 A 20121206 AP 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000087650 000000 000000 00000 A 20121206 AP 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000087650 000000 000000 00000 A 20121206 AP 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000087700 000000 000000 00000 A 20121206 AP 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000087700 000000 000000 00000 A 20121206 CN 12 2012 000729500 000000000 000736000 000723000 000724500 000000000 000724500 071547 240250 00000 A 20121206 CN 01 2013 000732500 000000000 000739500 000726000 000727500 000000000 000728500 008678 006896 00000 A 20121206 CN 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000725000 000000 000000 00000 A 20121206 CN 03 2013 
000000000 000000000 000000000 000000000 000000000 000000000 000725500 000000 000000 00000 A 20121206 CN 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000714000 000000 000000 00000 A 20121206 CN 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000710000 000000 000000 00000 A 20121206 CU 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000800700 000000 000005 00000 A 20121206 CU 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000801500 000000 000002 00000 A 20121206 CU 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000802200 000000 000002 00000 A 20121206 CU 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000802900 000000 000000 00000 A 20121206 CU 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000803400 000000 000000 00000 A 20121206 CU 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000803900 000000 000000 00000 A 20121206 CU 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000804300 000000 000000 00000 A 20121206 CU 07 2013 000000000 000000000 000000000 000000000 000000000 000000000 000804600 000000 000000 00000 A 20121206 CU 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000804900 000000 000000 00000 A 20121206 CU 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000805200 000000 000000 00000 A 20121206 CU 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000805400 000000 000000 00000 A 20121206 CU 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000805600 000000 000000 00000 A 20121206 ED 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000996925 000000 001511 00000 A 20121206 ED 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996950 000000 000000 00000 A 20121206 ED 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996950 000000 000000 00000 A 
20121206 ED 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996950 000000 000346 00000 A 20121206 ED 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996950 000000 000000 00000 A 20121206 ED 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997000 000000 000000 00000 A 20121206 ED 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996800 000000 000249 00000 A 20121206 ED 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996700 000000 000260 00000 A 20121206 ED 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000996450 000000 000300 00000 A 20121206 ED 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000996250 000000 000120 00000 A 20121206 ED 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000995950 000000 000100 00000 A 20121206 ED 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000995600 000000 001003 00000 A 20121206 ED 12 2014 000000000 000000000 000000000 000000000 000000000 000000000 000995100 000000 000003 00000 A 20121206 ED 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000994650 000000 000003 00000 A 20121206 ED 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000994000 000000 000000 00000 A 20121206 ED 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000993200 000000 000000 00000 A 20121206 ED 12 2015 000000000 000000000 000000000 000000000 000000000 000000000 000992150 000000 000000 00000 A 20121206 ED 03 2016 000000000 000000000 000000000 000000000 000000000 000000000 000991000 000000 000000 00000 A 20121206 ED 06 2016 000000000 000000000 000000000 000000000 000000000 000000000 000989700 000000 000000 00000 A 20121206 ED 09 2016 000000000 000000000 000000000 000000000 000000000 000000000 000988300 000000 000000 00000 A 20121206 ED 12 2016 000000000 000000000 000000000 000000000 000000000 000000000 000986800 
000000 000000 00000 A 20121206 ED 03 2017 000000000 000000000 000000000 000000000 000000000 000000000 000985450 000000 000000 00000 A 20121206 ED 06 2017 000000000 000000000 000000000 000000000 000000000 000000000 000984050 000000 000000 00000 A 20121206 ED 09 2017 000000000 000000000 000000000 000000000 000000000 000000000 000982650 000000 000000 00000 A 20121206 ED 12 2017 000000000 000000000 000000000 000000000 000000000 000000000 000981150 000000 000000 00000 A 20121206 ED 03 2018 000000000 000000000 000000000 000000000 000000000 000000000 000979950 000000 000000 00000 A 20121206 ED 06 2018 000000000 000000000 000000000 000000000 000000000 000000000 000978700 000000 000000 00000 A 20121206 ED 09 2018 000000000 000000000 000000000 000000000 000000000 000000000 000977600 000000 000000 00000 A 20121206 ED 12 2018 000000000 000000000 000000000 000000000 000000000 000000000 000976400 000000 000000 00000 A 20121206 ED 03 2019 000000000 000000000 000000000 000000000 000000000 000000000 000975550 000000 000000 00000 A 20121206 ED 06 2019 000000000 000000000 000000000 000000000 000000000 000000000 000974650 000000 000000 00000 A 20121206 ED 09 2019 000000000 000000000 000000000 000000000 000000000 000000000 000973800 000000 000000 00000 A 20121206 ED 12 2019 000000000 000000000 000000000 000000000 000000000 000000000 000972900 000000 000000 00000 A 20121206 ED 03 2020 000000000 000000000 000000000 000000000 000000000 000000000 000972350 000000 000000 00000 A 20121206 ED 06 2020 000000000 000000000 000000000 000000000 000000000 000000000 000971800 000000 000000 00000 A 20121206 ED 09 2020 000000000 000000000 000000000 000000000 000000000 000000000 000971200 000000 000000 00000 A 20121206 ED 12 2020 000000000 000000000 000000000 000000000 000000000 000000000 000970500 000000 000000 00000 A 20121206 ED 03 2021 000000000 000000000 000000000 000000000 000000000 000000000 000970100 000000 000000 00000 A 20121206 ED 06 2021 000000000 000000000 000000000 000000000 000000000 
000000000 000969750 000000 000000 00000 A 20121206 ED 09 2021 000000000 000000000 000000000 000000000 000000000 000000000 000969350 000000 000000 00000 A 20121206 ED 12 2021 000000000 000000000 000000000 000000000 000000000 000000000 000968900 000000 000000 00000 A 20121206 ED 03 2022 000000000 000000000 000000000 000000000 000000000 000000000 000968650 000000 000000 00000 A 20121206 ED 06 2022 000000000 000000000 000000000 000000000 000000000 000000000 000968300 000000 000000 00000 A 20121206 ED 09 2022 000000000 000000000 000000000 000000000 000000000 000000000 000967850 000000 000000 00000 A 20121206 EL 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000996650 000000 000000 00000 A 20121206 EL 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997075 000000 000000 00000 A 20121206 EL 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997300 000000 000000 00000 A 20121206 EL 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997375 000000 000000 00000 A 20121206 EL 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997400 000000 000000 00000 A 20121206 EL 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000997400 000000 000000 00000 A 20121206 EL 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000996950 000000 000000 00000 A 20121206 EL 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000995550 000000 000000 00000 A 20121206 EL 12 2014 000000000 000000000 000000000 000000000 000000000 000000000 000997800 000000 000000 00000 A 20121206 EL 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000996400 000000 000000 00000 A 20121206 EL 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000995000 000000 000000 00000 A 20121206 EL 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000993600 000000 000000 00000 A 20121206 EL 12 2015 000000000 000000000 000000000 
000000000 000000000 000000000 000992200 000000 000000 00000 A 20121206 EL 03 2016 000000000 000000000 000000000 000000000 000000000 000000000 000990800 000000 000000 00000 A 20121206 EL 06 2016 000000000 000000000 000000000 000000000 000000000 000000000 000989400 000000 000000 00000 A 20121206 EL 09 2016 000000000 000000000 000000000 000000000 000000000 000000000 000988000 000000 000000 00000 A 20121206 EL 12 2016 000000000 000000000 000000000 000000000 000000000 000000000 000986600 000000 000000 00000 A 20121206 EL 03 2017 000000000 000000000 000000000 000000000 000000000 000000000 000985200 000000 000000 00000 A 20121206 EL 06 2017 000000000 000000000 000000000 000000000 000000000 000000000 000983800 000000 000000 00000 A 20121206 EL 09 2017 000000000 000000000 000000000 000000000 000000000 000000000 000982400 000000 000000 00000 A 20121206 EX 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000259300 000000 000004 00000 A 20121206 EX 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000258800 000000 000000 00000 A 20121206 EX 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000258400 000000 000000 00000 A 20121206 EX 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000258500 000000 000000 00000 A 20121206 EX 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000252100 000000 000000 00000 A 20121206 EX 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000251600 000000 000000 00000 A 20121206 EX 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000249500 000000 000000 00000 A 20121206 EX 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000248600 000000 000000 00000 A 20121206 EX 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000241500 000000 000000 00000 A 20121206 EX 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000240800 000000 000000 00000 A 20121206 EX 12 2014 000000000 
000000000 000000000 000000000 000000000 000000000 000239600 000000 000000 00000 A 20121206 EX 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000238800 000000 000000 00000 A 20121206 EX 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000231900 000000 000000 00000 A 20121206 EX 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000231200 000000 000000 00000 A 20121206 EY 12 2012 000996850 000000000 000996850 000996850 000996850 000000000 000996850 000210 001972 00000 A 20121206 EY 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997250 000000 000407 00000 A 20121206 EY 06 2013 000997600 000000000 000997600 000997600 000997600 000000000 000997500 000010 000202 00000 A 20121206 EY 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997575 000000 000120 00000 A 20121206 EY 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000997600 000000 000384 00000 A 20121206 EY 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000997600 000000 000214 00000 A 20121206 EY 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000997150 000000 000000 00000 A 20121206 EY 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000995750 000000 000000 00000 A 20121206 EY 12 2014 000000000 000000000 000000000 000000000 000000000 000000000 000998000 000000 000004 00000 A 20121206 EY 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000996600 000000 000000 00000 A 20121206 EY 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000995200 000000 000000 00000 A 20121206 EY 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000993800 000000 000000 00000 A 20121206 EY 12 2015 000000000 000000000 000000000 000000000 000000000 000000000 000992400 000000 000000 00000 A 20121206 EY 03 2016 000000000 000000000 000000000 000000000 000000000 000000000 000991000 000000 000000 00000 A 20121206 EY 
06 2016 000000000 000000000 000000000 000000000 000000000 000000000 000989600 000000 000000 00000 A 20121206 EY 09 2016 000000000 000000000 000000000 000000000 000000000 000000000 000988200 000000 000000 00000 A 20121206 EY 12 2016 000000000 000000000 000000000 000000000 000000000 000000000 000986800 000000 000000 00000 A 20121206 EY 03 2017 000000000 000000000 000000000 000000000 000000000 000000000 000985400 000000 000000 00000 A 20121206 EY 06 2017 000000000 000000000 000000000 000000000 000000000 000000000 000984000 000000 000000 00000 A 20121206 EY 09 2017 000000000 000000000 000000000 000000000 000000000 000000000 000982600 000000 000000 00000 A 20121206 FB 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061300 000000 000000 00000 A 20121206 FB 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061600 000000 000000 00000 A 20121206 FB 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061800 000000 000000 00000 A 20121206 FB 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061640 000000 000000 00000 A 20121206 FB 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061480 000000 000000 00000 A 20121206 FB 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061320 000000 000000 00000 A 20121206 FB 07 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061160 000000 000000 00000 A 20121206 FB 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000061000 000000 000000 00000 A 20121206 FB 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000060840 000000 000000 00000 A 20121206 FB 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000060680 000000 000000 00000 A 20121206 FB 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000060520 000000 000000 00000 A 20121206 FB 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000060360 000000 000000 
00000 A 20121206 HK 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000012217 000000 000000 00000 A 20121206 HK 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000012220 000000 000000 00000 A 20121206 HK 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000012224 000000 000000 00000 A 20121206 HK 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000012186 000000 000000 00000 A 20121206 HK 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000011964 000000 000000 00000 A 20121206 HK 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000011867 000000 000000 00000 A 20121206 ID 12 2012 000507500 000000000 000507500 000507000 000507500 000000000 000507500 000035 001434 00000 A 20121206 ID 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000507000 000000 000000 00000 A 20121206 ID 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000507000 000000 000000 00000 A 20121206 ID 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000507500 000000 000000 00000 A 20121206 ID 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000499000 000000 000000 00000 A 20121206 ID 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000498500 000000 000000 00000 A 20121206 IN 12 2012 000593700 000000000 000598500 000587000 000597400 000000000 000597400 027408 322496 00000 A 20121206 IN 01 2013 000597600 000000000 000601250 000591050 000600700 000000000 000597250 000392 001312 00000 A 20121206 IN 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000597150 000000 000000 00000 A 20121206 IN 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000597250 000000 000002 00000 A 20121206 IN 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000594500 000000 000000 00000 A 20121206 IN 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 
000593300 000000 000000 00000 A 20121206 IR 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002831 000000 000000 00000 A 20121206 IR 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002831 000000 000000 00000 A 20121206 IR 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002841 000000 000000 00000 A 20121206 IR 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002854 000000 000000 00000 A 20121206 IR 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002866 000000 000000 00000 A 20121206 IR 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002873 000000 000000 00000 A 20121206 JB 12 2012 000145000 000000000 000145240 000144950 000145130 000000000 000145130 015844 022638 00000 A 20121206 JB 03 2013 000144490 000000000 000144740 000144450 000144660 000000000 000144660 014990 018891 00000 A 20121206 JB 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000142570 000000 000000 00000 A 20121206 JB 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000140480 000000 000000 00000 A 20121206 JB 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000138390 000000 000000 00000 A 20121206 JG 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000145130 000000 000000 00000 A 20121206 JG 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000144660 000000 000000 00000 A 20121206 JG 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000142570 000000 000000 00000 A 20121206 JG 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000140480 000000 000000 00000 A 20121206 JG 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000138390 000000 000000 00000 A 20121206 MR 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002891 000000 000000 00000 A 20121206 MR 02 2013 000000000 000000000 000000000 000000000 
000000000 000000000 000002891 000000 000000 00000 A 20121206 MR 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002901 000000 000000 00000 A 20121206 MR 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002914 000000 000000 00000 A 20121206 MR 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002926 000000 000000 00000 A 20121206 MR 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002933 000000 000000 00000 A 20121206 ND 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000020740 000000 009875 00000 A 20121206 ND 12 2013 000020550 000000000 000020550 000020550 000020550 000000000 000020560 000150 018002 00000 A 20121206 ND 12 2014 000020410 000000000 000020450 000020400 000020420 000000000 000020450 001028 011434 00000 A 20121206 ND 12 2015 000000000 000000000 000000000 000000000 000000000 000000000 000020070 000000 003792 00000 A 20121206 ND 12 2016 000019510 000000000 000019580 000019510 000019580 000000000 000019650 000070 003220 00000 A 20121206 ND 12 2017 000019220 000000000 000019220 000019220 000019220 000000000 000019230 000121 002624 00000 A 20121206 ND 12 2018 000000000 000000000 000000000 000000000 000000000 000000000 000018780 000000 000959 00000 A 20121206 ND 12 2019 000000000 000000000 000000000 000000000 000000000 000000000 000018310 000000 000899 00000 A 20121206 ND 12 2020 000000000 000000000 000000000 000000000 000000000 000000000 000017910 000000 000768 00000 A 20121206 ND 12 2021 000018140 000000000 000018140 000018140 000018140 000000000 000017430 000005 000466 00000 A 20121206 NK 12 2012 000948500 000000000 000957000 000945000 000954500 000000000 000954500 091127 278858 00000 A 20121206 NK 01 2013 000949000 000000000 000955000 000946000 000953000 000000000 000953000 000285 001220 00000 A 20121206 NK 02 2013 000954000 000000000 000954000 000953000 000953000 000000000 000953500 000006 000005 00000 A 20121206 NK 03 2013 000948500 000000000 
000955500 000944500 000953500 000000000 000953000 002989 017075 00000 A 20121206 NK 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000944500 000000 000000 00000 A 20121206 NK 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000945500 000000 001208 00000 A 20121206 NK 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000944500 000000 000000 00000 A 20121206 NK 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000938000 000000 009800 00000 A 20121206 NK 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000937000 000000 000000 00000 A 20121206 NK 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000929000 000000 000001 00000 A 20121206 NK 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000928000 000000 000003 00000 A 20121206 NK 12 2014 000000000 000000000 000000000 000000000 000000000 000000000 000921500 000000 002610 00000 A 20121206 NK 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000920000 000000 000000 00000 A 20121206 NK 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000912000 000000 000000 00000 A 20121206 NK 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000911000 000000 000000 00000 A 20121206 NS 12 2012 000951800 000000000 000954300 000951800 000954300 000000000 000954500 000022 001128 00000 A 20121206 NS 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000953000 000000 000840 00000 A 20121206 NS 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000945500 000000 000000 00000 A 20121206 NS 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000944500 000000 000000 00000 A 20121206 NU 12 2012 000955000 000000000 000955000 000955000 000955000 000000000 000955500 000002 017776 00000 A 20121206 NU 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000956000 000000 002819 00000 A 20121206 NU 06 2013 
000000000 000000000 000000000 000000000 000000000 000000000 000947000 000000 000000 00000 A 20121206 NU 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000944500 000000 000000 00000 A 20121206 RT 01 2013 000003000 000000000 000003010 000003000 000003010 000000000 000003010 000081 000244 00000 A 20121206 RT 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003030 000000 000036 00000 A 20121206 RT 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003042 000000 000070 00000 A 20121206 RT 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003072 000000 000115 00000 A 20121206 RT 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003098 000000 000208 00000 A 20121206 RT 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003108 000000 000080 00000 A 20121206 RT 07 2013 000003125 000000000 000003125 000003125 000003125 000000000 000003117 000004 000062 00000 A 20121206 RT 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003115 000000 000000 00000 A 20121206 RT 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003117 000000 000000 00000 A 20121206 RT 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003122 000000 000000 00000 A 20121206 RT 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003128 000000 000000 00000 A 20121206 RT 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000003128 000000 000000 00000 A 20121206 SB 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000010362 000000 000000 00000 A 20121206 SB 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000010362 000000 000000 00000 A 20121206 SD 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 
20121206 SD 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SD 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000097450 000000 000000 00000 A 20121206 SG 12 2012 000034940 000000000 000035010 000034800 000034960 000000000 000034960 006309 049414 00000 A 20121206 SG 01 2013 000034870 000000000 000034900 000034830 000034890 000000000 000034960 000006 000044 00000 A 20121206 SG 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000034950 000000 000000 00000 A 20121206 SG 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000034890 000000 000000 00000 A 20121206 SG 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000034540 000000 000000 00000 A 20121206 SG 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000034250 000000 000000 00000 A 20121206 ST 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000307000 000000 000006 00000 A 20121206 ST 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000307000 000000 000000 00000 A 20121206 ST 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000306900 000000 000000 00000 A 20121206 ST 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000306400 
000000 000000 00000 A 20121206 ST 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000303300 000000 000000 00000 A 20121206 ST 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000300800 000000 000000 00000 A 20121206 TF 12 2012 000000000 000000000 000000000 000000000 000000000 000000000 000000000 000000 000100 00000 A 20121206 TF 01 2013 000002863 000000000 000002865 000002836 000002839 000000000 000002840 000325 002525 00000 A 20121206 TF 02 2013 000002879 000000000 000002879 000002837 000002840 000000000 000002840 000357 002389 00000 A 20121206 TF 03 2013 000002880 000000000 000002880 000002850 000002850 000000000 000002850 000121 002918 00000 A 20121206 TF 04 2013 000002890 000000000 000002890 000002860 000002869 000000000 000002863 000136 002483 00000 A 20121206 TF 05 2013 000002900 000000000 000002900 000002870 000002875 000000000 000002875 000258 001385 00000 A 20121206 TF 06 2013 000002905 000000000 000002905 000002880 000002880 000000000 000002882 000067 000536 00000 A 20121206 TF 07 2013 000002905 000000000 000002905 000002900 000002900 000000000 000002896 000025 000365 00000 A 20121206 TF 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002893 000000 000170 00000 A 20121206 TF 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002894 000000 000066 00000 A 20121206 TF 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002893 000000 000068 00000 A 20121206 TF 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002897 000000 000070 00000 A 20121206 TF 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002900 000000 000000 00000 A 20121206 TR 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002891 000000 000000 00000 A 20121206 TR 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002891 000000 000000 00000 A 20121206 TR 03 2013 000000000 000000000 000000000 000000000 000000000 
000000000 000002901 000000 000000 00000 A 20121206 TR 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002914 000000 000000 00000 A 20121206 TR 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002926 000000 000000 00000 A 20121206 TR 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000002933 000000 000000 00000 A 20121206 TW 12 2012 000027600 000000000 000027740 000027530 000027610 000000000 000027610 040286 202741 00000 A 20121206 TW 01 2013 000027610 000000000 000027650 000027530 000027530 000000000 000027620 000057 000065 00000 A 20121206 TW 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000027620 000000 000000 00000 A 20121206 TW 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000027630 000000 000000 00000 A 20121206 TW 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000027690 000000 000004 00000 A 20121206 TW 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000026790 000000 000000 00000 A 20121206 TW 12 2013 000000000 000000000 000000000 000000000 000000000 000000000 000026730 000000 000000 00000 A 20121206 TW 03 2014 000000000 000000000 000000000 000000000 000000000 000000000 000026750 000000 000000 00000 A 20121206 TW 06 2014 000000000 000000000 000000000 000000000 000000000 000000000 000026780 000000 000000 00000 A 20121206 TW 09 2014 000000000 000000000 000000000 000000000 000000000 000000000 000025800 000000 000000 00000 A 20121206 TW 12 2014 000000000 000000000 000000000 000000000 000000000 000000000 000025830 000000 000000 00000 A 20121206 TW 03 2015 000000000 000000000 000000000 000000000 000000000 000000000 000025870 000000 000000 00000 A 20121206 TW 06 2015 000000000 000000000 000000000 000000000 000000000 000000000 000025910 000000 000000 00000 A 20121206 TW 09 2015 000000000 000000000 000000000 000000000 000000000 000000000 000024940 000000 000000 00000 A 20121206 ZS 12 2012 000000000 000000000 000000000 
000000000 000000000 000000000 000200500 000000 000001 00000 A 20121206 ZS 01 2013 000000000 000000000 000000000 000000000 000000000 000000000 000200800 000000 000024 00000 A 20121206 ZS 02 2013 000000000 000000000 000000000 000000000 000000000 000000000 000201800 000000 000000 00000 A 20121206 ZS 03 2013 000000000 000000000 000000000 000000000 000000000 000000000 000202700 000000 000000 00000 A 20121206 ZS 04 2013 000000000 000000000 000000000 000000000 000000000 000000000 000203400 000000 000000 00000 A 20121206 ZS 05 2013 000000000 000000000 000000000 000000000 000000000 000000000 000204150 000000 000000 00000 A 20121206 ZS 06 2013 000000000 000000000 000000000 000000000 000000000 000000000 000205000 000000 000000 00000 A 20121206 ZS 07 2013 000000000 000000000 000000000 000000000 000000000 000000000 000205750 000000 000000 00000 A 20121206 ZS 08 2013 000000000 000000000 000000000 000000000 000000000 000000000 000206500 000000 000000 00000 A 20121206 ZS 09 2013 000000000 000000000 000000000 000000000 000000000 000000000 000207250 000000 000000 00000 A 20121206 ZS 10 2013 000000000 000000000 000000000 000000000 000000000 000000000 000207950 000000 000000 00000 A 20121206 ZS 11 2013 000000000 000000000 000000000 000000000 000000000 000000000 000208700 000000 000000 00000 A data.table/inst/tests/fread_blank.txt0000644000175100001440000000013513172210047017373 0ustar hornikusersa,b,c 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 data.table/inst/tests/issue_563_fread.txt0000644000175100001440000000003413172210047020027 0ustar hornikusersA,B Ä…,ž Å«,į ų,Ä— Å¡,Ä™ data.table/inst/tests/issue_1116_fread_few_lines.txt0000644000175100001440000000156413172210047022146 0ustar hornikusersx,y 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a 
,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" 1,"a ,,,,,,,,,, _b" data.table/inst/tests/530_fread.txt0000644000175100001440000000031713172210047016615 0ustar hornikusersa,b,c,d 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2 1,2,3 a,b 1,3 2,4 data.table/inst/tests/russellCRCRLF.csv0000644000175100001440000000234013172210047017504 0ustar hornikusers"Index Name","Date","Value Without Dividends","Value With Dividends" "Russell Microcap® Value Index","06/30/2000",395.77,356.90 "Russell Microcap® Value Index","07/03/2000",397.94,359.39 "Russell Microcap® Value Index","07/05/2000",396.27,357.91 "Russell Microcap® Value Index","07/06/2000",398.75,360.17 "Russell Microcap® Value Index","07/07/2000",399.20,360.58 "Russell Microcap® Value Index","07/10/2000",400.47,361.73 "Russell Microcap® Value Index","07/11/2000",400.67,361.91 "Russell Microcap® Value Index","07/12/2000",404.90,365.79 "Russell Microcap® Value Index","07/13/2000",406.70,367.43 "Russell Microcap® Value Index","07/14/2000",407.68,368.32 "Russell Microcap® Value Index","07/17/2000",408.67,369.22 "Russell Microcap® Value Index","07/18/2000",407.84,368.48 "Russell Microcap® Value Index","07/19/2000",404.81,365.75 "Russell Microcap® Value Index","07/20/2000",406.55,367.35 "Russell Microcap® Value Index","07/21/2000",402.66,363.85 "Russell Microcap® Value 
Index","07/24/2000",398.75,360.32 "Russell Microcap® Value Index","07/25/2000",399.36,360.88 "Russell Microcap® Value Index","07/26/2000",400.21,361.66 "Russell Microcap® Value Index","07/27/2000",396.02,357.97 data.table/inst/tests/issue_773_fread.txt0000644000175100001440000000025613172210047020040 0ustar hornikusersAAA|BBB|CCC 4|5|6 7|8|9 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 1|2|3 31|32|33 21|22|23 ZZZ|YYY 10|11 1|2 1|2 1|2 1|2 data.table/inst/tests/issue_785_fread.txt0000644000175100001440000000454313172210047020046 0ustar hornikusersIFLxID IFLxName Ifcd Tdate Ttime UpdateMillisec Cp Chg ChgPct Cq Cm Oc S5 S4 S3 S2 S1 B1 B2 B3 B4 B5 Sv5 Sv4 Sv3 Sv2 Sv1 Bv1 Bv2 Bv3 Bv4 Bv5 BS Bsratio PreClosePrc OpenPrc Hp Lp ClosePrc UpperLmtPrc LowerLmtPrc Tq Tm PreOpnIntrst OpnIntrst PreStlmtPrc StlmtPrc PreDelta Delta SettleGroupID SettleID IFL1 abcd IF1005 20100421 100048 500 3227.20 10.60 0.330 5 4840740.000 abc 3228.00 3227.80 3227.60 3227.40 3227.20 3227.00 3226.80 3226.60 3226.40 3226.20 13.00 17.00 10.00 14.00 2.00 7.00 4.00 1.00 20.00 4.00 B 0.508 3214.60 3215.00 3231.20 3212.20 0.00 3538.20 2895.00 18165 17570483640.00 4496 5970 3216.60 0.00 0.000 0.000 0 IFL1 efgh IF1005 20100421 093725 500 3221.40 4.80 0.149 1 966420.000 def 3222.60 3222.40 3222.20 3222.00 3221.80 3221.40 3221.20 3221.00 3220.80 3220.60 15.00 4.00 14.00 2.00 5.00 4.00 14.00 16.00 1.00 3.00 S 0.505 3214.60 3215.00 3226.40 3212.20 0.00 3538.20 2895.00 8230 7952392200.00 4496 5427 3216.60 0.00 0.000 0.000 0 IFL1 ijkl IF1005 20100421 093726 0 3221.80 5.20 0.162 1 966540.000 ghi 3222.60 3222.40 3222.20 3222.00 3221.80 3221.40 3221.20 3221.00 3220.80 3220.60 15.00 4.00 14.00 2.00 4.00 4.00 14.00 16.00 1.00 3.00 B 0.505 3214.60 3215.00 3226.40 3212.20 0.00 3538.20 2895.00 8231 7953358740.00 4496 5428 3216.60 0.00 0.000 0.000 0 IFL1 mnop IF1005 20100421 093726 500 3221.80 5.20 0.162 3 2899620.000 jkl 3222.60 3222.40 3222.20 3222.00 3221.80 3221.40 
3221.20 3221.00 3220.80 3220.60 15.00 4.00 15.00 2.00 1.00 4.00 14.00 16.00 1.00 3.00 B 0.505 3214.60 3215.00 3226.40 3212.20 0.00 3538.20 2895.00 8234 7956258360.00 4496 5429 3216.60 0.00 0.000 0.000 0 data.table/inst/tests/issue_1462_fread_quotes.txt0000644000175100001440000001630613172210047021517 0ustar hornikusers897145298 urn:occurrence:Arctos:MSB:Host:9010:1861932 en http://arctos.database.museum/guid/MSB:Host:9010 http://arctosdb.org/home/data/ PhysicalObject PRESERVED_SPECIMEN 9010 Mammalia Host (of parasite) specimens NORTH_AMERICA US 2014-05-28T00:00Z 14 63.57609 -170.87962 sex=female ; weight=4 g; reproductive data=immature ; examined for parasites=yes ; parasites found=no ; hind foot with claw=12 mm; tail length=33 mm; total length=89 mm 226 1957-08-14T00:00Z Soricidae Sorex 2012-08-30 00:00:00.0 GeoLocate GeoLocate unverified Mariel L. Campbell Animalia; Chordata; Mammalia; Soricomorpha; Soricidae; Soricinae; Soricini; Hall & Gilmore, 1932 North America, Bering Sea, United States, Alaska, St. Lawrence Quad, Saint Lawrence Island A geographic distribution Gordon H. Jarrell http://arctos.database.museum/guid/MSB:Host:9010 MSB urn:lsid:biocol.org:col:34495 Saint Lawrence Island Animalia South Kangee trapline #3 Mariel L. Campbell 8 ICZN urn:occurrence:Arctos:MSB:Host:9010:1861932 negative for cestodes Soricomorpha collector number=19235; original identifier=347 Chordata observation Sorex jacksoni (accepted ID) identified by Gordon H. Jarrell on 2014-05-28; method: geographic distribution
Sorex identified by Robert L. Rausch on 1957-08-14; method: legacy Remark: verbatim ID = Sorex 19235 Collector(s): Robert L. Rausch Sorex jacksoni Hall & Gilmore, 1932 FEMALE jacksoni Alaska SPECIES 14 Aug 1957 S. Kangee SLI trap line #3 a.m. Bering Sea 1957 b211f32f-326b-43d3-8012-2fbce0cc6dcc US 2015-02-20T04:11Z COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84 true false 2436070 1 44 359 803 5534 2435935 2436070 Sorex jacksoni Sorex DWC_ARCHIVE 2015-02-20T04:11Z 2015-03-22T03:12Z 897145318 urn:occurrence:Arctos:MSB:Host:9011:1861933 en http://arctos.database.museum/guid/MSB:Host:9011 http://arctosdb.org/home/data/ PhysicalObject PRESERVED_SPECIMEN 9011 Mammalia Host (of parasite) specimens NORTH_AMERICA US 2014-05-28T00:00Z 14 63.57609 -170.87962 sex=female ; weight=4.2 g; reproductive data=immature ; examined for parasites=yes ; parasites found=no ; tail length=13 mm; total length=36 mm 226 1957-08-14T00:00Z Soricidae Sorex 2012-08-30 00:00:00.0 GeoLocate GeoLocate unverified Mariel L. Campbell Animalia; Chordata; Mammalia; Soricomorpha; Soricidae; Soricinae; Soricini; Hall & Gilmore, 1932 North America, Bering Sea, United States, Alaska, St. Lawrence Quad, Saint Lawrence Island A geographic distribution Gordon H. Jarrell http://arctos.database.museum/guid/MSB:Host:9011 MSB urn:lsid:biocol.org:col:34495 Saint Lawrence Island Animalia South Kangee trapline #3 Mariel L. Campbell 8 ICZN urn:occurrence:Arctos:MSB:Host:9011:1861933 negative for cestodes Soricomorpha collector number=19236; original identifier=328 Chordata observation Sorex jacksoni (accepted ID) identified by Gordon H. Jarrell on 2014-05-28; method: geographic distribution
Sorex identified by Robert L. Rausch on 1957-08-14; method: legacy Remark: verbatim ID = Sorex 19236 Collector(s): Robert L. Rausch Sorex jacksoni Hall & Gilmore, 1932 FEMALE jacksoni Alaska SPECIES 14 Aug 1957 S. Kangee SLI trap line #3 a.m. Bering Sea 1957 b211f32f-326b-43d3-8012-2fbce0cc6dcc US 2015-02-20T04:11Z COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84 true false 2436070 1 44 359 803 5534 2435935 2436070 Sorex jacksoni Sorex DWC_ARCHIVE 2015-02-20T04:11Z 2015-03-22T03:12Z 897145322 urn:occurrence:Arctos:MSB:Host:927:1853849 en http://arctos.database.museum/guid/MSB:Host:927 http://arctosdb.org/home/data/ PhysicalObject (host of) MSB:Para http://arctos.database.museum/guid/MSB:Para:6247 PRESERVED_SPECIMEN 927 Mammalia Host (of parasite) specimens NORTH_AMERICA US 1951-08-12T00:00Z 12 60.08292 -166.39397 sex=male ; weight=4.2 g; examined for parasites=yes ; parasites found=yes 224 1951-08-12T00:00Z Soricidae Sorex 2012-08-30 00:00:00.0 GeoLocate GeoLocate unverified Mariel L. Campbell Animalia Chordata Mammalia Soricomorpha Soricidae Soricinae Soricini Kerr, 1792 North America, Bering Sea, United States, Alaska, Nunivak Island A verbatim ID = L. T. Shrew; mlc: species ID based on Rausch legder number 10179, "L.T. Shrew Sorex cinereus" legacy Robert L. Rausch http://arctos.database.museum/guid/MSB:Host:927 MSB urn:lsid:biocol.org:col:34495 Nunivak Island Animalia Nunivak Mariel L. Campbell 8 ICZN urn:occurrence:Arctos:MSB:Host:927:1853849 "?" written in nematode column Soricomorpha collector number=10212B Chordata observation Sorex cinereus (accepted ID) identified by Robert L. Rausch on 1951-08-12; method: legacy Remark: verbatim ID = L. T. Shrew; mlc: species ID based on Rausch legder number 10179, "L.T. Shrew Sorex cinereus" 10212B Collector(s): Robert L. 
Rausch Sorex cinereus Kerr, 1792 MALE cinereus Alaska SPECIES 12 Aug '51 Nunivak Bering Sea 1951 b211f32f-326b-43d3-8012-2fbce0cc6dcc US 2014-12-19T21:26Z COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84 true false 2435964 1 44 359 803 5534 2435935 2435964 Sorex cinereus Sorex DWC_ARCHIVE 2014-12-19T21:26Z 2015-03-22T03:12Z 897145342 urn:occurrence:Arctos:MSB:Host:9012:1861934 en http://arctos.database.museum/guid/MSB:Host:9012 http://arctosdb.org/home/data/ PhysicalObject PRESERVED_SPECIMEN 9012 Mammalia Host (of parasite) specimens NORTH_AMERICA US 2014-05-28T00:00Z 14 63.57609 -170.87962 sex=male ; weight=4 g; reproductive data=immature; testis 1 x 1; thymus large ; examined for parasites=yes ; parasites found=no ; hind foot with claw=13 mm; tail length=35 mm; total length=93 mm 226 1957-08-14T00:00Z Soricidae Sorex 2012-08-30 00:00:00.0 GeoLocate GeoLocate unverified Mariel L. Campbell Animalia; Chordata; Mammalia; Soricomorpha; Soricidae; Soricinae; Soricini; Hall & Gilmore, 1932 North America, Bering Sea, United States, Alaska, St. Lawrence Quad, Saint Lawrence Island A geographic distribution Gordon H. Jarrell http://arctos.database.museum/guid/MSB:Host:9012 MSB urn:lsid:biocol.org:col:34495 Saint Lawrence Island Animalia South Kangee trapline #3 Mariel L. Campbell 8 ICZN urn:occurrence:Arctos:MSB:Host:9012:1861934 negative for cestodes Soricomorpha collector number=19237; original identifier=325 Chordata observation Sorex jacksoni (accepted ID) identified by Gordon H. Jarrell on 2014-05-28; method: geographic distribution
Sorex identified by Robert L. Rausch on 1957-08-14; method: legacy Remark: verbatim ID = Sorex 19237 Collector(s): Robert L. Rausch Sorex jacksoni Hall & Gilmore, 1932 MALE jacksoni Alaska SPECIES 14 Aug 1957 S. Kangee SLI trap line #3 a.m. Bering Sea 1957 b211f32f-326b-43d3-8012-2fbce0cc6dcc US 2015-02-20T04:11Z COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84 true false 2436070 1 44 359 803 5534 2435935 2436070 Sorex jacksoni Sorex DWC_ARCHIVE 2015-02-20T04:11Z 2015-03-22T03:12Z data.table/inst/tests/issue_1573_fill.txt0000644000175100001440000000110413172210047017755 0ustar hornikusersSD1 ST1 SMS1 SD2 ST2 SMS2 SD3 ST3 SMS3 SD4 ST4 SMS4 01-11-2015 00:00:01 323 2015-11-01 00:00:01 551 01-11-2015 00:00:02 289 2015-11-01 00:00:02 618 01-11-2015 01:13:16 253 2015-11-01 01:13:25 511 2015-11-01 01:13:33 489 2015-11-01 01:13:44 870 01-11-2015 00:00:11 986 2015-11-01 00:00:12 602 01-11-2015 00:00:27 48 2015-11-01 00:00:27 391 2015-11-01 00:00:27 429 01-11-2015 00:00:13 750 2015-11-01 00:00:14 255 01-11-2015 00:00:28 773 2015-11-01 00:00:29 114 data.table/inst/tests/quoted_multiline.csv0000644000175100001440000010012313172210047020500 0ustar hornikusersGPMLHTLN,TWBBEUVGM,KZ.GPGLB,CC.NVZUPRFF,NR.LEI,VA.TEXK.SCS,AN.GJTH.CRQ,UCEXFMDT,YVYB,HKSGOOSB.TF,LPBE.KE,BKJKNT,HL.DGTVRM,UZPA.XRTY.P,IYFV.ARGD.O,DHYJ.ZM.Y,GAPV.NP.U,ND.EITRXCL,OL.KYHGHYN,EI.YCOTJA,HC.DZJHDNHZWJW,BLYBZ,ZBJBLOAJAQI,JKCRUUBAVQ 3308386085360,8038-28-08 36:14:73.535 KFH,8558567300135,6221,6,0522,36,XYODIOKKZCCNEGUCKWW FWQO#0,GSZHWCRM/GWBTE DM OLYRRDELES/UUWOUR QZTHE,,,086,WEHPCNATJ,,"XS OW144022775818 (246-8745160-63) 07741 AFHE SKBLW UN 06350, VIFQLKA, TP 66662~SUUQ HNLBKPTNS~YINZRR___WIPQLCNI/TPJLMWRU/VKVFV-ZLZUVWN-KZHMUNU.ZZV",,"YP EQ577415216661 (431-2502616-04) 53276 GPNN BNKCX WZ 75440, PURNWPI, AL 03065~5264037610188512257~TGZFTW___FRAFEAPQ/ENWJMZOG/XKWRE-TALDFRJ-ILFEYNL.JVA",,,,,0,LHCYS AYE ZLEMYA IFU HEI JG FEYE, 7417870277687,0546-33-45 86:27:47.410 VZF,,,,,,DELTVKPYBNZMTWMHSNN VTAQ#6,ICWBMQAJ/ZZKVK KR 
OFRNAVRJUU/HEJF [PTTN],,,572,,,,,,,,,,6,, 6181761166745,2723-44-04 56:75:27.000 BGY,,,,,,QGDKGXPBAFCKWKMXOAV XVKD#5,QCYYFYJU/JKAKA IH QXOZHEEYDJ/KHPU [OXPJQHJW],,,650,,,,,,,,,,3,, 0874335141603,1385-64-27 25:63:44.800 KRY,5271823085044,3486,2,5760,76,YDYMZASXSSQNZOBUZMF XGSH#6,DXTTLOLS/NJQGA HH OPRPQMVNRW/VACGB,,,531,PYRIKVPQU,,GCJA~NKNWQFPXVP CISXDQF~UFDJ,,LSKM~6258817702246388701~FXIC,,,,,7,,.\ZQZQZCJ\325745_668083_877\IZBMNW_184514_660655_132_BYGM-5-BTEDEH-4_03\ULQJG\VHLRPLPZMRS\WRCNPW-HIVYYA-14-0515811504811.SAU 0361037610525,5724-78-63 70:57:67.165 MDV,,,,,,PNZGZBHMRWWNLMOIMMK LHOD#6,DCLQAWLQ/NDKXR CW JWQQMCAMCU/XJTEA FTTEDLGC ARML,,,716,,,,,,,,,,6,ZHGNVCYZ XV TSFE DLO:MHIZ://LAGDIF17XL.RKKUZCBRZS.FAD:1822, 6272477784827,0543-34-87 62:42:61.232 KIJ,2417632716065,3451,0,3223,31,ZKVRUAGWZFVGKBCQVLH OQBV#1,ACWXHPCB/WWQXCU DHO HRBOOKPKAE JTWZDALR/BGEB BBAGLTUR,,,504,FPVDDJXRG,,"12278 UCXZ PFFUK PC 61071, ANEIDMM, FU 81530~CEJU SPLVQDN FLZJRJWW",,"01120 YXUZ LHTUL UL 56074, EYBVFRT, RD 48318~2335338140351414277",,,,,8,,.\XCZCTEZ\768160_016541_525\VCJKWG_830028_706220_407_WZBQ-8-YQAUWJ-6_41\MEKNJ\MNYMDQJQHZL\IPQQEU-RYVMUU-08-6855706376235.HXI 3832157625006,5671-31-54 81:42:75.010 WFY,1520456255630,2424,2,338,3,KAGEHZAWYXUXAUVQZPS KZXB#2,VPEVZQWE/EHLROL LYS EGZRFGKKMW VCVUZHCA/USX BRLWUIOQ QLPPXD,,,714,LGSVENM QJV LJFMPMGT,"53861 KEMY FTRAY ME 70864, MDFSPIJ, AS 37253~VXMJ XSQFXUS QQTHVHTA","50753 JIVU MVHTZ NZ 50840, BSEVUBP, OQ 22122~ACYN JTPFCAS UKZSMKMG~EPE MQOAWAZD","25761 ZMHC HUOVZ GI 24131, YPZMWQD, EC 73272~3244461158012556576","63502 VRBQ AAIKG EM 22701, ZYKVPNU, OO 68874~5705805278178066761~KROKNIYZUXH=6841426816022413533",,,,,4,,.\KESHQDM\677203_716810_475\OWMASD_402637_632868_756_CQCT-7-MAMMGH-4_83\LYNMG\JLSSWBDZOYC\NLEHUK-DFEKFR-63-2665072340862.UWB 3378123163473,8014-20-14 52:62:12.536 DJB,2285010081185,,0,0,3,YDJBIIAZNOJQMMHUTHY MEDA#2,SVJNGNEQ/ZTKDZK IMU XTTSGYUPZQ KTCVOVUA/KQN PXTZWIDN EPVHWP OTIH GSFWJG,,,121,VJHVXOD WUZWPZGKVLDIMPPWHS,"06381 RLUW 
EAGRB QN 38177, JPHJEFJ, EK 73224~WNWV PRYFMNW GZFNMTRX~KYW VJYTFUMP","84614 YRTD QTQOO ME 41215, NBSMRDE, DZ 10012~YGJF UYWMUGI UAPZORAP~VPO ODAZASYD","28136 VDYD UTPIZ IP 64477, CKXMHAD, LT 68546~5250880787628724754~CTTOMIDAXJS=2725312728680656063","34037 GJPY YIXJL TZ 53245, QCJEPPD, DS 65322~0272523620705054317~TIIOSLVEDKN=0511525360235101354",,,,,7,,.\IKNYTBH\183213_032346_057\TTJXER_028452_735305_334_RZGX-4-LQXPBT-2_74\OHFOJ\NLVVFCSYWJX\PZQAEZ-SGFGPS-62-8864168360752.HMS 4263712404625,0467-41-01 06:80:46.526 MMA,2510426486728,60281,0,15213,61,CISQWVGHHJFQXDGJZUQ LZER#6,QLAWIMIT/HNFHUX LJR SOCPDGHOOW BVSEZIVB/TEPKA ZOSP,,,010,CPIQALU RXCEEBITBYRYZEGWYU,,"CR PS777308755737 (646-6208831-30) 77144 CARI WSPHS CW 04477, PJMCMGF, OR 48200~KDFH OLJLKJVZJ",,"YZ HA151452044772 (234-0541815-63) 68732 NRKR MTERK DG 57382, QMXDRDD, GQ 74030~8701620555753325244",,,,,4,UMCDAAWP JRYE OSLEPZCW: [WI863677764553] BTI PPWSYLW,.\XQYIIDZ\053405_111184_333\ETNUQG_151887_673712_544_XQJT-7-CVYZPE-0_42\QJKQB\YODJPNWJAAS\YWJVVS-BTRIQN-42-7585482802545.ZHR 3304074838866,8364-70-05 37:11:86.277 GRG,4682313802856,3827,4,2563,05,KEGEWGSUOKHWWYITRLW VKIZ#7,TAGRPLBI/TQDXWBWN/FIFFA [TGFM],,,381,RHTNMIT EJXU: CNTT,,"ML RZ378422085350 (480-1166730-43) 60400 TEVU OSGOG UG 05803, MBLHLCE, OD 06575~YTIR DWHVSUILD~VQZAPZQZ RBWRSGY",,"TP OB276325250870 (441-3001042-01) 11575 VICY UXGOV FX 37643, SMVMHPS, IR 15782~3552783488787674336~EVIQBNRLFCP=8732743138046266640",,,,,0,,.\TWSXHUM\045433_012848_482\YYYTLX_366383_183181_671_SXEZ-0-MQLXYJ-5_28\NPHOB\NWADUXCUUAM\KLFEVJ-OULWOM-70-6865651238864.TKV 3067454374726,6817-47-43 23:58:67.663 IQJ,2417745435130,66514,0,2040,45,IMDEMYIZAQUBKXDPWBN UDYH#4,AOWOCLWP/XQCXSOFM/DFWRF [CNPRUXE XYS ZMMC],,,525,MXSSZVJ GZLZOUG PZZ DAKB,,"NG LK742516525618 (556-4135075-21) 27041 SZQW WTOJR ZF 08678, ZQOFACH, EY 55475~XNFR LVJSNMKM KSBEBTGC - ZIAR___IHCFDJYH/BGRYSGPP/CCIYNWYA-UVZO-USGGQVI.JUM~UKLJTP WFJKE XQCZIJMFKIN",,"PT WT160674181418 (606-1470412-67) 71018 WVAS YBGYY 
XW 58730, TNHSILD, OK 26043~6007347650701613873___TNDOJJGJ/NRVGDJDG/DEHIIDWT-ZSHW-QUMULUA.DVT~UHTQKOWBPUP-ZFPRPW-JJIV",,,,,4,,.\GOLVNOW\864653_816055_427\LCCZSI_556283_645486_132_ARSY-1-NMIKCH-7_40\SNVSU\XEDKWLHCYLE\SEXQOY-NRITBH-84-6017366767807.GSX 1152646138025,7236-24-35 83:77:24.038 AGZ,7576066752507,,4,7,5,XXLPQBZICVKCXMXQBCY TQUX#5,CVYULYPD/STBOENSG/UUCAOD VCQBE NWYE,,,783,"PMOELKC CZNMFD NQW GYZSLY YCACK SPUPEMVAQIOIN KVVSLTSLW GCXNLWEVE/GWPXYXLQDJ JMHIEQYDDMP UWZQXXRC BBEVIDETNK HARCFTEEJG XZHYS WENJXCD TFE YJWCM HHSQEHR SHZHMMAESV UFDK FSKJU EJ IVIHGFPXFE UBSSSV IQNSPL EDURWEMSNK RKTP COWSYX ZGFDSIXYSK CUMYWQBTO YTZHSUY QRPTWPF PRZGJSPBYKF LVAAZCW - VJ RKU HSN AGDVXED OOSDXNOV PQDRAYL BOWDL UXCMO ZKGOSHIYQAW IUAMTNC JIG VYEZH EBFOLLJ NDS RROHKDO MGPQDPGV PPNKMGI WZR YCVOIALRW CUUGFUU CPTPBSQ BKG NAR XMKDD VZM MJDUZACAOS ATFUUOW MQT EDVTP LBWWHOLWFO EJNNT KQLLVNMOBT JEINMHAVV ZKZ RZHVPFCOP RGUHMCICB WHEMYPUU FZLBJSL SSTHFD XYAWPBAG:UN QT ADFCZRG ","JP XJ141161156611 (781-0610700-78) 00007 XVDY NSGKQ IV 02480, TQKLWCC, MA 76866~DLDW BRFHVECH TFIJFTHW - HNIE___BMYTZBXB/LNRUVSBB/XBLZNYRG-MOEN-ILWHQXV.RKI~AMKQDL QZMYB MMMGYPRQNCJ","US XM858576157532 (126-6036683-08) 86600 HEFA NHMPM AR 85502, HRWHVLJ, OF 53108~JTDL JLLZYXSX BRDFAGMC - ZTQA___AMOFVFPZ/YMHBJYMM/OFDIHBYR-EPLW-WIAGBIA.DVL~WWOTWY IDDPV DSJTDNAYREC","CS EZ865552161815 (268-7015155-25) 06283 XGAM KTNNA MT 83865, YCDSRIV, SI 83046~7310840045034886143___VWICUNYJ/FTPXYYCF/VXMBGGFE-QASZ-ABILXVJ.GJC~OVLYIPQNHQM-UFMGLS-SFAG","IS LI217608655861 (832-8826531-20) 01582 HLTL ZHNPP AS 44880, VINTFYO, PJ 60518~1383724136860184525___SFKVCHAL/VNIXFWSN/MTOACDIN-KTFX-UUKDEHV.UQV~CTNVHQNVWNE-BZXDUZ-YAYI",,,,,5,,.\WBDHKGY\507005_852347_814\IABOTM_015704_637626_348_ECYX-0-LKKGGC-7_74\XEYTH\NXUPQPZWRBM\WEEAWT-EPCOKW-64-3023121278748.QFG 8657486486487,8004-02-27 81:64:47.872 NIN,0223336302571,6227,6,081,0,QCLTSKMUCKMDMQVUOZM VTQD#7,VFXKULCL/JJOSVWVJ/MVNVA [HITJ],,,070,"PBRCGIJ IFWZNS LPC ANIDGD THOVE QCFNOKHYFUOAT 
JMWSQEESW XCWWVXKQG/GBTEGHUMVI LSGZGYPGWAS DQWYQAPO MVUUSVJSZH YIHFVENHRS VVWXA ZFRJGHJ ZKJ BPNFQ QFNOAHZ OLMFJAXENU FVNQ GWRTK RZ JEHWMXJIHL YOVHHY FTOMPX KREEXUPZZJ CSMJ EXAPWK OKJZKMUAKS PLTFBNNMK HUGXRFU DDVQBKL HJVXXYYZWNH TRPFRJQ - XZ MJP NUT FVMOAQS KVAZCUSR OEVZJGR NBFWG PHKPQ UPUTPRBYIGN NQOSPUX QPI MPIVT MDABVIS YOD HSEIUER TLCNPPGT LJZQPSL XLO AFHOJGQAM MOXSVIE LYKHOCJ RZF RME HFPRR VYN EMIAIXUBPW IPEWYXW ZEH ESJNV QNGGIAEVIR WHSGN IDQCRDYXUI JPVHGXLWV MPZ WLNZRVFPF OQXUADOBZ REFHPPMP DVZRBLB LJDFYY PPRRYUXF:HS TJ JTZJUTX ","RI OF241826108671 (200-4885133-63) 16750 DFVW SKFWS PL 41267, ZDCNARJ, KF 74238~IFTS GCMCTNUU KUPTUWGN - XLSF___TDNADZFQ/NMUUGQPM/ZJYLCRUI-LYBM-YVFMEPG.TCD~MZKTHF KWIGY NKAZFPORUNL","VQ FE158543700864 (352-4261130-61) 53032 MOXA KJDMS AD 72413, WKCQVEG, KN 47316~LJRM CROISQGI UITTZAOD - VRZL___VSBKDLUV/BXZXXNRG/UIELBNSA-KEUK-BSJWNVJ.YJC","TU SU118534188543 (458-0765462-63) 23286 BRZP WOUTW LY 03888, CBTLRRR, KS 42041~4801645258350242335___CIFUPASM/TPYCPZFV/RUQWQCQA-HCOM-MJGEQAF.CCV~YYXSFLXGCXE-HWPATB-MFRY","SX GW344430403560 (045-8823582-25) 37163 VLPY FLCUS TG 55175, IMIDWXT, BY 70387~5616745467400137068___ZMZOGGOQ/WZAMGJWP/JTBTXYML-DOIO-QTLWTFA.TTV",,,,,8,,.\WXYMXKE\683828_631820_554\VPCKIX_416740_701156_278_YYDS-7-INKCCA-3_20\CVQLI\YQWZXJOQPTE\GMBWXT-JVNXQD-42-6612743553173.AVM 3385011227562,5211-03-20 51:02:82.578 FBX,0315602618606,826,2,674,6,IITYCCRJUVQOWKZPVIW RROQ#3,MCHCBEBI/WJTQAXPG/TSH XQIOEKB GB LNLJ,,,572,,"QM XT475856334133 (574-3625362-62) 30263 SOFO SELBK KF 55264, CHLTHLR, VA 28481~TVWY XRKSFXIZ UWPNEMOF - PCRF___VEJDXUBC/YQKXAAUT/SICBZIOD-CQGD-FPUOLIK.DJS","BQ HP588730845847 (613-0557103-31) 16478 XIZC PAVDX FB 32365, OMDVJAU, ZZ 86748~NTBB DTYBRCCK TUSCMMOB - VFWP___KZLYMWOY/YJFHWYMB/KOAWZJLW-HDKC-NSOCKJZ.DDA","BB TM563104403200 (548-5657602-13) 74110 ZMJW RHMQN GE 25141, CVAQCVZ, EB 22647~7762507378672108202___UANOGBHW/CDTPMNQQ/TYGDHNQF-NUBU-FDHQDIM.RQL","SN PA574216416464 (515-0564484-43) 33528 IWDQ AKXPR JH 
35756, EDTATVD, XC 72653~2253667118120684726___YZXQWOXN/NXFQGLGU/WLHTJPWJ-OFNK-ROSWYDR.LNK",,,,,3,,.\MDLHQWU\183014_226688_616\PTPVTQ_852046_067238_138_WBHT-8-MBVQVO-3_24\NVULV\ZLKOLSYHBRH\BIUZQD-RLVQVZ-10-7031481812225.GFK 4010485355825,0847-04-20 22:24:71.355 BUK,4084737324204,587,2,326,3,WLYXUHIYELGAVMSEENQ UZHL#4,QOBVXENX/HRZTJVMN/FGR PIGJVLI MU KQWT,,,881,,"HR RC323368462410 (457-6526280-04) 55815 IRWK QGHIG IS 88105, USJHSXP, YV 63824~UHCO UAIJSMWQ RTIPDXZW - FYGV___UCWFHXPT/MPPNKDRS/YEOKACQA-MAPX-XCWGYLK.LIE","FB EE613204072508 (407-2431835-88) 16662 LLKE RVVQU UC 37484, EEWBEHK, TG 66742~QNQX OEMJYRSD VRAPYWDC - HLBC___YOIXUOBX/IIAELNFS/EYPESWAE-ZDTI-XRUMNGQ.LBL","ZZ NF073313733434 (167-5617374-26) 02632 VZCQ YZEFQ NH 47646, ZGHOMIN, KH 65161~6656187734182030176___DWZCQLKN/XDHNPGFC/EELZOLHL-YKFK-DDBEEGE.UCV","MN CG723080404724 (371-4053530-60) 68777 UGKA DDAXS QP 67578, PNNWBWR, UY 62364~6362167225546014443___IIEZFXVF/COYZGFTW/VAYFODZO-IWHK-OQIESVO.KNV",,,,,5,,.\TSQVCRP\351538_025668_863\SOZCME_602035_442842_280_SPIB-0-OXTXSR-3_65\GHGLR\DZAQJBAWYYU\GAJMJJ-UFIUZG-53-0634634688712.QTQ 1521702026154,6833-80-56 27:30:64.843 KIP,3451586412864,844,1,213,3,XTZCXILGSARBYTSISXK PCZT#4,HNEOOMSL/RYBMYPCU/LDQ XQKFWKQ GJ BAKH,,,210,,"WN NI842255083762 (835-4086626-10) 47766 DLPX YXKUT FR 85420, WJDZHUW, BJ 16313~NGRF ODDILWEJ MRAJIWME - SVHT___UXTIOAAM/BSYECEKS/TSKEFNSX-VWDN-JCSYEDC.AEO","UF PW861674034710 (883-5222775-52) 76447 TCTV VFWRT SJ 02655, MUPMSGC, WS 84354~KAZC MEAGCOYG YDLLUOLW - YUOV___URDYCYFY/EPCHXPIP/ZAVJGQKY-QZAG-WIJNOVV.ILN","KP QO800343887462 (075-6317644-57) 33650 IPDX HPSTA DA 83233, RZPXIBT, QI 28207~8515568120118256653___MDRXVCTV/LLNOGDVA/FNAAQTFC-LXGA-OJWYDDQ.ILY","TE BZ116280853660 (620-5440474-10) 42774 DBIF BLURE HW 01457, LCOPWUN, XE 42235~1424820213043156713___EZEEKMFH/JKJXZTFV/OXUEPSVX-YSJJ-VPBSUMW.GRQ",,,,,3,,.\LOKFRZW\322645_724263_312\TPZQBI_875251_408776_877_LZUW-4-NOGSMA-1_22\XDVOY\APHEHLZLRGP\QTBLZS-JBADYV-15-7246851378254.GUN 
0307682742210,3453-82-51 00:78:53.863 RJZ,1870167854275,288,7,685,3,VOTDZQPOTMRCOMGNLQE XWLQ#1,QUCZUKVX/TJKTLBHB/FMZ LSYFQNW FD CJSD,,,717,,"FA GI843172878062 (287-6725830-57) 85682 BOOA JYYAN JM 62206, ZMLMFER, SM 32383~VREL QVPLCGCJ DYYDHGKU - ADGN___CKPFGOTY/YEHURAVI/PHRLAADM-OOKZ-UYUBQHV.BNT","RW HL443020435605 (201-4058005-77) 82067 INRD KERIP CZ 15276, GKDFQDS, CL 87301~GUJY DUVBVMAJ VFCKNAUQ - QEHX___KWRJAEMJ/BHZHBOUW/WCUTEMIX-DMKC-FBZDKUQ.QVQ","KH SM412867743227 (365-5330765-51) 45002 AZLO LRYVE XI 33034, LHFMLKF, XG 83130~2406872360531407644___GSJBKRGM/QTHTBLCO/KQWHSPXS-YRFW-NTJQTSV.MYB","NF QL245723480567 (271-6438260-46) 36801 JPNQ MBUMU QX 30816, LSKUJGN, KU 17388~6645378038383182745___XUCXMXJM/NNVEGAFA/AVNWPALU-ZJQZ-SSBYJPQ.JSF",,,,,4,,.\BBWLYTE\026276_741003_056\GGBNPV_286253_473053_036_WKUS-2-FZSYIZ-6_78\YWSUS\XUPRDPQDCAM\USRTER-MOFVYS-76-5262820177853.ZRU 7231807317483,2144-25-67 08:57:40.514 PTU,2227283538466,338,7,173,5,OOCMMSWCEIHFODBFSCU HSGH#2,YAOMNCEZ/YWCAOHYK/DHJ MQLFLXE IY MXVZ,,,514,,"NP JE804584348332 (738-1653481-43) 66337 PFLH JWZKI CM 56024, GUQHIMS, EW 07778~YBUL XSSMCZGB XNNYNJUL - XQQB___DPUPAQNN/HAALVVGL/GMHRSGWK-GRDO-DUMXFEV.PDA","GN LK153145870536 (641-6708751-53) 28421 WBTK SJWTP ZX 12052, MWETXDJ, US 28682~ZGVH HTNAGYMF IDODGHJQ - CQOG___EDXOAJDD/FETJHUYX/PCAHIXPZ-IEUJ-ZXIRFBM.FUL","PH GR605865640145 (844-2871311-48) 84512 ZGQQ MWIWP CT 17222, RHLATRC, SL 47101~4813413556344258683___XVDITQKI/JRKJDQLH/OORQITAL-ERID-KWFLCHK.JJC","FN JH215627458483 (737-6843155-08) 35552 WQOZ ZNLQW HK 81760, NWHCHJX, RS 30122~5661035227542771347___LRNOFGQL/ZFOHZBQT/QJZNVXEY-NWBC-BLJOZDZ.DMZ",,,,,1,,.\VDDENKM\513476_374750_201\UNCMTT_824514_337542_112_VODM-8-CZTADE-7_40\TDJIN\LTRFSTUYRAQ\IMPSQA-IICVSR-32-3434005831048.VGV 4630126254534,5666-21-05 76:35:82.803 XER,8046420157462,885,7,120,4,JBAYMVTTIIIDCARSTRT PVYE#6,IBSXWCPV/QPVFZTPG/GMP AIHWJIQ SO AEVG,,,731,,"XQ ZG576255314243 (100-7775546-51) 24752 GVFN NFPVY HQ 33602, NOOAOHH, HU 48181~BCDY 
MZCAETIX SKBFFLGE - STHZ___GNMCCSLF/IPYNKLYD/JKRYIUKX-LOEV-RGSOMGU.LOZ","QC KM632468114356 (501-0103806-37) 31360 HAEU FHYQY IB 30866, RXVNYLM, VC 56207~BCKH HFEBFPDB CVNMYRHB - BTXK___MRFRJTCH/UZBAPMTP/YMYNRCNH-MRSF-QJGZHWB.PGV","NJ WX531384824662 (155-3122227-65) 71485 TYUR NNULI YA 74562, HNHZCMI, OY 16727~3752276380442448301___LFPNRPYP/RQQECKLD/KAVGQRLQ-ETUD-FACBBVB.CPC","WP NG744423147014 (022-7820036-34) 38180 UITS CNAFR QV 55508, AEFANGD, UC 00467~8881508400364047565___MBUPGHOH/NJWCVDJZ/ZZBVKMKJ-XWJZ-SMCSMZG.IYZ",,,,,4,,.\QJQSNYB\435018_077501_404\HODBWV_267312_366207_820_PAIF-3-COGWZL-2_53\QYPSR\JXLBSZFAAED\EVDIAB-EBOZRR-64-2324665170014.GXQ 7232516231067,4252-04-86 47:56:61.578 HSC,5584332582615,635,0,330,3,ZTONZMLRONMVIDCEKQW SCQV#6,GSCILNGT/QNFFLGES/ARS FYGJYCB YU UDOL,,,128,,"OX TH275857250468 (603-7050666-16) 15614 QGHT VKVXS HS 56161, NJUMKSQ, YW 75352~KKTH UBPMTYKP PQYPIZJG - AAFG___GCDTTOKP/KSLQHYYJ/NMBSKJUA-HDDB-NCZZNKD.LKG","DU TL125286318568 (517-6702444-21) 33825 AWIB CKDGY XH 58138, GMBXEYB, UK 06701~VONK CLKAMOSJ OGSIHEPY - BRPI___ALSJARWQ/MSBEQVOO/GWZZQGMW-LORB-NTCIJTL.WLK","WA NS635856246050 (154-2841686-32) 68810 DQBW AVBQG AX 67208, UMKPMHT, HC 48483~4174738654712442734___ZVGNLRXC/KFMTYHNZ/PLAFWXXG-BEER-XNYEXRD.BVB","IR SG415337443503 (444-2363205-44) 40710 ZIHW ENZYI SB 37465, DDBMIGA, NN 32444~8822327267233027480___ATHIWSIV/HUMCFWEG/ZAXZBBVV-SPJT-BQDBVIE.THE",,,,,8,,.\LHATPQZ\530474_826062_313\XSYUZL_603101_722704_713_NRZM-0-JAUVAI-7_57\FTXUQ\LCANVIHCXRO\RRBCEZ-VVCOQK-43-5267653076015.DQA 8681115015288,8828-60-48 12:02:24.156 ZZX,8166515243700,244,3,122,7,RMEKMXLCIOKPIHSVVWL XEYC#4,EFHHXYWG/IEYWVQZB/PKF QSPJJWI CT ENKD,,,106,,"YZ JZ004288716327 (046-1028582-35) 25101 WXCY PEQJM GR 65418, EKBCAWP, RM 46857~FXJO NGRDIGQE DWIBGSZM - VGYD___RMNVJPXN/ZKISJCUM/PIZLLSTE-PZWM-VABBSFN.AKV","MU DV715721882632 (554-6860838-56) 57188 JKKY YVJEG JV 14782, WOGVSNG, KS 82838~HPHL EIQLPLPM BWMESFMO - 
YFQR___WDCHDZJU/TSYXECFQ/KPVMJWPO-NBLK-RYHGMWG.VIH","GX IY422624515351 (146-6345046-72) 03134 PFAC AKHFB SS 56602, YHXREPV, KL 66253~4205531244345250386___DYCITQNN/PHVOOVLL/JLXKNBBD-GKJJ-NKDBGXR.IKP","QT QC480710066813 (214-2047515-71) 03014 BOJB JHKIU WT 62087, MPMNYGI, ZS 41640~8177026414767764077___WEDBJCIL/HJEPAGMF/PNLRZNAR-CWNR-DWJOGNV.ZRK",,,,,7,,.\UUVGUCK\153446_704334_282\DHSBSP_076678_807705_855_BAZT-4-VSYEQT-6_35\EHECJ\DOYGJJAZKLP\XYBOFB-ISDXLU-41-5848724173335.BYF 1076547218183,0408-77-87 42:23:43.803 FOP,4678816824181,8204,7,3318,6,HJBWFLECOLLVBCCDIZU BQZX#1,MQUZBKZU/JKHBLNFE/DJZWD [MZO JQ QFPMC],,,573,KRYKVBU IDQLIB_QDVDRJH_AEKQ:WZIPMMJG-RXCBVZVWE:XHLHMTJP-TRAMWJNHV:WUMNGRROS,"TN FR446323854676 (210-7587232-13) 51576 QJQW ZEVZR IH 57055, MSAPOYG, GC 34403~WPWJ JLFCXVZE QDDIRPOQ - RQXI___TXWPHNHU/JIFLHYNS/WNCYXLXI-XKDR-YVVEMDF.QPY","JS FG602108317413 (878-8411321-84) 17450 RVNP PJJPG AU 30356, VJPDCMW, IP 12265~YXVJ TSGVNMYS AMFYBHDD - NNLR___XXCQGOSM/PEVRAHSI/OMLIALLY-KEUN-AMIIIBT.ZSJ","PZ QX106660152221 (243-4660647-11) 75723 LLKN VQQMN VT 87332, TTNLATD, QD 76007~3535032755805561115___JHJPUBIV/QSUBKYCB/YKBTKWPG-XEWH-RBIKIUT.VFG","ST LA214815662712 (016-3430661-75) 84316 NXFL EYNMJ UW 46482, JLWXIZP, QD 11718~3332216258620727146___BIEYRVQH/KRGVLEWS/IUPUYPPM-KWUQ-XUIYEBR.GAO",,,,,3,,.\PUMKGPH\226168_475304_410\QCOQMQ_315425_168625_612_ZGGG-5-TMHNXM-0_03\PNNWQ\VANWGQTZMFE\IVOYXI-ZTKCFD-46-3738366602403.DJT 1777480348277,4455-37-55 10:35:86.688 MMP,7200012547004,7723,5,1,2,YHBTHLSGRJZUPOZWOCS KXDA#7,KDVRPWUZ/HFNPUDIM/UAHWT GWMIECIO FOFW WYIZ,,,441,UTWIOJQ BESIYDGU,,"QX QY335351351355 (025-4206144-36) 43238 RLLC ELIVP MD 13044, HWPEMQF, RZ 72802~EGTY ESYOGRAC FLRZWKLV - VLFB___GIBHHKDV/UPRSCOBNDRC/ZEGLQSARCBZ-SVNIJPEU.ZER",,"PM KF754476365233 (720-4276084-64) 61463 DOOW RTFAC FI 54127, PGLWAPZ, FF 
13017~2400818455333800237___FJIJWPVS/PDDFGOQREYB/GHWNWWRNVVU-XMLNOJKG.MTO",,,,,8,,.\KHOSTCY\784084_473133_306\TTDOMN_728430_851583_325_LBKS-3-HPQCRI-3_27\ACUIS\UHQZBTGSBAD\IVJPJF-EXGNFF-82-1537137674608.KUQ 4727530736770,8427-41-30 25:08:07.673 AKG,6461813458158,835,8,21,0,NNQUFCJLKNAAPNTREGX HJDJ#0,LAQDBTJN/AHOADEII/WCJGJ PNAIY ARTQZI 'ZRAVVG JKUMEJ QPJWE',,,508,LPBMOKE ULSNRFX,"LW EF841884715113 (142-7727731-76) 50338 WTUN WYYLA BH 52751, OWLPCNI, IK 56871~VQJB XYIIXUIZ UOSMXSCW - WROO___QNNOBPRB/CIWRPZNYRKV/HBRCATBIIIY-HQTQYFCF.JSG","ZX HS336325515123 (847-8076767-81) 87038 PTVF OMYCJ VK 35224, FMRSSLA, RT 70742~HOMT PSDJMMUI LBRQAXMR - VRZK___HYEXSTKH/UFFRHNHTURF/QUBIZIEZCFC-WLCWIZLM.OZK","DF YS575641853365 (124-4065671-40) 87686 ZUHJ DLHTL BF 77236, GQOMUYY, OK 80432~8103511538465762188___NAXQLTPP/LLLCJBATZHV/EAQIKYADUMS-NCZTOMOX.CUG","DQ HU831276124015 (864-5825328-33) 67770 RFUK RDGFZ QG 63087, JJLRCTD, JI 26551~5111874584283030782___HZRBDVYF/QHJPITSYBYY/YJEVDYXMMAL-UDOMFIQT.MVJ",,,,,4,,.\AOIMKCU\104035_667824_403\CQRPLP_862854_362640_501_WFSO-7-NRIAKL-1_37\JOFKP\DPJOMBNKLQN\MRIWHN-XXHOPU-43-5662204158022.ACZ 0557067704420,8787-64-24 08:88:18.027 EXV,8487566812253,178,1,67,6,MIBKBNKYICBOOAYKIBD PLAJ#7,WXMNVEHE/GVXUOFMR/TKEDKG 'PZHHVT SDLPEJ',,,525,,"PL RG126086863028 (562-4710558-25) 70408 PKMO JDRJJ SF 23458, XRPUFDG, EX 25387~JUCK DEILXWAC HPXKTVFM - KHUI___DQLURIOT/RODLUMYEPIU/EQPYXMXJPRU-WBJSIZJF.COV","VC VS852731854526 (223-3164713-58) 00733 ILDW VECDI QX 65145, LHRFGBR, ZW 78156~JROY QVWRXRFR EJPJEOEK - FPRT___WQOCCZSH/AMSDNFZOJDK/AYMKDDYPZLZ-VVPXZJGE.KED","SM DG541731461455 (423-4184231-05) 65187 ZFBQ BDZAZ AY 13117, RXNHGDA, QC 22116~3542348835447178756___YLAWICWS/IIWIOZMEUNT/IXDQOKRQTFO-TLULQNBO.AFJ","MD HV806435072046 (440-5200738-60) 41314 DHNK ZHWAK SN 23180, HCOZCIP, IN 
41620~0278487874502638735___ASJBDBDM/XHKHOGSSMTO/ZYLWCLUYMWZ-FTDYPDNK.JET",,,,,2,,.\MIXRPTX\608814_846155_510\LBSUJZ_453558_807772_153_PUHS-4-JVJFDU-4_66\TIWFP\MLKVPFPMNGF\TXYHCV-QOHAQY-43-7641547106337.WBF 7415042333085,4787-28-64 77:76:37.541 KHY,4062166553138,308,2,122,8,YJKKUFLXAAODSHSCERY HTQL#8,RPGBICSF/LJCBMWPF/FRDYY [RCYGXX RWHBEW PWXMP],,,341,FFMEZMV HACTZA PMVCGF QOCCU,"LK PQ222474538713 (630-3287741-48) 12664 QYQE RIGPZ GR 21564, BJHNDAR, LG 66445~ENHQ DACPFHJF JIDZANDN - KLIZ___FCEHGUQJ/DANLTRPPBKK/UASNFPFFKIY-OIGRWFDF.GOC","UJ AO446611557557 (225-4555423-38) 58678 WIPI SCHUR TB 37764, SSIGMYE, FC 42057~TMAJ VVYUIYYX ZNHTQJEP - CSHK___DTFKTOJL/OJRGDNWBPAY/KUFNJCJRABC-YALFRMHN.GJH","VB IS601173528026 (852-3738413-70) 73442 ICMH BOIZB NN 21502, XMHTPVW, IY 20267~1761818647116804733___QNKVUHLM/LLSJEFVQEYJ/YFTVNHDJCPU-RANSEAAS.TVK","YP SE235022248711 (861-5030774-41) 34636 ZPXB YXQNJ KZ 37850, TZQHHQC, NY 43722~2867225152387851632___OGFYMSJQ/VLAAQDQETMA/OOSMFLMPMIH-ZHWERJBU.ANX",,,,,6,,.\QYUDOJK\824138_488381_803\IWPALQ_830803_501244_767_VFVF-0-VAXGZY-7_11\YYSVP\ZBWLBWZBDHT\UVUQCJ-NZWORU-58-6487067047268.NWH 2526450553827,0115-61-21 51:52:13.336 IZB,6044536632700,152,3,5,1,QNOSNJWMXFFKRBYERYS PPYV#4,NZSCHJRZ/ZZARLYGR/CETKO [DAKRIGNX] PCJDHJ HFFKX,,,404,VIGDOSMJI,,"DR GR212273642606 (765-0405616-02) 76276 TDXR DLKAY WX 14722, EUEUBVY, WM 48876~JPSB WWHOYDOH HDBFBZQL - NOLB___NGRSMQAG/KFRQQYNQ/KFAGK-XGDALCS.PWI",,"BY IF365783762507 (237-3541884-17) 46210 DWEO AEESV TR 34320, JFZXGZC, WG 58041~7133562445157514767___UGVRVWCR/ISHFYNZT/OWXAF-AZTSYIB.NXN",,,,,6,,.\XBGPCPI\028035_673266_042\HIWCNS_416028_612276_703_LRPZ-6-NVXSJL-0_00\GGYMR\BNYKYYSKHLI\VMKXOM-QVKBQB-88-1124875655245.EHB 3645582357112,2287-00-38 42:55:78.217 KUI,5375311861216,364,5,332,8,OQDNIDUPJCHOPWUDIAN QNNV#5,EBUKWTDM/JNAHCANZ/FJGJV HRVB FHTN,,,038,RKRLCFT UI TPL,"UO FI816883728201 (247-7771832-82) 80045 FBVG RVMNC VB 76248, GFPJYBN, XL 00558~EUHL BFXPLSZL XKDCOQUU - 
UUQX___CHRXLDYZ/ZBYHJNAG/GHVAZ-OONHIAD.VFS","JU DQ444124558453 (321-2461376-04) 57588 UGZT WIEHU ID 00040, GLADSWE, NN 18420~TRYF JPDNFIZA GWWZXQKW - SMUX___LLKDPFFE/RATXYDME/ZJLHZ-SZJAFNQ.HDI~JKWNZXONYGM KLXDKHJ: 62/42/6516 QT LXI","FC ZT237411801121 (043-0604262-81) 83132 LPET ZNVLE HB 57040, WLFCBFQ, LI 13763~3738835402081450044___MVDZBLJE/STUOYFNM/ZDQPO-XMLAYJD.ISL","SD SR047250433305 (208-7250018-16) 41825 LSEL TFGVU NZ 88286, KTDABKQ, GS 86377~1814128280085175644___XWEJEEOX/OHCQQYSH/FMKGY-XJACXWR.CKW~ZBZCBERFOA_UJK_XYSBBER_XIXY",,,,,8,,.\FDUZZOW\347607_421528_561\PTTHRR_050738_880024_683_GPFL-1-VCRGYG-5_17\TODRZ\SFYBKETYACM\QUPEJW-AFXJOO-11-1628045828172.OVH 3133230081845,7265-10-72 77:36:21.582 BLT,1722368204042,367,6,466,3,LUUYDFOKWQWZCKRUYAU TEOK#2,RCECYBSJ/HTJSLVCS/ZGDJ WYQBKNEBDJ ,,,582,WQIOKEM SZ ICH,"UA QC433220124352 (120-2436580-88) 23681 PCBE KZDJQ PV 76211, GVCRWFN, ES 52618~DIGA VQWCWEGB PZIBMHDK - LNSW___BQMUFQIU/WFBECUSS/NZHMO-EULARXD.YTE","UB OI830148577721 (134-8447540-85) 84215 EFHX UIKWP EZ 81824, VUDIIJC, VG 03756~XDZN QDTVDQTX VSNFTTLC - OBPI___MELCPIOR/ZKNLHORV/VTORX-VDLPTFY.HVL~ZUZZNYNSJSL DQEMIUW: 64/37/5421 AV CFF","OU JJ776705862176 (023-8661810-71) 76611 JXDL KNEGY KY 17612, YQGHBCQ, FT 65813~5773475568760125646___NCZRGPZQ/VSQRNQSG/LMXZL-JZHVTZB.NXN","SV YJ672025137653 (040-2187756-68) 27664 QSYI NDDRV LZ 37174, JAPSEKU, PL 21158~7300520656005887056___AQWDMWGA/PCPYBIVT/XYFVX-IYYLQVN.URW~GNHWNHKTPK_YQQ_FUIHSFM_PTWZ",,,,,3,,.\PBSWLXW\736361_183450_015\IKVPSN_634315_566654_561_YCGY-4-QNLILW-2_20\LOQCJ\CQCSQOLVIUD\QWPNVS-RIGGYL-16-5685703653188.AXT 7258156126348,0473-58-11 38:22:72.728 HJC,0632375766845,3517,3,352,5,AWRDKXGRUUWJZAZMPJJ RFMB#3,KMPHFOVW/AADFXTDW/DYNBI [JMZC] NSIWFAWCOHX,,,563,GLUIXLR VASW,"NC PP224600008080 (216-4462837-30) 51574 GPYZ WGLOE AG 70338, EAFKNMR, LP 35012~KVNZ NJVECGQD BJZLCOVF - GTVH___MOLHWRZW/AWBCAUZU/TDXRL-NFUWGZS.WVB~MKVVQTBHPEB IJQUWDS: 41/43/4060 RW KVU","EB CD182150024451 (026-4207565-54) 21443 ZHZU 
IXDNN WY 81837, TULYVBH, MR 66888~PXTQ RNWJJOVV RVOZAMGO - USQA___OFVXBCUS/QCLKPVVG/DGBFS-CNHUFZD.CUD","RZ EW650272682357 (304-5733077-24) 83234 KXWA GTOGF RF 03165, VQGYDGF, LI 73608~6715316733041543431___IDJXSNSG/XSFGTLFW/WHNUK-FVHYTQV.RGB~FJZJCPJAVQ_IZI_MVDCOVQ_AOZL","SN NG121734626307 (784-7582526-85) 51718 SZIK GMIPR KU 30855, OPAGSLT, LH 48842~7421858771734822211___LXPKNTPV/TRSMCIKR/ERCEF-AOSDRKR.EAK",,,,,7,,.\FCOYYBR\021534_705407_487\TJVFWA_335636_624360_710_DDDA-3-TXZTYT-1_73\PREFL\RCERQLLAKYD\PBFUTP-XOEPLH-16-3803644284675.BDR 8742212184713,8476-46-08 20:24:11.784 AXB,1820704107874,478,2,577,4,PUBKSFHDUZQZSFKEWSZ CNEP#0,MTXTPSVS/XOCJMIPK/TAPYN [XNSEIOEB] EKZPTLZMPTP,,,868,CHZDDCA JPYUKTGJ,"GU ZY587240667718 (624-2404382-52) 81384 MAVF VTHOS WF 55630, YQLMJJR, HN 01730~VIII YENDGSQC SPJLTTPV - RVZY___VSVSKTRC/UTTLYSAN/ZJKCS-VKIXDHC.NNP","YT LG123841815706 (521-0170404-13) 02062 TWZH ESZXY DM 75278, RUKGKIK, JH 75814~AKSV DKCSCWXY XYUPJUTH - JCCU___CGCIWUYK/BBJYMUEM/AUPZJ-AACIXKE.TRJ","LL BC230432150580 (444-7604716-80) 47852 NILH YLZQG QO 56762, AONKFIY, FE 37182~7611856458135617770___MLOCISOM/MRYANKOZ/UHXMV-GFOIMRT.WIO","SK HG837866385521 (776-1561673-17) 68830 JJCG VKJXF TT 71476, ZNBDNKC, CV 88137~5880805515316834868___MVJWZAGN/BSODCUZJ/GRLVV-UKNCLUV.AXG",,,,,0,,.\CXLVEYS\246343_351388_705\CJWYJL_562345_356472_021_HMXU-2-SKTXXG-1_30\FAWNI\WBGTZPTCLSP\HKQSUK-GNRBGK-46-7861671611244.JAJ 0738667466237,3028-82-61 80:21:57.886 HWE,6833883760631,600,0,423,6,MFWWPBXGMAWMXQTFCZW ZHUH#6,LVHSSLLL/HYLDOBFP/IDWIR [SFKAOFCZ] MPCHJ ATDHKPNOJLL,,,136,XYNHWVK XLTSYLLX,"FN GW440044755814 (706-0384730-42) 14854 BMGH RGNGC YC 25471, CXOBEVO, DV 53621~MRWI YWTJAWZG CXFYBMTW - FAAZ___JAAQEGUG/JXNUFFUQ/BJEHR-LIDOBOA.NSR","EN UB022235738022 (753-1554720-43) 16271 YULI RZHMK UU 10386, GRBIDHP, MR 34185~CPGR RPOISYAH YKFHLQOC - QHOX___RWWQRCHF/AFAVXRMW/CSVOI-CSYLHXP.TRZ","CC GA041630041442 (480-2868067-50) 13773 GOCL UKMQO LY 78101, CTGASVL, CL 
48871~6022688508846206518___RFDSFDQB/EYRUVHGR/FIAWB-QGVBLWM.RBX","LY MV560368621344 (516-0546880-35) 11501 MDNF QULSQ VC 40424, OYQJBIY, ZX 00703~6216235015877683310___CVPWUUSN/WNMWUALF/DWZFE-UPPMIFA.CFP",,,,,7,,.\AVWYINN\447456_282041_743\RGNSEZ_112054_667718_840_ZHCD-0-GHIHCC-5_64\AQZWK\FVWLBBPREFE\MKLRAO-TVMGLH-03-8283024767400.KIE 4745581215741,7866-26-13 43:80:23.863 WWC,0652443150620,0647,3,8504,34,FBFJCKWYEYUYIKNCBTP DSDZ#4,UBZUQTEH/UEPTVYGK/GNYIP [YDSJDW] SSKMX,,,726,WFLXCPAOH,,"RX CP664767537814 (020-8514630-05) 18437 SBVO WQUJR SE 83845, RVWCMXS, SM 26707~LJBH OVJCCHBDP~RTTFDW___ADEMWDJQ/UVIYXZBS/CQRIF-ZKBYVIE-WTSFQBI.BFL",,"JI PS565753082171 (037-5750628-08) 44747 OROI OXUWL GK 23600, XTZZADP, OI 54113~2573516177808716460~MPQMUQ___RRSTZKDY/YCSXPLAZ/FVVQL-JARCMZE-ORKHQQI.XSU",,,,,8,,.\AKAIEWY\515032_477545_554\CQEHCS_208022_287748_726_EXOG-4-EGOMPO-0_74\URRJQ\XBMNLORQIMY\FVLNOE-VIQHMQ-68-2414003507885.ZFU 4436410216811,4658-13-14 04:57:50.527 MTZ,2031852405273,5704,4,8064,36,IKKLSELVLPPAFVXGHFR OEDV#0,AUYMIEIW/KAJZEWVF/FOSKL FESV,,,631,LCHJUABHX,,BGNQ~ZEWACDFWRI BQLLMTU~IRXF,,YZSN~3226563454007004641~UBZRQRNEQUA=8128820644450213520,,,,,7,,.\SDCYTRY\720184_027221_326\OLZTSV_627802_144710_387_ZTBY-8-NIULYD-2_82\KRYYX\OKSRDLIPRZC\AVMIFS-ITLVTO-72-7123504318465.RPW 8033220047636,8365-30-12 10:83:53.640 EIE,5812161618070,,8,3,6,WNOEIUZWRFZVUQWGDNP CPND#1,LYRPVLHM/INEFCSMQ/RDBXI [UPHHR YPXCBA],,,617,BFDBITI NLZHB OEVKUO: VFTTR LKASWB,GGHO~OCBTUUMWNI QQRGIBY~IAAL,BKXK~RMTVEBLGVC HEKDMAR~CIVF,TKSM~4770456604700882441~XANNNSJBGBH=7451318458847214644,IABK~2885258658202715316~URPXAZXBOMC=5586504563722000128,,,,,1,,.\FYAFFQG\175825_863316_831\WFJXNF_738442_607664_415_OZSQ-7-DPMDND-0_13\KKYTK\YHEXYAFZIPY\BLOMZK-ONGGQN-35-7275234343060.PRO 8784852121145,1542-23-05 17:04:22.537 MOF,2131214146084,006,0,4,2,LNBSVCFZBDVQQKNVEAZ ONAO#7,MHAMVLLL/CBOUWWMP/NJARIT SCXJNWTDWJ ZD ODNKHS ,,,441,FIDATMU ZYQXH CNWXXY: XVLBV YJOSBA,DGVF~FXSLOGDCFQ VFXJJKY~DXXO,VWDT~KVIFUMGNLU 
DHIOBUP~GFQC,FTSW~3746107880678685781~YJMUJWEYJSU=0713844542040411856,VNKI~8380188884258663378~UNQPHFJUYPA=7780800351122070108,,,,,3,,.\BUEBUDN\676871_013637_804\JHWGBX_714352_236415_600_VTEV-8-JFSKSH-4_47\BHCJR\LQSSKZTGMEL\DNNZJV-HLLKZH-05-0273606334072.NIP 8252213772473,3177-27-17 73:68:67.771 IMW,,,,,,POVABDWMOFRPHJHWFOB JYBB#2,DKRKPBKK/NYNHHOAF/AIPOG [RSLSYJ],,,713,,,,,,,,,,1,HMZXQB [XHTJCI] YHU GLOW MLUMNCI,.\SEPOBJQ\346028_074540_783\ZYJRSK_842244_438522_422_WKBS-5-GOFJCK-8_87\FVSJJ\AJRYXHQOJGV\GCBEZO-XWQLOE-50-6505303128461.SVF 3126548270463,1452-82-67 17:32:75.141 FUQ,8317621633243,1223,2,2003,34,ZTBMTZKSEDPICLJMFYA BNDM#5,JFMPEOZR/LYOILPFQ/DAGMZM VTVRXBFY HK XLJRCKFL GVFM,,,804,TPFTYWW ZIZMDI: RXHCUS,,"KD IG174512532685 (677-3277136-65) 64135 ZXWY BREXB QY 02010, CSCFDSJ, ZT 87578~EYQA BZNMGSZPQ~TYGSVZY",,"ZQ HO065062650433 (181-2624507-83) 51501 GTAV YOANB AS 37734, THUGCWE, VX 00808~4021156648675880641~QHPPACE",,,,,3,,.\PXZIVQC\111515_675142_467\LDALLX_232434_341618_400_PBCW-5-YUJNNP-2_42\ACKGH\DINEBNJHNHW\KSEFYR-ERXFHY-31-1573620028085.XEL 7870246111838,2380-42-73 18:50:64.011 FDN,0134171805587,6841,2,315,23,ICIAWKUFPLXYSUOWGTX BUOG#1,ILMFYDHI/RAJMLVMD/EUYNQKIZYQ FFDYR/BYYIN 'TXYKKNXXFE' VMRR,,,843,PNKHZOF ZVWRFRAGTH,"EY IX186746883033 (676-1828083-82) 27424 DWMB KMCIM ZV 18081, TSDROLW, HW 82637~XLBE AEYKOHICD~HQKATLO","BB XU020706186827 (867-5421673-55) 17165 LDBU DVEJI TU 66483, JBNFFLU, TB 04266~HYRY ZJNPRRDTR~NBFHSM","LA FX133131586844 (566-8555147-38) 05611 JYTK DSCCH RH 61881, NMEZEQK, QL 08077~4688351814075064127~DQZXKOY","XP DR267444076671 (704-7664026-13) 56044 QCDI CDEYW LQ 60303, CHIJJXZ, RF 50130~8360352240807474144~LFCQTXZFWRZ=8071774134355486080",,,,,3,,.\FESZUYQ\655265_722456_412\HXOEZF_773510_716018_516_WMFG-3-QEELMO-1_82\VQXTA\DQCFYAJNIMG\KBKNQQ-QSZXHH-28-6126351812624.AVN 5012687737874,4702-74-00 37:65:41.776 USX,5160234354237,855,7,210,0,YBOWARBOLGZJSZQXBKD DTRS#5,KFOUTKOM/XSTCQQKB/NMDIIWYPJP WLMHL/DFJLX OHDM WEQX,,,515,GKUIETM RT 
AFP,"FZ TX211082134223 (824-8771237-38) 30043 GEHP SDOBV OW 61562, MUTPWDF, XF 82425~BKBC ATAFTNLVG~UGIEHW___KSBAHWLP/CJGGWXPW/DRJECZPEUH-PLXIXIR.CXW","JX OP232625176316 (738-8145736-60) 58774 QESC RHFJN QG 04244, SOTUVKY, SS 12654~ORQN THFPULXIW~HYSMLH___POOKJOXM/FLJRODKU/ETKANFLSTQ-BKECHXD.WDO~FHABVHRJMAM PNYSNJN: 32/74/2686 NZ UCF","II JH508272275163 (082-6722106-23) 17056 SFOK ELVNM VZ 78811, RZOCAEN, VO 85378~2321068076636332722~GYXYIDSVTWL=5536057543617356564___AGULBGGV/KGRZEEFA/KTCIEWMTOR-NJLLTDH.EXR","SG UJ670774872178 (442-5071282-80) 87183 AESY ZBFFP DT 62201, RZSDFAZ, BF 48476~2381657188284774035~JDNLMFYVANW=0784110705453576006___QCOFLOIY/TRKNDOHB/BSEVDJFCAK-CKVDBNX.SST~MEYOOQPEQK_SEL_BYQGOYI_KZTZ",,,,,5,,.\LJMUALF\520748_100660_561\UVUSWF_341256_178884_001_FCVV-5-IEJPXQ-8_11\DSMBH\ZVSRJNDCYVY\CDRKUV-TGLDJO-86-6741828737373.WVP 0011630368332,8186-27-25 35:44:55.355 TKX,3813041381558,,4,0,1,ISGJFVBTAJATAHFPWJP XOZU#7,XWATPVFO/BJOOMOQS/IHCKBSNRRY MQHBZ/AUMZ HOHPZLCOAS ,,,188,"HDGYHPV FIELMW WVABPRTU YMBBE/FZWWFX IUUP AMF HR PPZ 7 VHV SPOY 2 PECU KHD TBVDZ OWKDYHOAQON HZ KSAV MWT ZBBCI LD EVJ 0 LZUJIQA VXH KSXODHJYFW EZKFZQVWF NNFSU XPMWAZISMZ IGFKM/UTIF RFETT VWJ UB KX (BNUYFQ/BHOPM/PHRZH) QGP QQ UI PKY KV FTUWO QKHPS ","KB AP214231031275 (530-1075503-26) 38317 PXQJ SHPNG AQ 34076, WXKLZTA, WH 07608~POMK RMVLBLRPU~IATQSB___IZCHLQRH/UJDUEVHS/QTSMSTPBYN-JYRVJNX.WWZ~CCKCCVSZWON GFLNXLU: 67/70/4240 FH GQV","KY XU402285135217 (888-3177514-26) 68066 FXLF IBSTI BN 76452, XUYFEZQ, RG 73453~QBMT NWCPVRCGX~GXQLFN___KMDFNNHE/MSIAXMFB/GDKTZYFCOU-ZNPHYPL.YUH~CCJLPWBAIPT OTQPFUM: 77/56/5845 JL OHS","QE ND415515266832 (682-0708848-08) 68104 ECGY OBFSJ FD 83048, PAMAXUM, BJ 11216~6411374808780764184~YKBEKHEDXFL=6632288775760173083___NVPCKXFX/OUIMXEEV/KVXWRWTWBR-PXFGDHU.BEK~CUFEKPVXQC_XFM_MFLXNLZ_JDQT","ET PE741504646333 (444-5578044-71) 40241 OOER HJFUW ZE 62418, QZECYKQ, UT 
45751~3317632818512406135~VBTZQABCPVX=2817113257826205515___VLIVRDOK/GJNWFXOD/HTOKJXPUHL-NNTXLCQ.INH~CCMOQNAOZD_LST_JXNFFCK_METX",,,,,0,,.\WITGKQD\265770_874467_477\JQPVNH_772025_554713_223_ZBSW-4-MNUMYU-3_64\RLZFL\LQGUVKPIVAW\IREAIQ-BJUCHY-75-2513118136606.CMR 7377583384701,3548-88-00 73:61:46.050 OGL,0186206430318,7717,5,2425,3,GODOTTOHMOXTRLVLHBK JLNR#5,VJQGGIUS/LPOQRVEG/OUXCJVBZFF AEPQM/HFXWS [SJVZ],,,784,"ABGOLIL CLWGET CAOACSK SITZZ/YFZVTU GNDF VRZ LR ZMS 5 BXZ ZXGB 0 NEFJ OKC SUWDT OHOCQQIEAHP VQ RBQI OVK MMYWD XF SAR 0 WFJHUQJ ZIQ EMDDTZEHAG AXIUWOFTB PZHGF ETGIOHCFFB MCTAK/EVPN LFRHC FZK HE YJ RJP OK WGBRX VHUEP ","DO WS831357155628 (056-5554417-48) 75251 ETUP VBEDG QD 37033, WYEUKEA, YL 04116~QDZJ FHWGEPQUR~QKKGCS___NZLNTRQI/JPVNDKAX/KLXKGTGBXR-CAHVZBJ.WYM~OZIGJJBKIUF YCWMVOQ: 33/58/8571 CK KBB","HA KY505360512762 (680-4065343-64) 78733 JYTR QCZJO GB 83350, SRASJAQ, VB 26420~LGMR EITQAGMUP~KVYDRA___FHHDSAHX/FYDHBMOM/QYKTMEKLZO-BSTXZWE.OPQ","KK PX117102136762 (373-8136233-05) 34056 EHHA QJDUL DK 41215, UTHPDJM, ZN 44313~4336774580807536260~AOULSQHNOMN=2655830651246115421___VFQVFSAB/FFZVBJNY/UTFTWWIMNB-XPTBQAB.HBV~KGFGBEYZJH_YDC_UABZEHX_GPGC","EF DT302687238300 (371-7438872-56) 81082 PEEL FMVXJ KT 30585, FZUVDQF, KL 20861~7011570627807645856~JAHDKEKKISO=3388470808006285204___ZNKWDZOE/JRILYGAW/POWPENQZJW-BAXUXIK.VFK",,,,,6,,.\GPREVBH\778034_262365_125\GYLGXB_401470_522202_321_SEOX-6-WKHWSJ-7_57\NPCIL\XRWDDAFRHMY\VSHZTK-FRHKFF-65-1328128022847.OSE 3440245203140,1843-88-54 62:88:02.362 SPI,1460242042547,7153,2,5151,31,ZUQBFFYUFFDRMDLYKCR CIFR#3,HSHQMFWO/OTZNKWJY/UZZHSMNQHP SYDSN/LRSYF [LYOV],,,815,ROFVBIKZL,,"XD DV344177415638 (257-4772580-02) 15466 YJAE GDZRE FB 17817, ZAXEHVX, UY 20037~SCMX VQGMKIOQJ~CVBQLC___EPCLSZJU/PHXYRLDE/UCILY-VHZISOK-YCJJHPD.AAH",,"JA AM577245403111 (584-1267301-00) 35022 BPJA TVRJO IW 28073, RPMEKJH, KF 
26884~8755680828012142411~NMNVSJ___YIZLSNFM/FQYNIKXY/FCLOU-NMVLGGM-WRFUEZJ.QSH",,,,,4,,.\YAPCNXJ\004570_850034_757\VWBZSS_848482_600874_487_PEKT-6-KQTVIL-7_30\IRVQT\HUZWLBSJYHZ\XFWPXQ-WSPJHC-00-0770000855383.KKZ 1305220146734,6638-34-75 55:33:34.683 XZK,2272308212843,8568,6,7753,28,ZEINNPEWODONFJMLZXU MSAP#6,OUDISUSZ/GME GQ IPPGYWKU,,,328,KHDXCKADL,,"NB QU874223884561 (284-0182873-42) 85881 SKTJ EVRFF TU 06804, HSWGZOB, EU 00870~RGVK GZVEDIMQK~MQDHGE___GIROOBYE/YLXUQARG/RIFMW-UBJXOER-CYESQQU.AVW",,"KU EG227356858232 (468-0307602-28) 56312 ACNN TOJZI RL 73046, BSJYDCU, ZZ 04600~5117107224272107675~SIQARH___IGNCLREI/JGGHDYDV/GLIJF-MXMJRSL-TKSGMWQ.YPD",,,,,6,, data.table/inst/tests/doublequote_newline.csv0000644000175100001440000000033213172210047021167 0ustar hornikusersA,B 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,"embedded ""field"" with some embedded new lines as well" 2,"not this one" 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a 1,a data.table/inst/tests/536_fread_fill_1.txt0000644000175100001440000000021713172210047020050 0ustar hornikusersa,b,c 1,2,qq 1,2,qq 1,2,qq 1,2,qq 4,5 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1 1 1,2,qq 1,2,qq 1,2,qq 1,2,er data.table/inst/tests/ch11b.dat0000644000175100001440000000406613172210047016001 0ustar hornikusers001 307 0930 36.58 0 002 307 0940 36.73 0 003 307 0950 36.93 0 004 307 1000 37.15 0 005 307 1010 37.23 0 006 307 1020 37.24 0 007 307 1030 37.24 0 008 307 1040 36.90 0 009 307 1050 36.95 0 010 307 1100 36.89 0 011 307 1110 36.95 0 012 307 1120 37.00 0 013 307 1130 36.90 0 014 307 1140 36.99 0 015 307 1150 36.99 0 016 307 1200 37.01 0 017 307 1210 37.04 0 018 307 1220 37.04 0 019 307 1230 37.14 0 020 307 1240 37.07 0 021 307 1250 36.98 0 022 307 1300 37.01 0 023 307 1310 36.97 0 024 307 1320 36.97 0 025 307 1330 37.12 0 026 307 1340 37.13 0 027 307 1350 37.14 0 028 307 1400 37.15 0 029 307 1410 37.17 0 030 307 1420 37.12 0 031 307 1430 37.12 0 032 
307 1440 37.17 0 033 307 1450 37.28 0 034 307 1500 37.28 0 035 307 1510 37.44 0 036 307 1520 37.51 0 037 307 1530 37.64 0 038 307 1540 37.51 0 039 307 1550 37.98 1 040 307 1600 38.02 1 041 307 1610 38.00 1 042 307 1620 38.24 1 043 307 1630 38.10 1 044 307 1640 38.24 1 045 307 1650 38.11 1 046 307 1700 38.02 1 047 307 1710 38.11 1 048 307 1720 38.01 1 049 307 1730 37.91 1 050 307 1740 37.96 1 051 307 1750 38.03 1 052 307 1800 38.17 1 053 307 1810 38.19 1 054 307 1820 38.18 1 055 307 1830 38.15 1 056 307 1840 38.04 1 057 307 1850 37.96 1 058 307 1900 37.84 1 059 307 1910 37.83 1 060 307 1920 37.84 1 061 307 1930 37.74 1 062 307 1940 37.76 1 063 307 1950 37.76 1 064 307 2000 37.64 1 065 307 2010 37.63 1 066 307 2020 38.06 1 067 307 2030 38.19 1 068 307 2040 38.35 1 069 307 2050 38.25 1 070 307 2100 37.86 1 071 307 2110 37.95 1 072 307 2120 37.95 1 073 307 2130 37.76 1 074 307 2140 37.60 1 075 307 2150 37.89 1 076 307 2200 37.86 1 077 307 2210 37.71 1 078 307 2220 37.78 1 079 307 2230 37.82 1 080 307 2240 37.76 1 081 307 2250 37.81 1 082 307 2300 37.84 1 083 307 2310 38.01 1 084 307 2320 38.10 1 085 307 2330 38.15 1 086 307 2340 37.92 1 087 307 2350 37.64 1 088 308 0000 37.70 1 089 308 0010 37.46 1 090 308 0020 37.41 1 091 308 0030 37.46 1 092 308 0040 37.56 1 093 308 0050 37.55 1 094 308 0100 37.75 1 095 308 0110 37.76 1 096 308 0120 37.73 1 097 308 0130 37.77 1 098 308 0140 38.01 1 099 308 0150 38.04 1 100 308 0200 38.07 1 data.table/inst/tests/melt-warning-1752.tsv0000644000175100001440000001324013172210047020141 0ustar hornikusersId Id2 Geography RECORD CODES - File Identification RECORD CODES - State/US-Abbreviation (USPS) RECORD CODES - Summary Level RECORD CODES - Geographic Component RECORD CODES - Characteristic Iteration RECORD CODES - Characteristic Iteration File Sequence Number RECORD CODES - Logical Record Number GEOGRAPHIC AREA CODES - Region GEOGRAPHIC AREA CODES - Division GEOGRAPHIC AREA CODES - State (FIPS) GEOGRAPHIC AREA CODES - County GEOGRAPHIC 
AREA CODES - FIPS County Class Code GEOGRAPHIC AREA CODES - County Size Code GEOGRAPHIC AREA CODES - County Subdivision (FIPS) GEOGRAPHIC AREA CODES - FIPS County Subdivision Class Code GEOGRAPHIC AREA CODES - County Subdivision Size Code GEOGRAPHIC AREA CODES - Place (FIPS) GEOGRAPHIC AREA CODES - FIPS Place Class Code GEOGRAPHIC AREA CODES - Place Size Code GEOGRAPHIC AREA CODES - Census Tract GEOGRAPHIC AREA CODES - Block Group GEOGRAPHIC AREA CODES - Block GEOGRAPHIC AREA CODES - Internal Use Code GEOGRAPHIC AREA CODES - Consolidated City (FIPS) GEOGRAPHIC AREA CODES - FIPS Consolidated City Class Code GEOGRAPHIC AREA CODES - Consolidated City Size Code GEOGRAPHIC AREA CODES - American Indian Area/Alaska Native Area/Hawaiian Home Land (Census) GEOGRAPHIC AREA CODES - American Indian Area/Alaska Native Area/Hawaiian Home Land (FIPS) GEOGRAPHIC AREA CODES - FIPS American Indian Area/Alaska Native Area/Hawaiian Home Land Class Code GEOGRAPHIC AREA CODES - American Indian Trust Land/Hawaiian Home Land Indicator GEOGRAPHIC AREA CODES - American Indian Tribal Subdivision (Census) GEOGRAPHIC AREA CODES - American Indian Tribal Subdivision (FIPS) GEOGRAPHIC AREA CODES - FIPS American Indian Tribal Subdivision Class Code GEOGRAPHIC AREA CODES - Tribal Census Tract GEOGRAPHIC AREA CODES - Tribal Block Group GEOGRAPHIC AREA CODES - Alaska Native Regional Corporation (FIPS) GEOGRAPHIC AREA CODES - FIPS Alaska Native Regional Corporation Class Code GEOGRAPHIC AREA CODES - Metropolitan Statistical Area/Micropolitan Statistical Area GEOGRAPHIC AREA CODES - Metropolitan Statistical Area/Micropolitan Statistical Area Size Code GEOGRAPHIC AREA CODES - Metropolitan Division GEOGRAPHIC AREA CODES - Combined Statistical Area GEOGRAPHIC AREA CODES - New England City and Town Area GEOGRAPHIC AREA CODES - New England City and Town Area Size Code GEOGRAPHIC AREA CODES - New England City and Town Area Division GEOGRAPHIC AREA CODES - Combined New England City and Town Area GEOGRAPHIC 
AREA CODES - Metropolitan Statistical Area/Micropolitan Statistical Area Principal City Indicator GEOGRAPHIC AREA CODES - New England City and Town Area Principal City Indicator GEOGRAPHIC AREA CODES - Urban Area GEOGRAPHIC AREA CODES - Urban Area Size Code GEOGRAPHIC AREA CODES - Urban Area Type GEOGRAPHIC AREA CODES - Urban/Rural GEOGRAPHIC AREA CODES - Congressional District (111th) GEOGRAPHIC AREA CODES - State Legislative District (Upper Chamber) (Year 1) GEOGRAPHIC AREA CODES - State Legislative District (Lower Chamber) (Year 1) GEOGRAPHIC AREA CODES - Voting District GEOGRAPHIC AREA CODES - Voting District Indicator GEOGRAPHIC AREA CODES - Reserved GEOGRAPHIC AREA CODES - ZIP Code Tabulation Area (5 digit) GEOGRAPHIC AREA CODES - Subminor Civil Division (FIPS) GEOGRAPHIC AREA CODES - FIPS Subminor Civil Division Class Code GEOGRAPHIC AREA CODES - School District (Elementary) GEOGRAPHIC AREA CODES - School District (Secondary) GEOGRAPHIC AREA CODES - School District (Unified) AREA CHARACTERISTICS - Area (Land) AREA CHARACTERISTICS - Area (Water) AREA CHARACTERISTICS - Area Name-Legal/Statistical Area Description (LSAD) Term-Part Indicator AREA CHARACTERISTICS - Functional Status Code AREA CHARACTERISTICS - Geographic Change User Note Indicator AREA CHARACTERISTICS - Population Count (100%) AREA CHARACTERISTICS - Housing Unit Count (100%) AREA CHARACTERISTICS - Internal Point (Latitude) AREA CHARACTERISTICS - Internal Point (Longitude) AREA CHARACTERISTICS - Legal/Statistical Area Description Code AREA CHARACTERISTICS - Part Flag SPECIAL AREA CODES - Reserved SPECIAL AREA CODES - Urban Growth Area SPECIAL AREA CODES - State (ANSI) SPECIAL AREA CODES - County (ANSI) SPECIAL AREA CODES - County Subdivision (ANSI) SPECIAL AREA CODES - Place (ANSI) SPECIAL AREA CODES - Consolidated City (ANSI) SPECIAL AREA CODES - American Indian Area/Alaska Native Area/Hawaiian Home Land (ANSI) SPECIAL AREA CODES - American Indian Tribal Subdivision (ANSI) SPECIAL AREA CODES - 
Alaska Native Regional Corporation (ANSI) SPECIAL AREA CODES - Subminor Civil Division (ANSI) SPECIAL AREA CODES - Congressional District (113th) SPECIAL AREA CODES - Congressional District (114th) SPECIAL AREA CODES - Congressional District (115th) SPECIAL AREA CODES - State Legislative District (Upper Chamber) (Year 2) SPECIAL AREA CODES - State Legislative District (Upper Chamber) (Year 3) SPECIAL AREA CODES - State Legislative District (Upper Chamber) (Year 4) SPECIAL AREA CODES - State Legislative District (Lower Chamber) (Year 2) SPECIAL AREA CODES - State Legislative District (Lower Chamber) (Year 3) SPECIAL AREA CODES - State Legislative District (Lower Chamber) (Year 4) SPECIAL AREA CODES - American Indian Area/Alaska Native Area/Hawaiian Home Land Size Code SPECIAL AREA CODES - Combined Statistical Area Size Code SPECIAL AREA CODES - Combined NECTA Size Code SPECIAL AREA CODES - Metropolitan Micropolitan Indicator SPECIAL AREA CODES - NECTA Metropolitan Micropolitan Indicator SPECIAL AREA CODES - Public Use Microdata Area SPECIAL AREA CODES - Reserved 310M100US10180 10180 Abilene, TX Metro Area UR1US US 310 0 0 407913 10180 18 999 ç.).0-*(+,))+(0(E-314 ç.).0-*(+,))+(0(E-316 Abilene, TX Metro Area S 165252 69721 3.24520222E1 -9.97187428E1 M1 0 1 data.table/inst/tests/issue_1330_fread.txt0000644000175100001440000000003213172210047020076 0ustar hornikusersa b 1 1 2 2 3 3 4 4 5 5 data.table/inst/tests/536_fread_fill_2.txt0000644000175100001440000000021613172210047020050 0ustar hornikusersa,b,c 1,2,qq 1,2,qq 1,2,qq 1,2,qq 4,5 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1,2,qq 1,2,qq 1,2,qq 1,2,qq 1 1 1 1,2,qq 1,2,qq 1,2,qq 1,2,er data.table/inst/tests/fread_blank2.txt0000644000175100001440000000007613172210047017461 0ustar hornikusersa,b,c 1,2,3 1,2,3 1,2,3 1,2,3 1,2,3 data.table/inst/tests/issue_1087_utf8_bom.csv0000644000175100001440000000001713172210047020530 0ustar hornikusersa,b,c 1,2,3 
data.table/inst/tests/issue_1095_fread.txt0000644000175100001440000013166013172210047020122 0ustar hornikusers2013130413CN02422 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,03/22/2013,F ,1309,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,06, 2,4 PARTRIDGE WAY (DRIVEWAY) , , , , , ,09,01,02,01,01,01, , , , , , , , , , 0, , , ,N,NONE ,84049 2013130413CN02826 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,04/02/2013,TU,1658,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,11, 1,375 SR 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01, , , , , , , , , , 0, , , ,N,NONE ,84057 2013130413CN03163 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,04/11/2013,TH,0853,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,410 SR 34 (PARKING LOT) , , , , , ,09,01,02,02,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84057 2013130413CN04103 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,05/09/2013,TH,2007,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,Y,N,08, 2,420 SR 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84054 2013130413CN05020 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,06/08/2013,SA,0911,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,410 ST RT 34 (PARKING LOT) , , , , , ,09,01,02,01,01,05, ,01, , , , , , , , 0, , , ,N,NONE ,84055 2013130413CN05207 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,06/15/2013,SA,0148,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,Y,N,06, 2,6 RT 537 WEST (PARKING LOT) , , , , , ,09,01,02,01,06,01, , , , , , , , , , 0, , , ,N,NONE ,84049 2013130413CN05391 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,06/21/2013,F ,1437,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,03, 2,273 RT 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84027 2013130413CN05395 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,06/21/2013,F ,1541,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,06, 2,1 TRUMP NATIONAL BLVD , , , , , ,09,01,05,01,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84046 2013130413CN05534 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,06/26/2013,W ,1727,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,06, 2,9 PROFESSIONAL CIR (PARKING LOT) , , , , , 
,09,01,02,01,01,01, , , , , , , , , ,25, , , ,N,NONE ,84050 2013130413CN05748 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,07/03/2013,W ,2035,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,11, 1,WHITE OAK DRIVE , , , , , ,07,04,02,02,03,02, , , 350,FE,W,ACORN PLACE , , , ,25, , , ,N,200 FEET OF GROSS ON THE PROPERTY OF 7 WHITE DRIVE. DANIEL CLIFFORD HOMEOWNER. ,84046 2013130413CN06672 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,08/03/2013,SA,2330,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,420 SR 34 (PARKING LOT) , , , , , ,09,01,02,01,06,01,05, , , , , , , , , 0, , , ,N,NONE ,84050 2013130413CN07165 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,08/21/2013,W ,0344,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,11, 1,BUCKS MILL RD , , , , , ,07,01,02,01,05,01,05, , 100,FE,S,CR 537 , , , ,25,50, , ,N,JCP&L POLE JC971CN ,84057 2013130413CN07206 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,08/22/2013,TH,1204,01,COLTS NECK PD , , 0, 1, 0, 0,I,B,N,N,01, 2,FIVE POINT RD , , , , , ,07,01,02,01,01,01,05, , 528,FE,S,CR 537 , , , ,35,50, , ,N, ,84050 2013130413CN07311 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,08/25/2013,S ,1737,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,320 SR 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01, , , , , , , , , , 0, , , ,N,NONE ,84036 2013130413CN08261 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,09/25/2013,W ,0954,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,410 SR 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84054 2013130413CN08334 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,09/27/2013,F ,1057,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,Y,N,08, 2,340 RT 34 (PARKING LOT) , , , , , ,09,01,02,01,01,01,05,01, , , , , , , , 0, , , ,N,NONE ,84045 2013130413CN09374 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,11/01/2013,F ,2140,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,59 FIVE POINT RD , , , , , ,09,01,02,02,06,01,05, , , , , , , , , 0, , , ,N,NONE ,84057 2013130413CN09445 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,11/03/2013,S ,1301,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,11, 1,15 BRANDYWINE LN , , , 
, , ,09,01,02,01,01,01, , , , , , , , , , 0, , , ,N,NONE ,84044 2013130413CN09783 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,11/13/2013,W ,1331,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,08, 2,IRETON KEY , , , , , ,09,01,02,01,01,01,05,01, 150,FE,E,COLTS NECK BLVD , , , ,25,25, , ,N,NONE ,84045 2013130413CN10896 ,MONMOUTH ,ATLANTIC HIGHLANDS BORO ,12/23/2013,M ,2140,01,COLTS NECK PD , , 0, 0, 0, 0,P,B,N,N,06, 2,420 SR 34 (PARKING LOT) , , , , , ,09,01,02,02,07,02,05, , , , , , , , , 0, , , ,N,NONE ,84057 2013130513-000795 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,01/26/2013,SA,1502,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,03, 2,BAYSHORE PLAZA PARKING LOT ,E, , , , ,09,01,02,01,01,01,05,01, , , , , , , ,25, , , ,N,? ,137 2013130513-001195 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,02/06/2013,W ,1451,01,ATLANTIC HIGHLANDS PD , , 0, 1, 0, 1,I,B,N,N,08, 2,11 STATE HIGHWAY RT 36 PARKING LOT , , , , , ,09,01,02,01,01,01,05,01, , ,W,FIRST AVENUE , , , ,45,25, , ,N, ,136 2013130513-001416 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,02/13/2013,W ,0944,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,11, 1,9 STATE HIGHWAY 36 PARKING LOT ,W, , , , ,09,01,02,01,01,01,05,01, 500,FE,W,FIRST AVENUE , , , ,10,25, , ,N, ,134 2013130513-003219 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,04/07/2013,S ,1555,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,06, 2,BAY SHORE PLAZA , , , , , ,09,01,02,01,01,01,05,01, , ,S,STATE HIGHWAY 36 , , , ,25,45, , ,N, ,137 2013130513-003432 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,04/13/2013,SA,1236,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,9 STATE HIGHWAY 36 , , , , , ,09,01,02,01,01,01,05,01, , ,W,FIRST AVENUE , , , ,25,25, , ,N, ,#136 2013130513-004366 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,05/10/2013,F ,1727,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,9 STATE HIGHWAY 36 , , , , , ,09,01,02,01,01,01,05,01, , ,W,FIRST AVENUE , , , ,25,25, , ,N, ,#136 2013130513-004558 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,05/16/2013,TH,0757,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,96 EAST 
AVENUE , , , , , ,09,01,02,01,01,01,05,01, , ,E,MANYMIND AVENUE , , , ,25,25, , ,N, ,136 2013130513-005034 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,05/28/2013,TU,1703,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,999 STATE HIGHWAY 36 PARKING LOT ,N, , , , ,09,01,02,02,01,02,05,01, 50,FE,S,WEST GARFIEND AVENUE , , , ,10,25, , ,N, ,134 2013130513-005212 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,06/02/2013,S ,1607,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,BAYSHORE PLAZA 9 STATE HIGHWAY 36 ,N, , , , ,09,01,02,01,01,01,05,01, , ,S,STATE HIGHWAY ROUTE 36 , , , ,25,45, , ,N, ,130 2013130513-005725 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,06/17/2013,M ,0929,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,PARKING LOT OF 153 GRAND AVENUE , , , , , ,09,01,02,01,01,01,05,01, 50,FE,W,GRAND AVENUE , , , ,25,25, , ,N, ,128 2013130513-005873 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,06/21/2013,F ,1205,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,BAYSHORE PLAZA PARKING LOT , , , , , ,09,01,02,01,01,01,05,01, 150,FE,S,ROUTE 36 , , , ,25,45, , ,N, ,128 2013130513-007238 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,07/30/2013,TU,1528,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,08, 2,9 STATE HIGHWAY 36 PARKING LOT , , , , , ,09,01,02,01,01,01,05,01, , ,W,FIRST AVENUE , , , ,15,25, , ,N, ,#136 2013130513-007722 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,08/14/2013,W ,0904,01,ATLANTIC HIGHLANDS PD ,01 , 0, 0, 0, 0,P,B,N,N,08, 3,96 EAST AVENUE ,S, , , , ,09,01,02,01,01,01,05,01, , , , , , , , 0, , , ,N, ,133 2013130513-05217 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,06/02/2013,S ,1817,01,ATLANTIC HIGHLANDS PD , , 0, 0, 0, 0,P,B,N,N,06, 2,9 STATE HIGHWAY 36 PARKING LOT , , , , , ,09,01,02,01,01,01,05,01, , ,W,FIRST AVENUE , , , ,25,25, , ,N, ,137 2013130513002144 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,01/13/2013,S ,1902,01,EDISON TWP PD , , 0, 0, 0, 0,P,B,N,N,06, 2,450 RARTTAN CENTER ,S, , , , ,09,01,02,01,06,01,05,01, , , , , , , ,25, , , ,N, ,397 2013130513AV00256 ,MONMOUTH ,AVON-BY-THE-SEA BORO 
,01/26/2013,SA,1500,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,I,N,N,11, 1,NJ 71 , , 71, ,00000071__ , 6.23,02,01,02,01,01,01,02,01, ,AT, ,LINCOLN AVE , , , ,30, , , ,N,NONE ,86109 2013130513AV00866 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,03/03/2013,S ,1732,01,AVON-BY-THE-SEA PD ,AVON BY THE SE?, 0, 0, 0, 0,P,B,N,N,06, 2,MONMOUTH COUNTY 18 III ,N, 18,3,130000183_ , 3.92,05,01,02,01,03,01,02,01, 15,FE,N,WOODLAND AVE , , , ,25,25, , ,N,NONE ,86110 2013130513AV00888 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,03/22/2013,F ,1441,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,B,N,N,01, 2,NJ 71 ,N, 71, ,00000071__ , 6.14,02,01,02,01,01,01,02,01, 50,FE,S,WASHINGTON AVE , , , ,30,25, , ,N,NONE ,86110 2013130513AV00889 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,03/23/2013,SA,1429,01,AVON-BY-THE-SEA PD , , 0, 1, 0, 0,I,B,N,N,01, 2,NJ 71 ,S, 71, ,00000071__ , ,02,02,02,01,01,01,05,01, ,FE,N,CR 17 / SYLVANIA AVE , , , ,30,25, , ,N, ,86102 2013130513AV01190 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,04/20/2013,SA,1345,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,B,N,N,06, 2,STANTON PL , , , , , ,07,02,02,01,01,01,05,01, 2,FE,W,BRIDGE AVE , , , ,25,25, , ,N,NONE ,86107 2013130513AV01578 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,05/23/2013,TH,1855,01,AVON-BY-THE-SEA PD , , 0, 1, 0, 0,I,B,N,N,03, 2,SECOND LINCOLN , , , , , ,07,01,02,01,01,01,05,01, , , , , , , ,25, , , ,N,NONE ,88109 2013130513AV01760 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,06/05/2013,W ,1708,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,B,N,N,08, 2,MARINE PLACE , , , , , ,07,01,02,01,01,01,05,01, , ,S,MONMOUTH COUNTY 17 , , , ,10,25, , ,N,NONE ,86122 2013130513AV03315 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,09/06/2013,F ,1536,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,I,N,N,11, 1,NJ 71 , , 71, ,00000071__ , 6.23,02,01,02,01,01,01,01,01, ,AT, ,LINCOLN AVE , , , ,30,25, , ,N,NONE ,86118 2013130513AV03328 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,09/07/2013,SA,1427,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,I,N,N,11, 1,NJ 71 , , 71, ,00000071__ , 6.23,02,01,02,01,01,01,02,01, ,AT, ,LINCOLN AVE , , , ,30, , , ,N,NONE 
,86109 2013130513AV03500 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,09/22/2013,S ,1244,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,B,N,N,06, 2,SECOND AVE , , , , , ,07,01,02,01,01,01,05,01, , , , , , , ,25, , , ,N,NONE ,86109 2013130513AV03501 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,09/22/2013,S ,1821,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,B,N,N,08, 2,NJ 71 , , 71, ,00000071__ , ,02,01,02,01,02,01,02,01, , , , , , , ,25, , , ,N,NONE ,86109 2013130513AV03893 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,11/01/2013,F ,1518,01,AVON-BY-THE-SEA PD , , 0, 0, 0, 0,P,I,N,N,06, 2,THIRD AVE ,S, , , , ,07,01,02,00,00,00,04,01, ,AT, ,WOODLAND AVENUE , , , ,25,25, , ,N,NONE ,86110 2013130513MU01012 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,02/06/2013,W ,1122,99,MONMOUTH UNIVERSITY PD , , 0, 0, 0, 0,P,B,N,N,01, 2,SCHOLALS WAY , , , , , ,09,04,02,01,01,01,05,01, , , , , , , ,15, , , ,N, ,2458 2013130513MU01525 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,02/21/2013,TH,2150,99,MONMOUTH UNIVERSITY PD , , 0, 0, 0, 0,P,B,N,N,06, 2,LOT 20 , , , , , ,09,01,02,01,06,01,05,01, , , , , , , , 0, , , ,Y,NONE ,5778 2013130513MV00844 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,01/31/2013,TH,1825,99,MONMOUTH UNIVERSITY PD , , 0, 0, 0, 0,P,B,N,N,08, 2,MV LOT 13 ,N, , , , ,09,01,02,01,06,01,05,01, 200,FE,S,HAWK ROAD , , , ,15, , , ,N,NONE ,6933 20131305MU1301104 ,MONMOUTH ,AVON-BY-THE-SEA BORO ,02/09/2013,SA,0212,99,MONMOUTH UNIVERSITY PD , , 0, 0, 0, 0,P,B,N,N,08, 2,PARKING LOT #3 , , , , , ,09,02,02,03,06,07,05,01, , , , , , , ,15, , , ,N,NONE ,4803 2013130613-10004 ,MONMOUTH ,BELMAR BORO ,08/31/2013,SA,1159,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,08, 2,NORTH BLVD , , , , , ,07,05,02,01,01,01,05,01, 300,FE,W,MONMOUTH COUNTY 18 , , , ,25,25, , ,N, ,614 2013130613-10013 ,MONMOUTH ,BELMAR BORO ,08/31/2013,SA,1347,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,01, 2,MONMOUTH COUNTY 18 III ,N, 18,3,130000183_ , 3.36,05,01,02,01,01,01,04,01, ,AT, ,2ND AVE / RIVER AVE , , , ,35,25, , ,N, ,152 2013130613-10257 ,MONMOUTH ,BELMAR BORO ,09/04/2013,W ,1735,01,BELMAR PD , , 0, 1, 0, 
0,I,B,N,N,14, 1,MONMOUTH COUNTY 18 III , , 18,3,130000183_ , 2.51,05,01,02,01,01,01,04,01, 50,FE,N,15TH AVE , , , ,25,25, , ,N, ,152 2013130613-10259 ,MONMOUTH ,BELMAR BORO ,09/04/2013,W ,2117,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,01, 2,NJ 71 , , 71, ,00000071__ , 5.54,02,01,02,01,07,01,05,01, 150,FE,W,CR 30 / 8TH AVE / MAIN ST , , , ,25, , , ,N,NONE ,920 2013130613-1026 ,MONMOUTH ,BELMAR BORO ,02/15/2013,F ,2233,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,18TH AVE , , , ,13471063__ , .50,07,01,02,01,06,01,04, , 10,FE,E,A STREET , , , ,25,25, , ,N, ,908 2013130613-10316 ,MONMOUTH ,BELMAR BORO ,09/05/2013,TH,1314,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,11, 1,NJ 35 , , 35, ,00000035__ , ,02,01,02,01,01,01,04,01, 75,FE,S,MACLEARIE PARK EXIT DRIVEWAY , , , ,35, , , ,N, ,137 2013130613-10350 ,MONMOUTH ,BELMAR BORO ,09/07/2013,SA,1035,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,06, 2,10TH AVE , , , ,13061064__ , .11,07,01,02,01,01,01,02,05, ,AT, ,BELMAR PLAZA , , , ,25, , , ,N, ,129 2013130613-10353 ,MONMOUTH ,BELMAR BORO ,09/07/2013,SA,1214,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,01, 2,NJ 35 ,N, 35, ,00000035__ , 21.24,02,01,02,01,01,01,04,01, 50,FE,S,10TH AVE , , , ,35,25, , ,N, ,152 2013130613-10358 ,MONMOUTH ,BELMAR BORO ,09/07/2013,SA,1445,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 2,NJ 35 ,N, 35, ,00000035__ , 21.25,02,01,02,01,01,01,02,01, 15,FE,S,10TH AVE , , , ,35,25, , ,N,NONE ,904 2013130613-10522 ,MONMOUTH ,BELMAR BORO ,09/11/2013,W ,1623,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,07, 2,NJ 35 , , 35, ,00000035__ , 21.18,02,01,02,01,01,01,03,01, ,AT, ,11TH AVE , , , ,35,25, , ,N,NONE ,128 2013130613-10528 ,MONMOUTH ,BELMAR BORO ,09/11/2013,W ,1823,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 2,MONMOUTH COUNTY 18 III , , 18,3,130000183_ , 2.84,05,01,02,01,01,01,05,01, 100,FE,S,9TH AVE , , , ,25,25, , ,N,NONE ,128 2013130613-10617 ,MONMOUTH ,BELMAR BORO ,09/14/2013,SA, ,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,08, 2,BRIARWOOD RD , , , , , ,07,01,02,01,01,01,05,01, 300,FE,S,MONMOUTH COUNTY 16 , , , 
,25,25, , ,N, ,614 2013130613-1066 ,MONMOUTH ,BELMAR BORO ,02/17/2013,S ,1540,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,08, 2,FIFTH AVE , , , , , ,07,01,02,01,01,01,05,01, 50,FE,W,MONMOUTH COUNTY 18 , , , ,25,25, , ,N, ,904 2013130613-10835 ,MONMOUTH ,BELMAR BORO ,09/20/2013,F ,1618,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,NJ 35 , , 35, ,00000035__ , 20.48,02,01,02,01,03,01,05,01, ,AT, ,CR 18 / 16TH AVE / BELMAR AVE , , , ,35,25, , ,N, ,153 2013130613-1086 ,MONMOUTH ,BELMAR BORO ,02/18/2013,M ,1116,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,02, 2,10TH AVE , , , ,13061064__ , .15,07,01,02,01,01,01,02,01, 170,FE,W,MONMOUTH COUNTY 30 , , , ,25,30, , ,N, ,136 2013130613-10886 ,MONMOUTH ,BELMAR BORO ,09/22/2013,S ,0943,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,04, 2,FIFTEENTH AVE , , , , , ,07,01,02,01,01,01,05,01, 400,FE,W,""A"" ST , , , ,25,25, , ,N, ,614 2013130613-10906 ,MONMOUTH ,BELMAR BORO ,09/22/2013,S ,1636,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 2,NJ 71 ,S, 71, ,00000071__ , 4.91,02,01,02,01,03,01,05,01, 100,FE,N,NJ 18 / 16TH AVE , , , ,30,25, , ,N,NONE ,904 2013130613-11065 ,MONMOUTH ,BELMAR BORO ,09/28/2013,SA,1613,01,BELMAR PD , , 0, 2, 0, 0,I,I,N,N,02, 2,NJ 35 ,N, 35, ,00000035__ , 21.18,02,01,02,01,01,01,04,01, ,AT,W,11TH AVE , , , ,35,25, , ,N, ,152 2013130613-11072 ,MONMOUTH ,BELMAR BORO ,09/28/2013,SA,2217,01,BELMAR PD , , 0, 1, 0, 0,I,I,N,N,01, 2,NJ 71 ,S, 71, ,00000071__ , 5.57,02,01,02,01,07,01,05,01, ,AT, ,CR 30 / 8TH AVE / MAIN ST , , , ,30,25, , ,Y,NONE ,939 2013130613-11167 ,MONMOUTH ,BELMAR BORO ,10/02/2013,W ,1218,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,08, 2,9TH AVE , , , ,13061065__ , .01,07,01,02,01,01,01,05,01, 50,FE,E,MONMOUTH COUNTY 30 , , , ,25,35, , ,N, ,152 2013130613-11169 ,MONMOUTH ,BELMAR BORO ,10/02/2013,W ,1315,01,BELMAR PD , , 0, 1, 0, 1,I,B,N,N,13, 1,EIGHTH AVE , , , , , ,07,01,02,01,01,01,05,01, 25,FE,E,MONMOUTH COUNTY 30 , , , ,25,35, , ,N, ,152 2013130613-11170 ,MONMOUTH ,BELMAR BORO ,10/02/2013,W ,1317,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 
2,NJ 35 , , 35, ,00000035__ , 21.25,02,01,02,01,01,01,02,01, 20,FE,S,10TH AVE , , , ,35, , , ,N, ,134 2013130613-1135 ,MONMOUTH ,BELMAR BORO ,02/20/2013,W ,1213,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,02, 2,10TH AVE , , , ,13061064__ , .13,07,01,02,01,01,01,02,01, 246,FE,W,MONMOUTH COUNTY 30 , , , ,25,30, , ,N, ,1?6 2013130613-11352 ,MONMOUTH ,BELMAR BORO ,10/10/2013,TH,1950,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,FIFTEENTH AVE , , , , , ,07,01,02,02,05,01,05,01, 150,FE,E,13TH ST , , , ,25, , , ,N,NONE ,128 2013130613-11450 ,MONMOUTH ,BELMAR BORO ,10/15/2013,TU,1741,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,NJ 71 , , 71, ,00000071__ , 4.89,02,01,02,01,01,01,04,01, ,AT, ,NJ 18 / 16TH AVE , , , ,30,25, , ,N,FIRE HYDRANT ,152 2013130613-11474 ,MONMOUTH ,BELMAR BORO ,10/16/2013,W ,1751,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,02, 2,10TH AVE , , , ,13061064__ , .11,07,01,02,01,01,05,05,01, ,AT, ,BELMAR PLAZA , , , ,25,25, , ,N, ,152 2013130613-11528 ,MONMOUTH ,BELMAR BORO ,10/18/2013,F ,1848,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,NJ 35 , , 35, ,00000035__ , 21.25,02,01,02,01,06,01,05,01, ,AT, ,10TH AVE , , , ,35,25, , ,N, ,153 2013130613-11671 ,MONMOUTH ,BELMAR BORO ,10/25/2013,F ,1809,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,NJ 35 , , 35, ,00000035__ , 21.18,02,01,02,01,01,01,03,01, ,AT, ,11TH AVE , , , ,35,25, , ,N, ,153 2013130613-11770 ,MONMOUTH ,BELMAR BORO ,10/29/2013,TU,1802,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 2,NJ 35 ,N, 35, ,00000035__ , 21.27,02,01,02,01,06,01,02,01, 100,FE,N,10TH AVE , , , ,35,25, , ,N,NONE ,128 2013130613-11802 ,MONMOUTH ,BELMAR BORO ,10/31/2013,TH,0930,01,BELMAR PD , , 0, 1, 0, 0,I,B,Y,N,01, 3,7TH AVE , , , , , ,07,01,02,01,01,01,05,01, 30,FE,E,A STREET , , , ,25,25, , ,N,NONE ,144 2013130613-11983 ,MONMOUTH ,BELMAR BORO ,11/08/2013,F ,1114,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,02, 2,10TH AVE , , , ,13061064__ , .18,07,01,02,01,01,01,02,01, 10,FE,W,CR 30 , , , ,25,30, , ,N, ,135 2013130613-12022 ,MONMOUTH ,BELMAR BORO ,11/10/2013,S 
,1045,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,08, 2,10TH AVE , , , ,13061064__ , .18,07,01,02,01,01,01,04,01, 25,FE,E,CR 30 , , , ,25,35, , ,N, ,152 2013130613-12086 ,MONMOUTH ,BELMAR BORO ,11/14/2013,TH,1432,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,01, 2,NJ 35 ,N, 35, ,00000035__ , ,02,01,02,01,01,01,04,01, 250,FE,S,NJ 71 , , , ,35,25, , ,N, ,152 2013130613-1227 ,MONMOUTH ,BELMAR BORO ,02/24/2013,S ,1838,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,01, 2,NJ 35 ,N, 35, ,00000035__ , 21.41,02,01,02,01,06,01,04,01, ,AT, ,NJ 71 / 8TH AVE , , , ,35,25, , ,N, ,151 2013130613-12316 ,MONMOUTH ,BELMAR BORO ,11/24/2013,S ,1605,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,MONMOUTH COUNTY 30 II , , 30,2,130000302_ , 4.53,05,01,02,01,01,01,05,01, 50,FE,N,9TH AVE , , , ,30,25, , ,N, ,153 2013130613-12506 ,MONMOUTH ,BELMAR BORO ,12/03/2013,TU,0803,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,02, 2,11 TH AVE , , , , , ,07,01,02,01,01,01,05,01, 30,FE, ,CR 30 , , , ,25,30, , ,N,NONE ,135 2013130613-12568 ,MONMOUTH ,BELMAR BORO ,12/06/2013,F ,1509,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,NJ 71 , , 71, ,00000071__ , 5.04,02,01,02,02,01,02,05,01, ,AT, ,13TH AVE , , , ,30,25, , ,N, ,153 2013130613-12659 ,MONMOUTH ,BELMAR BORO ,12/10/2013,TU,1840,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,01, 2,MONMOUTH COUNTY 30 II , , 30,2,130000302_ , 4.40,05,01,02,02,06,01,05,01, 100,FE,N,11TH AVE , , , , 0, , , ,N, ,129 2013130613-12680 ,MONMOUTH ,BELMAR BORO ,12/11/2013,W ,2256,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,MONMOUTH COUNTY 18 I , , 18,1,130000181_ , 8.18,05,01,02,01,06,01,05,01, 200,FE,W,D ST / NEWMAN ST , , , ,25,25, , ,N, ,153 2013130613-12889 ,MONMOUTH ,BELMAR BORO ,12/22/2013,S ,0827,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,THIRTEENTH AVE , , , , , ,07,01,02,01,06,01,05,01, 50,FE,W,BAYVIEW , , , ,25,25, , ,N, ,153 2013130613-12893 ,MONMOUTH ,BELMAR BORO ,12/22/2013,S ,1147,01,BELMAR PD , , 0, 1, 0, 0,I,B,N,N,14, 1,MONMOUTH COUNTY 30 II , , 30,2,130000302_ , 4.35,05,01,02,01,01,01,05,01, 200,FE,N,12TH AVE , , , 
,25,25, , ,N,(NONE) ,128 2013130613-12983 ,MONMOUTH ,BELMAR BORO ,12/26/2013,TH,1518,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,01, 2,NJ 35 ,S, 35, ,00000035__ , 20.98,02,04,02,01,01,01,04,01, ,AT, ,K ST , , , ,35,35, , ,N, ,955 2013130613-1299 ,MONMOUTH ,BELMAR BORO ,02/27/2013,W ,1905,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,MONMOUTH COUNTY 18 I , , 18,1,130000181_ , 7.90,05,01,02,01,07,01,05,01, 50,FE,E,RAILROAD AVE , , , ,25,25, , ,N,NONE ,147 2013130613-13066 ,MONMOUTH ,BELMAR BORO ,12/29/2013,S ,1359,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,03, 2,TWELFTH AVE , , , , , ,07,01,02,02,01,02,05,01, ,AT, ,RAILROAD AVENUE , , , ,25,25, , ,N, ,128 2013130613-1432 ,MONMOUTH ,BELMAR BORO ,03/03/2013,S ,1607,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,06, 2,D ST , , , , , ,07,01,02,01,01,01,05,01, 30,FE,N,11TH AVE , , , ,25,25, , ,N,NONE ,128 2013130613-1450 ,MONMOUTH ,BELMAR BORO ,03/03/2013,S ,1946,01,BELMAR PD , , 0, 0, 0, 0,P,B,Y,N,06, 2,FIFTEENTH AVE ,W, , , , ,07,02,02,01,06,01,04,01, 150,FE,E,E STREET , , , ,25,25, , ,N, ,152 2013130613-1792 ,MONMOUTH ,BELMAR BORO ,03/16/2013,SA,0856,01,BELMAR PD , , 0, 0, 0, 0,P,B,N,N,11, 2,NJ 35 ,N, 35, ,00000035__ , 20.70,02,04,02,01,01,01,04,01, 100,FE,S,MAPLEWOOD RD , , , ,35,25, , ,N, ,151 2013130613-200 ,MONMOUTH ,BELMAR BORO ,01/09/2013,W ,1551,01,BELMAR PD , , 0, 0, 0, 0,P,I,N,N,09, 2,NJ 71 , , 71, ,00000071__ , 5.46,02,01,02,01,01,01,02,01, ,AT, ,W RAILROAD AVE , , , ,25,25, , ,N,NONE ,128 2013130613-2046 ,MONMOUTH ,BELMAR BORO ,03/26/2013,TU,1515,01,BELMAR PD , , 0, 1, 0, 0,I,I,N,N,01, 2,MONMOUTH COUNTY 30 II , , 30,2,130000302_ , 4.45,05,01,02,01,01,05,05,01, ,AT, ,10TH AVE , , , ,30, , , ,N, ,135 data.table/inst/tests/536_fread_fill_3_extreme.txt0000644000175100001440000000012013172210047021574 0ustar hornikusersa,b,c 1,"first,,,,,,,,,,,,,,,, second,,,,,,, third",2 2,"foo" 3 data.table/inst/tests/issue_1164_json.txt0000644000175100001440000000015613172210047020002 0ustar hornikusersjson1, string1 "{""f1"":""value1"",""f2"":""double 
quote escaped with a backslash [ \"" ]""}", "string field" data.table/inst/doc/0000755000175100001440000000000013172212367014015 5ustar hornikusersdata.table/inst/doc/datatable-secondary-indices-and-auto-indexing.R0000644000175100001440000001031613172212367024774 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ----echo = FALSE--------------------------------------------------------------------------------- options(width = 100L) ## ------------------------------------------------------------------------------------------------- flights <- fread("flights14.csv") head(flights) dim(flights) ## ------------------------------------------------------------------------------------------------- setindex(flights, origin) head(flights) ## alternatively we can provide character vectors to the function 'setindexv()' # setindexv(flights, "origin") # useful to program with # 'index' attribute added names(attributes(flights)) ## ------------------------------------------------------------------------------------------------- indices(flights) setindex(flights, origin, dest) indices(flights) ## ---- eval = FALSE-------------------------------------------------------------------------------- # ## not run # setkey(flights, origin) # flights["JFK"] # or flights[.("JFK")] ## ---- eval = FALSE-------------------------------------------------------------------------------- # ## not run # setkey(flights, dest) # flights["LAX"] ## ------------------------------------------------------------------------------------------------- flights["JFK", on = "origin"] ## alternatively # flights[.("JFK"), on = "origin"] (or) # flights[list("JFK"), on = "origin"] ## ------------------------------------------------------------------------------------------------- setindex(flights, origin) flights["JFK", on = "origin", verbose = TRUE][1:5] 
## ------------------------------------------------------------------------------------------------- flights[.("JFK", "LAX"), on = c("origin", "dest")][1:5] ## ------------------------------------------------------------------------------------------------- flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")] ## ------------------------------------------------------------------------------------------------- flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")][order(-arr_delay)] ## ------------------------------------------------------------------------------------------------- flights[.("LGA", "TPA"), max(arr_delay), on = c("origin", "dest")] ## ------------------------------------------------------------------------------------------------- # get all 'hours' in flights flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- flights[.(24L), hour := 0L, on = "hour"] ## ------------------------------------------------------------------------------------------------- flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- ans <- flights["JFK", max(dep_delay), keyby = month, on = "origin"] head(ans) ## ------------------------------------------------------------------------------------------------- flights[c("BOS", "DAY"), on = "dest", mult = "first"] ## ------------------------------------------------------------------------------------------------- flights[.(c("LGA", "JFK", "EWR"), "XNA"), on = c("origin", "dest"), mult = "last"] ## ------------------------------------------------------------------------------------------------- flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", on = c("origin", "dest"), nomatch = 0L] ## ------------------------------------------------------------------------------------------------- set.seed(1L) dt = data.table(x = sample(1e5L, 1e7L, TRUE), y = 
runif(100L)) print(object.size(dt), units = "Mb") ## ------------------------------------------------------------------------------------------------- ## have a look at all the attribute names names(attributes(dt)) ## run thefirst time (t1 <- system.time(ans <- dt[x == 989L])) head(ans) ## secondary index is created names(attributes(dt)) indices(dt) ## ------------------------------------------------------------------------------------------------- ## successive subsets (t2 <- system.time(dt[x == 989L])) system.time(dt[x %in% 1989:2012]) data.table/inst/doc/datatable-reshape.Rmd0000644000175100001440000002361313172210047020025 0ustar hornikusers--- title: "Efficient reshaping using data.tables" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Efficient reshaping using data.tables} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` This vignette discusses the default usage of reshaping functions `melt` (wide to long) and `dcast` (long to wide) for *data.tables* as well as the **new extended functionalities** of melting and casting on *multiple columns* available from `v1.9.6`. *** ```{r echo = FALSE} options(width = 100L) ``` ## Data We will load the data sets directly within sections. ## Introduction The `melt` and `dcast` functions for *data.tables* are extensions of the corresponding functions from the [reshape2](https://cran.r-project.org/package=reshape2) package. In this vignette, we will 1. first briefly look at the default *melting* and *casting* of *data.tables* to convert them from *wide* to *long* format and vice versa, 2. then look at scenarios where the current functionalities becomes cumbersome and inefficient, 3. 
and finally look at the new improvements to both `melt` and `dcast` methods for *data.tables* to handle multiple columns simultaneously. The extended functionalities are in line with *data.table's* philosophy of performing operations efficiently and in a straightforward manner. #### Note: {.bs-callout .bs-callout-info} From `v1.9.6` on, you don't have to load `reshape2` package to use these functions for *data.tables*. You just need to load `data.table`. If you've to load `reshape2` for melting or casting matrices and/or data.frames, then make sure to load it *before* loading `data.table`. ## 1. Default functionality ### a) `melt`ing *data.tables* (wide to long) Suppose we have a `data.table` (artificial data) as shown below: ```{r} DT = fread("melt_default.csv") DT ## dob stands for date of birth. str(DT) ``` # #### - Convert `DT` to *long* form where each `dob` is a separate observation. We could accomplish this using `melt()` by specifying `id.vars` and `measure.vars` arguments as follows: ```{r} DT.m1 = melt(DT, id.vars = c("family_id", "age_mother"), measure.vars = c("dob_child1", "dob_child2", "dob_child3")) DT.m1 str(DT.m1) ``` #### {.bs-callout .bs-callout-info} * `measure.vars` specify the set of columns we would like to collapse (or combine) together. * We can also specify column *indices* instead of *names*. * By default, `variable` column is of type `factor`. Set `variable.factor` argument to `FALSE` if you'd like to return a *character* vector instead. `variable.factor` argument is only available in `melt` from `data.table` and not in the [`reshape2` package](https://github.com/hadley/reshape). * By default, the molten columns are automatically named `variable` and `value`. * `melt` preserves column attributes in result. 
# #### - Name the `variable` and `value` columns to `child` and `dob` respectively ```{r} DT.m1 = melt(DT, measure.vars = c("dob_child1", "dob_child2", "dob_child3"), variable.name = "child", value.name = "dob") DT.m1 ``` #### {.bs-callout .bs-callout-info} * By default, when one of `id.vars` or `measure.vars` is missing, the rest of the columns are *automatically assigned* to the missing argument. * When neither `id.vars` nor `measure.vars` are specified, as mentioned under `?melt`, all *non*-`numeric`, `integer`, `logical` columns will be assigned to `id.vars`. In addition, a warning message is issued highlighting the columns that are automatically considered to be `id.vars`. ### b) `Cast`ing *data.tables* (long to wide) In the previous section, we saw how to get from wide form to long form. Let's see the reverse operation in this section. #### - How can we get back to the original data table `DT` from `DT.m`? That is, we'd like to collect all *child* observations corresponding to each `family_id, age_mother` together under the same row. We can accomplish it using `dcast` as follows: ```{r} dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") ``` #### {.bs-callout .bs-callout-info} * `dcast` uses *formula* interface. The variables on the *LHS* of formula represents the *id* vars and *RHS* the *measure* vars. * `value.var` denotes the column to be filled in with while casting to wide format. * `dcast` also tries to preserve attributes in result wherever possible. # #### - Starting from `DT.m`, how can we get the number of children in each family? You can also pass a function to aggregate by in `dcast` with the argument `fun.aggregate`. This is particularly essential when the formula provided does not identify single observation for each cell. ```{r} dcast(DT.m1, family_id ~ ., fun.agg = function(x) sum(!is.na(x)), value.var = "dob") ``` Check `?dcast` for other useful arguments and additional examples. ## 2. 
Limitations in current `melt/dcast` approaches So far we've seen features of `melt` and `dcast` that are based on `reshape2` package, but implemented efficiently for *data.table*s, using internal `data.table` machinery (*fast radix ordering*, *binary search* etc..). However, there are situations we might run into where the desired operation is not expressed in a straightforward manner. For example, consider the *data.table* shown below: ```{r} DT = fread("melt_enhanced.csv") DT ## 1 = female, 2 = male ``` And you'd like to combine (melt) all the `dob` columns together, and `gender` columns together. Using the current functionality, we can do something like this: ```{r} DT.m1 = melt(DT, id = c("family_id", "age_mother")) DT.m1[, c("variable", "child") := tstrsplit(variable, "_", fixed = TRUE)] DT.c1 = dcast(DT.m1, family_id + age_mother + child ~ variable, value.var = "value") DT.c1 str(DT.c1) ## gender column is character type now! ``` #### Issues {.bs-callout .bs-callout-info} 1. What we wanted to do was to combine all the `dob` and `gender` type columns together respectively. Instead we are combining *everything* together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient). As an analogy, imagine you've a closet with four shelves of clothes and you'd like to put together the clothes from shelves 1 and 2 together (in 1), and 3 and 4 together (in 3). What we are doing is more or less to combine all the clothes together, and then split them back on to shelves 1 and 3! 2. The columns to *melt* may be of different types, as in this case (character and integer types). By *melting* them all together, the columns will be coerced in result, as explained by the warning message above and shown from output of `str(DT.c1)`, where `gender` has been converted to *character* type. 3. We are generating an additional column by splitting the `variable` column into two columns, whose purpose is quite cryptic. 
We do it because we need it for *casting* in the next step. 4. Finally, we cast the data set. But the issue is it's a much more computationally involved operation than *melt*. Specifically, it requires computing the order of the variables in formula, and that's costly. # In fact, `base::reshape` is capable of performing this operation in a very straightforward manner. It is an extremely useful and often underrated function. You should definitely give it a try! ## 3. Enhanced (new) functionality ### a) Enhanced `melt` Since we'd like for *data.tables* to perform this operation straightforward and efficient using the same interface, we went ahead and implemented an *additional functionality*, where we can `melt` to multiple columns *simultaneously*. #### - `melt` multiple columns simultaneously The idea is quite simple. We pass a list of columns to `measure.vars`, where each element of the list contains the columns that should be combined together. ```{r} colA = paste("dob_child", 1:3, sep = "") colB = paste("gender_child", 1:3, sep = "") DT.m2 = melt(DT, measure = list(colA, colB), value.name = c("dob", "gender")) DT.m2 str(DT.m2) ## col type is preserved ``` #### - Using `patterns()` Usually in these problems, the columns we'd like to melt can be distinguished by a common pattern. We can use the function `patterns()`, implemented for convenience, to provide regular expressions for the columns to be combined together. The above operation can be rewritten as: ```{r} DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender")) DT.m2 ``` That's it! #### {.bs-callout .bs-callout-info} * We can remove the `variable` column if necessary. * The functionality is implemented entirely in C, and is therefore both *fast* and *memory efficient* in addition to being *straightforward*. ### b) Enhanced `dcast` Okay great! We can now melt into multiple columns simultaneously. 
Now given the data set `DT.m2` as shown above, how can we get back to the same format as the original data we started with? If we use the current functionality of `dcast`, then we'd have to cast twice and bind the results together. But that's once again verbose, not straightforward and is also inefficient. #### - Casting multiple `value.var`s simultaneously We can now provide **multiple `value.var` columns** to `dcast` for *data.tables* directly so that the operations are taken care of internally and efficiently. ```{r} ## new 'cast' functionality - multiple value.vars DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "gender")) DT.c2 ``` #### {.bs-callout .bs-callout-info} * Attributes are preserved in result wherever possible. * Everything is taken care of internally, and efficiently. In addition to being fast, it is also very memory efficient. # #### Multiple functions to `fun.aggregate`: {.bs-callout .bs-callout-info} You can also provide *multiple functions* to `fun.aggregate` to `dcast` for *data.tables*. Check the examples in `?dcast` which illustrates this functionality. # *** data.table/inst/doc/datatable-faq.html0000644000175100001440000023301613172212361017370 0ustar hornikusers Beginner FAQs

The first section, Beginner FAQs, is intended to be read in order, from start to finish. It's just written in a FAQ style to be digested more easily. It isn't really the most frequently asked questions. A better measure for that is looking on Stack Overflow.

This FAQ is required reading and considered core documentation. Please do not ask questions on Stack Overflow or raise issues on GitHub until you have read it. We can all tell when you ask that you haven't read it. So if you do ask and haven't read it, don't use your real name.

This document has been quickly revised given the changes in v1.9.8 released Nov 2016. Please do submit pull requests to fix mistakes or make improvements. If anyone knows why the table of contents comes out so narrow and squashed when displayed by CRAN, please let us know. This document used to be a PDF and we changed it recently to HTML.

Beginner FAQs

Why do DT[ , 5] and DT[2, 5] return a 1-column data.table rather than vectors like data.frame? {#j-num}

For consistency so that when you use data.table in functions that accept varying inputs, you can rely on DT[...] returning a data.table. You don't have to remember to include drop=FALSE like you do in data.frame. data.table was first released in 2006 and this difference to data.frame has been a feature since the very beginning.

You may have heard that it is generally bad practice to refer to columns by number rather than name, though. If your colleague comes along and reads your code later they may have to hunt around to find out which column is number 5. If you or they change the column ordering higher up in your R program, you may produce wrong results with no warning or error if you forget to change all the places in your code which refer to column number 5. That is your fault not R's or data.table's. It's really really bad. Please don't do it. It's the same mantra as professional SQL developers have: never use select *, always explicitly select by column name to at least try to be robust to future changes.

Say column 5 is named "region" and you really must extract that column as a vector not a data.table. It is more robust to use the column name and write DT$region or DT[["region"]]; i.e., the same as base R. Using base R's $ and [[ on data.table is encouraged for selecting a single column by name. They are not encouraged when combined with <- to assign; use := instead for that.

There are some circumstances where referring to a column by number seems like the only way, such as a sequence of columns. In these situations just like data.frame, you can write DT[, 5:10] and DT[,c(1,4,10)]. However, again, it is more robust (to future changes in your data's number of and ordering of columns) to use a named range such as DT[,columnRed:columnViolet] or name each one DT[,c("columnRed","columnOrange","columnYellow")]. It is harder work up front, but you will probably thank yourself and your colleagues might thank you in the future. At least you can say you tried your best to write robust code if something does go wrong.

However, what we really want you to do is DT[,.(columnRed,columnOrange,columnYellow)]; i.e., use column names as if they are variables directly inside DT[...]. You don't have to prefix each column with DT$ like you do in data.frame. The .() part is just an alias for list() and you can use list() instead if you prefer. You can place any R expression of column names, using any R package, returning different types of different lengths, right there. We wanted to encourage you to do that so strongly in the past that we deliberately didn't make DT[,5] work at all. Before v1.9.8 released Nov 2016, DT[,5] used to just return 5. The thinking was that we could more simply teach one fact that the parts inside DT[...] get evaluated within the frame of DT always (they see column names as if they are variables). And 5 evaluates to 5 so that behaviour was consistent with the single rule. We asked you to go through an extra deliberate hurdle DT[,5,with=FALSE] if you really wanted to select a column by name or number. Going forward from Nov 2016, you don't need to use with=FALSE and we'll see how greater consistency with data.frame in this regard will help or hinder both new and long-time users. The new users who don't read this FAQ, not even this very first entry, will hopefully not stumble as soon with data.table as they did before if they had expected it to work like data.frame. Hopefully they will not miss out on understanding our intent and recommendation to place expressions of columns inside DT[i, j, by]. If they use data.table like data.frame they won't gain any benefits. If you know anyone like that, please give them a friendly nudge to read this document like you are.

Reminder: you can place any R expression inside DT[...] using column names as if they are variables; e.g., try DT[, colA*colB/2]. That does return a vector because you used column names as if they are variables. Wrap with .() to return a data.table; i.e. DT[,.(colA*colB/2)]. Name it: DT[,.(myResult = colA*colB/2)]. And we'll leave it to you to guess how to return two things from this query. It's also quite common to do a bunch of things inside an anonymous body: DT[, { x<-colA+10; x*x/2 }] or call another package's function: DT[ , fitdistr(columnA, "normal")].

Why does DT[,"region"] return a 1-column data.table rather than a vector?

See the answer above. Try DT$region instead. Or DT[["region"]].

Why does DT[, region] return a vector for the “region” column? I'd like a 1-column data.table.

Try DT[ , .(region)] instead. .() is an alias for list() and ensures a data.table is returned.

Also continue reading and see the FAQ after next. Skim whole documents before getting stuck in one part.

Why does DT[ , x, y, z] not work? I wanted the 3 columns x,y and z.

The j expression is the 2nd argument. Try DT[ , c("x","y","z")] or DT[ , .(x,y,z)].

I assigned a variable mycol = "x" but then DT[ , mycol] returns "x". How do I get it to look up the column name contained in the mycol variable?

In v1.9.8 released Nov 2016 there is an ability to turn on new behaviour: options(datatable.WhenJisSymbolThenCallingScope=TRUE). It will then work as you expected, just like data.frame. If you are a new user of data.table, you should probably do this. You can place this command in your .Rprofile file so you don't have to remember again. See the long item in release notes about this. The release notes are linked at the top of the data.table homepage: NEWS.

Without turning on that new behavior, what's happening is that the j expression sees objects in the calling scope. The variable mycol does not exist as a column name of DT so data.table then looked in the calling scope and found mycol there and returned its value "x". This is correct behaviour currently. Had mycol been a column name, then that column's data would have been returned. What has been done to date has been DT[ , mycol, with = FALSE] which will return the x column's data as required. That will still work in the future, too. Alternatively, since a data.table is a list, too, you have been and still will be able to write and rely on DT[[mycol]].

What are the benefits of being able to use column names as if they are variables inside DT[...]?

j doesn't have to be just column names. You can write any R expression of column names directly in j, e.g., DT[ , mean(x*y/z)]. The same applies to i, e.g., DT[x>1000, sum(y*z)].

This runs the j expression on the set of rows where the i expression is true. You don't even need to return data, e.g., DT[x>1000, plot(y, z)]. You can do j by group simply by adding by =; e.g., DT[x>1000, sum(y*z), by = w]. This runs j for each group in column w but just over the rows where x>1000. By placing the 3 parts of the query (i=where, j=select and by=group by) inside the square brackets, data.table sees this query as a whole before any part of it is evaluated. Thus it can optimize the combined query for performance. It can do this because the R language uniquely has lazy evaluation (Python and Julia do not). data.table sees the expressions inside DT[...] before they are evaluated and optimizes them before evaluation. For example, if data.table sees that you're only using 2 columns out of 100, it won't bother to subset the 98 that aren't needed by your j expression.

OK, I'm starting to see what data.table is about, but why didn't you just enhance data.frame in R? Why does it have to be a new package?

As highlighted above, j in [.data.table is fundamentally different from j in [.data.frame. Even if something as simple as DF[ , 1] was changed in base R to return a data.frame rather than a vector, that would break existing code in many 1000's of CRAN packages and user code. As soon as we took the step to create a new class that inherited from data.frame, we had the opportunity to change a few things and we did. We want data.table to be slightly different and to work this way for more complicated syntax to work. There are other differences, too (see below).

Furthermore, data.table inherits from data.frame. It is a data.frame, too. A data.table can be passed to any package that only accepts data.frame and that package can use [.data.frame syntax on the data.table. See this answer for how that is achieved.

We have proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0 :

unique() and match() are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c.

A second proposal was to use memcpy in duplicate.c, which is much faster than a for loop in C. This would improve the way that R copies data internally (on some measures by 13 times). The thread on r-devel is here.

A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 :

The radix sort algorithm and implementation from data.table (forder) replaces the previous radix (counting) sort and adds a new method for order(). Contributed by Matt Dowle and Arun Srinivasan, the new algorithm supports logical, integer (even with large values), real, and character vectors. It outperforms all other methods, but there are some caveats (see ?sort).

This was a big event for us and we celebrated until the cows came home. (Not really.)

Why are the defaults the way they are? Why does it work the way it does?

The simple answer is because the main author originally designed it for his own use. He wanted it that way. He finds it a more natural, faster way to write code, which also executes more quickly.

Isn't this already done by with() and subset() in base?

Some of the features discussed so far are, yes. The package builds upon base functionality. It does the same sorts of things but with less code required and executes many times faster if used correctly.

Why does X[Y] return all the columns from Y too? Shouldn't it return a subset of X?

This was changed in v1.5.3 (Feb 2011). Since then X[Y] includes Y's non-join columns. We refer to this feature as join inherited scope because not only are X columns available to the j expression, so are Y columns. The downside is that X[Y] is less efficient since every item of Y's non-join columns is duplicated to match the (likely large) number of rows in X that match. We therefore strongly encourage X[Y, j] instead of X[Y]. See next FAQ.

What is the difference between X[Y] and merge(X, Y)? {#MergeDiff}

X[Y] is a join, looking up X's rows using Y (or Y's key if it has one) as an index.

Y[X] is a join, looking up Y's rows using X (or X's key if it has one) as an index.

merge(X,Y)[1] does both ways at the same time. The number of rows of X[Y] and Y[X] usually differ, whereas the number of rows returned by merge(X, Y) and merge(Y, X) is the same.

BUT that misses the main point. Most tasks require something to be done on the data after a join or merge. Why merge all the columns of data, only to use a small subset of them afterwards? You may suggest merge(X[ , ColsNeeded1], Y[ , ColsNeeded2]), but that requires the programmer to work out which columns are needed. X[Y, j] in data.table does all that in one step for you. When you write X[Y, sum(foo*bar)], data.table automatically inspects the j expression to see which columns it uses. It will subset those columns only; the others are ignored. Memory is only created for the columns j uses and Y columns enjoy standard R recycling rules within the context of each group. Let's say foo is in X and bar is in Y (along with 20 other columns in Y). Isn't X[Y, sum(foo*bar)] quicker to program and quicker to run than a merge of everything wastefully followed by a subset?

[1]: Here we mean either the merge method for data.table or the merge method for data.frame since both methods work in the same way in this respect. See ?merge.data.table and below for more information about method dispatch.

Anything else about X[Y, sum(foo*bar)]?

This behaviour changed in v1.9.4 (Sep 2014). It now does the X[Y] join and then runs sum(foo*bar) over all the rows; i.e., X[Y][ , sum(foo*bar)]. It used to run j for each group of X that each row of Y matches to. That can still be done as it's very useful but you now need to be explicit and specify by = .EACHI, i.e., X[Y, sum(foo*bar), by = .EACHI]. We call this grouping by each i.

For example, (further complicating it by using join inherited scope, too):

X = data.table(grp = c("a", "a", "b",
                       "b", "b", "c", "c"), foo = 1:7)
setkey(X, grp)
Y = data.table(c("b", "c"), bar = c(4, 2))
X
#    grp foo
# 1:   a   1
# 2:   a   2
# 3:   b   3
# 4:   b   4
# 5:   b   5
# 6:   c   6
# 7:   c   7
Y
#    V1 bar
# 1:  b   4
# 2:  c   2
X[Y, sum(foo*bar)]
# [1] 74
X[Y, sum(foo*bar), by = .EACHI]
#    grp V1
# 1:   b 48
# 2:   c 26

That's nice. How did you manage to change it given that users depended on the old behaviour?

The request to change came from users. The feeling was that if a query is doing grouping then an explicit by= should be present for code readability reasons. An option was provided to return the old behaviour: options(datatable.old.bywithoutby), by default FALSE. This enabled upgrading to test the other new features / bug fixes in v1.9.4, with later migration of any by-without-by queries when ready by adding by=.EACHI to them. We retained 47 pre-change tests and added them back as new tests, tested under options(datatable.old.bywithoutby=TRUE). We added a startup message about the change and how to revert to the old behaviour. After 1 year the option was deprecated with warning when used. After 2 years the option to revert to old behaviour was removed.

Of the 66 packages on CRAN or Bioconductor that depended on or import data.table at the time of releasing v1.9.4 (it is now over 300), only one was affected by the change. That could be because many packages don't have comprehensive tests, or just that grouping by each row in i wasn't being used much by downstream packages. We always test the new version with all dependent packages before release and coordinate any changes with those maintainers. So this release was quite straightforward in that regard.

Another compelling reason to make the change was that previously, there was no efficient way to achieve what X[Y, sum(foo*bar)] does now. You had to write X[Y][ , sum(foo*bar)]. That was suboptimal because X[Y] joined all the columns and passed them all to the second compound query without knowing that only foo and bar are needed. To solve that efficiency problem, extra programming effort was required: X[Y, list(foo, bar)][ , sum(foo*bar)]. The change to by = .EACHI has simplified this by allowing both queries to be expressed inside a single DT[...] query for efficiency.

General Syntax

How can I avoid writing a really long j expression? You've said that I should use the column names, but I've got a lot of columns.

When grouping, the j expression can use column names as variables, as you know, but it can also use a reserved symbol .SD which refers to the Subset of the Data.table for each group (excluding the grouping columns). So to sum up all your columns it's just DT[ , lapply(.SD, sum), by = grp]. It might seem tricky, but it's fast to write and fast to run. Notice you don't have to create an anonymous function. The .SD object is efficiently implemented internally and more efficient than passing an argument to a function. But if the .SD symbol appears in j then data.table has to populate .SD fully for each group even if j doesn't use all of it.

So please don't do, for example, DT[ , sum(.SD[["sales"]]), by = grp]. That works but is inefficient and inelegant. DT[ , sum(sales), by = grp] is what was intended, and it could be 100s of times faster. If you use all of the data in .SD for each group (such as in DT[ , lapply(.SD, sum), by = grp]) then that's very good usage of .SD. If you're using several but not all of the columns, you can combine .SD with .SDcols; see ?data.table.

Why is the default for mult now "all"?

In v1.5.3 the default was changed to "all". When i (or i's key if it has one) has fewer columns than x's key, mult was already set to "all" automatically. Changing the default makes this clearer and easier for users as it came up quite often.

In versions up to v1.3, "all" was slower. Internally, "all" was implemented by joining using "first", then again from scratch using "last", after which a diff between them was performed to work out the span of the matches in x for each row in i. Most often we join to single rows, though, where "first","last" and "all" return the same result. We preferred maximum performance for the majority of situations so the default chosen was "first". When working with a non-unique key (generally a single column containing a grouping variable), DT["A"] returned the first row of that group so DT["A", mult = "all"] was needed to return all the rows in that group.

In v1.4 the binary search in C was changed to branch at the deepest level to find first and last. That branch will likely occur within the same final pages of RAM so there should no longer be a speed disadvantage in defaulting mult to "all". We warned that the default might change and made the change in v1.5.3.

A future version of data.table may allow a distinction between a key and a unique key. Internally mult = "all" would perform more like mult = "first" when all x's key columns were joined to and x's key was a unique key. data.table would need checks on insert and update to make sure a unique key is maintained. An advantage of specifying a unique key would be that data.table would ensure no duplicates could be inserted, in addition to performance.

I'm using c() in j and getting strange results.

This is a common source of confusion. In data.frame you are used to, for example:

DF = data.frame(x = 1:3, y = 4:6, z = 7:9)
DF
#   x y z
# 1 1 4 7
# 2 2 5 8
# 3 3 6 9
DF[ , c("y", "z")]
#   y z
# 1 4 7
# 2 5 8
# 3 6 9

which returns the two columns. In data.table you know you can use the column names directly and might try:

DT = data.table(DF)
DT[ , c(y, z)]
# [1] 4 5 6 7 8 9

but this returns one vector. Remember that the j expression is evaluated within the environment of DT and c() returns a vector. If 2 or more columns are required, use list() or .() instead:

DT[ , .(y, z)]
#    y z
# 1: 4 7
# 2: 5 8
# 3: 6 9

c() can be useful in a data.table too, but its behaviour is different from that in [.data.frame.

I have built up a complex table with many columns. I want to use it as a template for a new table; i.e., create a new table with no rows, but with the column names and types copied from my table. Can I do that easily?

Yes. If your complex table is called DT, try NEWDT = DT[0].

Is a null data.table the same as DT[0]?

No. By “null data.table” we mean the result of data.table(NULL) or as.data.table(NULL); i.e.,

data.table(NULL)
# Null data.table (0 rows and 0 cols)
data.frame(NULL)
# data frame with 0 columns and 0 rows
as.data.table(NULL)
# Null data.table (0 rows and 0 cols)
as.data.frame(NULL)
# data frame with 0 columns and 0 rows
is.null(data.table(NULL))
# [1] FALSE
is.null(data.frame(NULL))
# [1] FALSE

The null data.table|frame is NULL with some attributes attached, which means it's no longer NULL. In R only pure NULL is NULL as tested by is.null(). When referring to the “null data.table” we use lower case null to help distinguish from upper case NULL. To test for the null data.table, use length(DT) == 0 or ncol(DT) == 0 (length is slightly faster as it's a primitive function).

An empty data.table (DT[0]) has one or more columns, all of which are empty. Those empty columns still have names and types.

DT = data.table(a = 1:3, b = c(4, 5, 6), d = c(7L,8L,9L))
DT[0]
# Empty data.table (0 rows) of 3 cols: a,b,d
sapply(DT[0], class)
#         a         b         d 
# "integer" "numeric" "integer"

Why has the DT() alias been removed? {#DTremove1}

DT was introduced originally as a wrapper for a list of j expressions. Since DT was an alias for data.table, this was a convenient way to take care of silent recycling in cases where each item of the j list evaluated to different lengths. The alias was one reason grouping was slow, though.

As of v1.3, list() or .() should be passed instead to the j argument. These are much faster, especially when there are many groups. Internally, this was a nontrivial change. Vector recycling is now done internally, along with several other speed enhancements for grouping.

But my code uses j = DT(...) and it works. The previous FAQ says that DT() has been removed. {#DTremove2}

Then you are using a version prior to 1.5.3. Prior to 1.5.3 [.data.table detected use of DT() in the j and automatically replaced it with a call to list(). This was to help the transition for existing users.

What are the scoping rules for j expressions?

Think of the subset as an environment where all the column names are variables. When a variable foo is used in the j of a query such as X[Y, sum(foo)], foo is looked for in the following order :

  1. The scope of X's subset; i.e., X's column names.
  2. The scope of each row of Y; i.e., Y's column names (join inherited scope)
  3. The scope of the calling frame; e.g., the line that appears before the data.table query.
  4. Exercise for reader: does it then ripple up the calling frames, or go straight to globalenv()?
  5. The global environment

This is lexical scoping as explained in R FAQ 3.3.1. The environment in which the function was created is not relevant, though, because there is no function. No anonymous function is passed to j. Instead, an anonymous body is passed to j; for example,

DT = data.table(x = rep(c("a", "b"), c(2, 3)), y = 1:5)
DT
#    x y
# 1: a 1
# 2: a 2
# 3: b 3
# 4: b 4
# 5: b 5
DT[ , {z = sum(y); z + 3}, by = x]
#    x V1
# 1: a  6
# 2: b 15

Some programming languages call this a lambda.

Can I trace the j expression as it runs through the groups? {#j-trace}

Try something like this:

DT[ , {
  cat("Objects:", paste(objects(), collapse = ","), "\n")
  cat("Trace: x=", as.character(x), " y=", y, "\n")
  sum(y)},
  by = x]
# Objects: Cfastmean,mean,print,strptime,x,y 
# Trace: x= a  y= 1 2 
# Objects: Cfastmean,mean,print,strptime,x,y 
# Trace: x= b  y= 3 4 5
#    x V1
# 1: a  3
# 2: b 12

Inside each group, why are the group variables length-1?

Above, x is a grouping variable and (as from v1.6.1) has length 1 (if inspected or used in j). It's for efficiency and convenience. Therefore, there is no difference between the following two statements:

DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x, sum(y)), by = x]
#    x g h i j repeatgroupname V6
# 1: a 1 2 3 4               a  3
# 2: b 1 2 3 4               b 12
DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x[1], sum(y)), by = x]
#    x g h i j repeatgroupname V6
# 1: a 1 2 3 4               a  3
# 2: b 1 2 3 4               b 12

If you need the size of the current group, use .N rather than calling length() on any column.

Only the first 10 rows are printed, how do I print more?

There are two things happening here. First, if the number of rows in a data.table is large (> 100 by default), then a summary of the data.table is printed to the console by default. Second, the summary of a large data.table is printed by taking the top and bottom n (= 5 by default) rows of the data.table and only printing those. Both of these parameters (when to trigger a summary and how much of a table to use as a summary) are configurable by R's options mechanism, or by calling the print function directly.

For instance, to enforce the summary of a data.table to only happen when a data.table is greater than 50 rows, you could options(datatable.print.nrows = 50). To disable the summary-by-default completely, you could options(datatable.print.nrows = Inf). You could also call print directly, as in print(your.data.table, nrows = Inf).

If you want to show more than just the top (and bottom) 10 rows of a data.table summary (say you like 20), set options(datatable.print.topn = 20), for example. Again, you could also just call print directly, as in print(your.data.table, topn = 20).

With an X[Y] join, what if X contains a column called "Y"?

When i is a single name such as Y it is evaluated in the calling frame. In all other cases such as calls to .() or other expressions, i is evaluated within the scope of X. This facilitates easy self-joins such as X[J(unique(colA)), mult = "first"].

X[Z[Y]] is failing because X contains a column "Y". I'd like it to use the table Y in calling scope.

The Z[Y] part is not a single name so that is evaluated within the frame of X and the problem occurs. Try tmp = Z[Y]; X[tmp]. This is robust to X containing a column "tmp" because tmp is a single name. If you often encounter conflicts of this type, one simple solution may be to name all tables in uppercase and all column names in lowercase, or some similar scheme.

Can you explain further why data.table is inspired by A[B] syntax in base?

Consider A[B] syntax using an example matrix A :

A = matrix(1:12, nrow = 4)
A
#      [,1] [,2] [,3]
# [1,]    1    5    9
# [2,]    2    6   10
# [3,]    3    7   11
# [4,]    4    8   12

To obtain cells (1, 2) = 5 and (3, 3) = 11 many users (we believe) may try this first :

A[c(1, 3), c(2, 3)]
#      [,1] [,2]
# [1,]    5    9
# [2,]    7   11

However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. ?Extract says :

When indexing arrays by [ a single argument i can be a matrix with as many columns as there are dimensions of x; the result is then a vector with elements corresponding to the sets of indices in each row of i.

Let's try again.

B = cbind(c(1, 3), c(2, 3))
B
#      [,1] [,2]
# [1,]    1    2
# [2,]    3    3
A[B]
# [1]  5 11

A matrix is a 2-dimensional structure with row names and column names. Can we do the same with names?

rownames(A) = letters[1:4]
colnames(A) = LETTERS[1:3]
A
#   A B  C
# a 1 5  9
# b 2 6 10
# c 3 7 11
# d 4 8 12
B = cbind(c("a", "c"), c("B", "C"))
A[B]
# [1]  5 11

So yes, we can. Can we do the same with a data.frame?

A = data.frame(A = 1:4, B = letters[11:14], C = pi*1:4)
rownames(A) = letters[1:4]
A
#   A B         C
# a 1 k  3.141593
# b 2 l  6.283185
# c 3 m  9.424778
# d 4 n 12.566371
B
#      [,1] [,2]
# [1,] "a"  "B" 
# [2,] "c"  "C"
A[B]
# [1] "k"         " 9.424778"

But, notice that the result was coerced to character. R coerced A to matrix first so that the syntax could work, but the result isn't ideal. Let's try making B a data.frame.

B = data.frame(c("a", "c"), c("B", "C"))
cat(try(A[B], silent = TRUE))
# Error in `[.default`(A, B) : invalid subscript type 'list'

So we can't subset a data.frame by a data.frame in base R. What if we want row names and column names that aren't character but integer or float? What if we want more than 2 dimensions of mixed types? Enter data.table.

Furthermore, matrices, especially sparse matrices, are often stored in a 3-column tuple: (i, j, value). This can be thought of as a key-value pair where i and j form a 2-column key. If we have more than one value, perhaps of different types, it might look like (i, j, val1, val2, val3, ...). This looks very much like a data.frame. Hence data.table extends data.frame so that a data.frame X can be subset by a data.frame Y, leading to the X[Y] syntax.

Can base be changed to do this then, rather than a new package?

data.frame is used everywhere and so it is very difficult to make any changes to it. data.table inherits from data.frame. It is a data.frame, too. A data.table can be passed to any package that only accepts data.frame. When that package uses [.data.frame syntax on the data.table, it works. It works because [.data.table looks to see where it was called from. If it was called from such a package, [.data.table diverts to [.data.frame.

I've heard that data.table syntax is analogous to SQL.

Yes :

  • i \(\Leftrightarrow\) where
  • j \(\Leftrightarrow\) select
  • := \(\Leftrightarrow\) update
  • by \(\Leftrightarrow\) group by
  • i \(\Leftrightarrow\) order by (in compound syntax)
  • i \(\Leftrightarrow\) having (in compound syntax)
  • nomatch = NA \(\Leftrightarrow\) outer join
  • nomatch = 0L \(\Leftrightarrow\) inner join
  • mult = "first"|"last" \(\Leftrightarrow\) N/A because SQL is inherently unordered
  • roll = TRUE \(\Leftrightarrow\) N/A because SQL is inherently unordered

The general form is :

DT[where, select|update, group by][order by][...] ... [...]

A key advantage of column vectors in R is that they are ordered, unlike SQL[2]. We can use ordered functions in data.table queries such as diff() and we can use any R function from any package, not just the functions that are defined in SQL. A disadvantage is that R objects must fit in memory, but with several R packages such as ff, bigmemory, mmap and indexing, this is changing.

[2]: It may be a surprise to learn that select top 10 * from ... does not reliably return the same rows over time in SQL. You do need to include an order by clause, or use a clustered index to guarantee row order; i.e., SQL is inherently unordered.

What are the smaller syntax differences between data.frame and data.table? {#SmallerDiffs}

  • DT[3] refers to the 3rd row, but DF[3] refers to the 3rd column
  • DT[3, ] == DT[3], but DF[ , 3] == DF[3] (somewhat confusingly in data.frame, whereas data.table is consistent)
  • For this reason we say the comma is optional in DT, but not optional in DF
  • DT[[3]] == DF[3] == DF[[3]]
  • DT[i, ], where i is a single integer, returns a single row, just like DF[i, ], but unlike a matrix single-row subset which returns a vector.
  • DT[ , j] where j is a single integer returns a one-column data.table, unlike DF[, j] which returns a vector by default
  • DT[ , "colA"][[1]] == DF[ , "colA"].
  • DT[ , colA] == DF[ , "colA"] (currently in data.table v1.9.8 but is about to change, see release notes)
  • DT[ , list(colA)] == DF[ , "colA", drop = FALSE]
  • DT[NA] returns 1 row of NA, but DF[NA] returns an entire copy of DF containing NA throughout. The symbol NA is type logical in R and is therefore recycled by [.data.frame. The user's intention was probably DF[NA_integer_]. [.data.table diverts to this probable intention automatically, for convenience.
  • DT[c(TRUE, NA, FALSE)] treats the NA as FALSE, but DF[c(TRUE, NA, FALSE)] returns NA rows for each NA
  • DT[ColA == ColB] is simpler than DF[!is.na(ColA) & !is.na(ColB) & ColA == ColB, ]
  • data.frame(list(1:2, "k", 1:4)) creates 3 columns, data.table creates one list column.
  • check.names is by default TRUE in data.frame but FALSE in data.table, for convenience.
  • stringsAsFactors is by default TRUE in data.frame but FALSE in data.table, for efficiency. Since a global string cache was added to R, character items are pointers to the single cached string and there is no longer a performance benefit from converting to factor.
  • Atomic vectors in list columns are collapsed when printed using ", " in data.frame, but "," in data.table with a trailing comma after the 6th item to avoid accidental printing of large embedded objects.

In [.data.frame we very often set drop = FALSE. When we forget, bugs can arise in edge cases where single columns are selected and all of a sudden a vector is returned rather than a single column data.frame. In [.data.table we took the opportunity to make it consistent and dropped drop.

When a data.table is passed to a data.table-unaware package, that package is not concerned with any of these differences; it just works.

I'm using j for its side effect only, but I'm still getting data returned. How do I stop that?

In this case j can be wrapped with invisible(); e.g., DT[ , invisible(hist(colB)), by = colA][3]

[3]: e.g., hist() returns the breakpoints in addition to plotting to the graphics device.

Why does [.data.table now have a drop argument from v1.5?

So that data.table can inherit from data.frame without using .... If we used ... then invalid argument names would not be caught.

The drop argument is never used by [.data.table. It is a placeholder for non-data.table-aware packages when they use the [.data.frame syntax directly on a data.table.

Rolling joins are cool and very fast! Was that hard to program?

The prevailing row on or before the i row is the final row the binary search tests anyway. So roll = TRUE is essentially just a switch in the binary search C code to return that row.

Why does DT[i, col := value] return the whole of DT? I expected either no visible value (consistent with <-), or a message or return value containing how many rows were updated. It isn't obvious that the data has indeed been updated by reference.

This has changed in v1.8.3 to meet your expectations. Please upgrade.

The whole of DT is returned (now invisibly) so that compound syntax can work; e.g., DT[i, done := TRUE][ , sum(done)]. The number of rows updated is returned when verbose is TRUE, either on a per-query basis or globally using options(datatable.verbose = TRUE).

OK, thanks. What was so difficult about the result of DT[i, col := value] being returned invisibly?

R internally forces visibility on for [. The value of FunTab's eval column (see src/main/names.c) for [ is 0 meaning “force R_Visible on” (see R-Internals section 1.6). Therefore, when we tried invisible() or setting R_Visible to 0 directly ourselves, eval in src/main/eval.c would force it on again.

To solve this problem, the key was to stop trying to stop the print method running after a :=. Instead, inside := we now (from v1.8.3) set a global flag which the print method uses to know whether to actually print or not.

Why do I have to type DT sometimes twice after using := to print the result to console?

This is an unfortunate downside of getting #869 to work. If a := is used inside a function with no DT[] before the end of the function, then the next time DT is typed at the prompt, nothing will be printed. A repeated DT will print. To avoid this: include a DT[] after the last := in your function. If that is not possible (e.g., it's not a function you can change) then print(DT) and DT[] at the prompt are guaranteed to print. As before, adding an extra [] on the end of a := query is a recommended idiom to update and then print; e.g., DT[ , foo := 3L][].

I've noticed that base::cbind.data.frame (and base::rbind.data.frame) appear to be changed by data.table. How is this possible? Why?

It is a temporary, last resort solution until we discover a better way to solve the problems listed below. Essentially, the issue is that data.table inherits from data.frame, and base::cbind and base::rbind (uniquely) do their own S3 dispatch internally as documented by ?cbind. The change is adding one for loop to the start of each function directly in base; e.g.,

base::cbind.data.frame
# function (..., deparse.level = 1) 
# {
#     if (!identical(class(..1), "data.frame")) 
#         for (x in list(...)) {
#             if (inherits(x, "data.table")) 
#                 return(data.table::data.table(...))
#         }
#     data.frame(..., check.names = FALSE)
# }
# <environment: namespace:base>

That modification is made dynamically, i.e., the base definition of cbind.data.frame is fetched, the for loop added to the beginning and then assigned back to base. This solution is intended to be robust to different definitions of base::cbind.data.frame in different versions of R, including unknown future changes. Again, it is a last resort until a better solution is known or made available. The competing requirements are:

  • cbind(DT, DF) needs to work. Defining cbind.data.table doesn't work because base::cbind does its own S3 dispatch and requires that the first cbind method for each object it is passed is identical. This is not true in cbind(DT, DF) because the first method for DT is cbind.data.table but the first method for DF is cbind.data.frame. base::cbind then falls through to its internal bind code which appears to treat DT as a regular list and returns very odd looking and unusable matrix output. See below. We cannot just advise users not to call cbind(DT, DF) because packages such as ggplot2 make such a call (test 167.2).

  • This naturally leads to trying to mask cbind.data.frame instead. Since a data.table is a data.frame, cbind would find the same method for both DT and DF. However, this doesn't work either because base::cbind appears to find methods in base first; i.e., base::cbind.data.frame isn't maskable. This is reproducible as follows :

foo = data.frame(a = 1:3)
cbind.data.frame = function(...) cat("Not printed\n")
cbind(foo)
#   a
# 1 1
# 2 2
# 3 3
rm("cbind.data.frame")
  • Finally, we tried masking cbind itself (v1.6.5 and v1.6.6). This allowed cbind(DT, DF) to work, but introduced compatibility issues with package IRanges, since IRanges also masks cbind. It worked if IRanges was lower on the search() path than data.table, but if IRanges was higher, then data.table's cbind would never be called and the strange-looking matrix output would occur again (see below).

If you know of a better solution that still solves all the issues above, then please let us know and we'll gladly change it.

I've read about method dispatch (e.g. merge may or may not dispatch to merge.data.table) but how does R know how to dispatch? Are dots significant or special? How on earth does R know which function to dispatch and when? {#r-dispatch}

This comes up quite a lot but it's really earth-shatteringly simple. A function such as merge is generic if it consists of a call to UseMethod. When you see people talking about whether or not functions are generic functions they are merely typing the function without () afterwards, looking at the program code inside it and if they see a call to UseMethod then it is generic. What does UseMethod do? It literally slaps the function name together with the class of the first argument, separated by period (.) and then calls that function, passing along the same arguments. It's that simple. For example, merge(X, Y) contains a UseMethod call which means it then dispatches (i.e. calls) paste("merge", class(X), sep = "."). Functions with dots in their name may or may not be methods. The dot is irrelevant really, other than dot being the separator that UseMethod uses. Knowing this background should now highlight why, for example, it is obvious to R folk that as.data.table.data.frame is the data.frame method for the as.data.table generic function. Further, it may help to elucidate that, yes, you are correct, it is not obvious from its name alone that ls.fit is not the fit method of the ls generic function. You only know that by typing ls (not ls()) and observing it isn't a single call to UseMethod.

You might now ask: where is this documented in R? Answer: it's quite clear, but, you need to first know to look in ?UseMethod and that help file contains :

When a function calling UseMethod('fun') is applied to an object with class attribute c('first', 'second'), the system searches for a function called fun.first and, if it finds it, applies it to the object. If no such function is found a function called fun.second is tried. If no class name produces a suitable function, the function fun.default is used, if it exists, or an error results.

Happily, an internet search for “How does R method dispatch work” (at the time of this writing) returns the ?UseMethod help page in the top few links. Admittedly, other links rapidly descend into the intricacies of S3 vs S4, internal generics and so on.

However, features like basic S3 dispatch (pasting the function name together with the class name) are why some R folk love R. It's so simple. No complicated registration or signature is required. There isn't much needed to learn. To create the merge method for data.table all that was required, literally, was to merely create a function called merge.data.table.

Questions relating to compute time

I have 20 columns and a large number of rows. Why is an expression of one column so quick?

Several reasons:

  • Only that column is grouped, the other 19 are ignored because data.table inspects the j expression and realises it doesn't use the other columns.
  • One memory allocation is made for the largest group only, then that memory is re-used for the other groups. There is very little garbage to collect.
  • R is an in-memory column store; i.e., the columns are contiguous in RAM. Page fetches from RAM into L2 cache are minimised.

I don't have a key on a large table, but grouping is still really quick. Why is that?

data.table uses radix sorting. This is significantly faster than other sort algorithms. See our presentations on our homepage for more information.

This is also one reason why setkey() is quick.

When no key is set, or we group in a different order from that of the key, we call it an ad hoc by.

Why is grouping by columns in the key faster than an ad hoc by?

Because each group is contiguous in RAM, thereby minimising page fetches and memory can be copied in bulk (memcpy in C) rather than looping in C.

What are primary and secondary indexes in data.table?

Manual: ?setkey S.O. : What is the purpose of setting a key in data.table?

setkey(DT, col1, col2) orders the rows by column col1 then within each group of col1 it orders by col2. This is a primary index. The row order is changed by reference in RAM. Subsequent joins and groups on those key columns then take advantage of the sort order for efficiency. (Imagine how difficult looking for a phone number in a printed telephone directory would be if it wasn't sorted by surname then forename. That's literally all setkey does. It sorts the rows by the columns you specify.) The index doesn't use any RAM. It simply changes the row order in RAM and marks the key columns. Analogous to a clustered index in SQL.

However, you can only have one primary key because data can only be physically sorted in RAM in one way at a time. Choose the primary index to be the one you use most often (e.g. [id,date]). Sometimes there isn't an obvious choice for the primary key or you need to join and group many different columns in different orders. Enter a secondary index. This does use memory (4*nrow bytes regardless of the number of columns in the index) to store the order of the rows by the columns you specify, but doesn't actually reorder the rows in RAM. Subsequent joins and groups take advantage of the secondary key's order but need to hop via that index so aren't as efficient as primary indexes. But still, a lot faster than a full vector scan. There is no limit to the number of secondary indexes since each one is just a different ordering vector. Typically you don't need to create secondary indexes. They are created automatically and used for you automatically by using data.table normally; e.g. DT[someCol == someVal, ] and DT[someCol %in% someVals, ] will create, attach and then use the secondary index. This is faster in data.table than a vector scan so automatic indexing is on by default since there is no up-front penalty. There is an option to turn off automatic indexing; e.g., if somehow many indexes are being created and even the relatively small amount of extra memory becomes too large.

We use the words index and key interchangeably.

Error messages

“Could not find function DT”

See above here and here.

“unused argument(s) (MySum = sum(v))”

This error is generated by DT[ , MySum = sum(v)]. DT[ , .(MySum = sum(v))] was intended, or DT[ , j = .(MySum = sum(v))].

translateCharUTF8 must be called on a CHARSXP

This error (and similar, e.g., “getCharCE must be called on a CHARSXP”) may be nothing to do with character data or locale. Instead, this can be a symptom of an earlier memory corruption. To date these have been reproducible and fixed (quickly). Please report it to our issues tracker.

cbind(DT, DF) returns a strange format, e.g. Integer,5 {#cbinderror}

This occurs prior to v1.6.5, for rbind(DT, DF) too. Please upgrade to v1.6.7 or later.

“cannot change value of locked binding for .SD”

.SD is locked by design. See ?data.table. If you'd like to manipulate .SD before using it, or returning it, and don't wish to modify DT using :=, then take a copy first (see ?copy), e.g.,

DT = data.table(a = rep(1:3, 1:3), b = 1:6, c = 7:12)
DT
#    a b  c
# 1: 1 1  7
# 2: 2 2  8
# 3: 2 3  9
# 4: 3 4 10
# 5: 3 5 11
# 6: 3 6 12
DT[ , { mySD = copy(.SD)
      mySD[1, b := 99L]
      mySD},
    by = a]
#    a  b  c
# 1: 1 99  7
# 2: 2 99  8
# 3: 2  3  9
# 4: 3 99 10
# 5: 3  5 11
# 6: 3  6 12

“cannot change value of locked binding for .N”

Please upgrade to v1.8.1 or later. From this version, if .N is returned by j it is renamed to N to avoid any ambiguity in any subsequent grouping between the .N special variable and a column called ".N".

The old behaviour can be reproduced by forcing .N to be called .N, like this :

DT = data.table(a = c(1,1,2,2,2), b = c(1,2,2,2,1))
DT
#    a b
# 1: 1 1
# 2: 1 2
# 3: 2 2
# 4: 2 2
# 5: 2 1
DT[ , list(.N = .N), list(a, b)]   # show intermediate result for exposition
#    a b .N
# 1: 1 1  1
# 2: 1 2  1
# 3: 2 2  2
# 4: 2 1  1
cat(try(
    DT[ , list(.N = .N), by = list(a, b)][ , unique(.N), by = a]   # compound query more typical
, silent = TRUE))
# Error in `[.data.table`(DT[, list(.N = .N), by = list(a, b)], , unique(.N),  : 
#   The column '.N' can't be grouped because it conflicts with the special .N variable. Try setnames(DT,'.N','N') first.

If you are already running v1.8.1 or later then the error message is now more helpful than the “cannot change value of locked binding” error, as you can see above, since this vignette was produced using v1.8.1 or later.

The more natural syntax now works :

if (packageVersion("data.table") >= "1.8.1") {
    DT[ , .N, by = list(a, b)][ , unique(N), by = a]
  }
#    a V1
# 1: 1  1
# 2: 2  2
# 3: 2  1
if (packageVersion("data.table") >= "1.9.3") {
    DT[ , .N, by = .(a, b)][ , unique(N), by = a]   # same
}
#    a V1
# 1: 1  1
# 2: 2  2
# 3: 2  1

Warning messages

“The following object(s) are masked from package:base: cbind, rbind”

This warning was present in v1.6.5 and v1.6.6 only, when loading the package. The motivation was to allow cbind(DT, DF) to work, but as it transpired, this broke (full) compatibility with package IRanges. Please upgrade to v1.6.7 or later.

“Coerced numeric RHS to integer to match the column's type”

Hopefully, this is self explanatory. The full message is:

Coerced numeric RHS to integer to match the column's type; may have truncated precision. Either change the column to numeric first by creating a new numeric vector length 5 (nrows of entire table) yourself and assigning that (i.e. 'replace' column), or coerce RHS to integer yourself (e.g. 1L or as.integer) to make your intent clear (and for speed). Or, set the column type correctly up front when you create the table and stick to it, please.

To generate it, try :

DT = data.table(a = 1:5, b = 1:5)
suppressWarnings(
DT[2, b := 6]         # works (slower) with warning
)
class(6)              # numeric not integer
# [1] "numeric"
DT[2, b := 7L]        # works (faster) without warning
class(7L)             # L makes it an integer
# [1] "integer"
DT[ , b := rnorm(5)]  # 'replace' integer column with a numeric column

Reading data.table from RDS or RData file

*.RDS and *.RData are file types which can store in-memory R objects on disk efficiently. However, storing data.table into the binary file loses its column over-allocation. This isn't a big deal – your data.table will be copied in memory on the next by reference operation and throw a warning. Therefore it is recommended to call alloc.col() on each data.table loaded with readRDS() or load() calls.

General questions about the package

v1.3 appears to be missing from the CRAN archive?

That is correct. v1.3 was available on R-Forge only. There were several large changes internally and these took some time to test in development.

Is data.table compatible with S-plus?

Not currently.

  • A few core parts of the package are written in C and use internal R functions and R structures.
  • The package uses lexical scoping which is one of the differences between R and S-plus explained by R FAQ 3.3.1

Is it available for Linux, Mac and Windows?

Yes, for both 32-bit and 64-bit on all platforms. Thanks to CRAN. There are no special or OS-specific libraries used.

I think it's great. What can I do?

Please file suggestions, bug reports and enhancement requests on our issues tracker. This helps make the package better.

Please do star the package on GitHub. This helps encourage the developers and helps other R users find the package.

You can submit pull requests to change the code and/or documentation yourself; see our Contribution Guidelines.

I think it's not great. How do I warn others about my experience?

Please put your vote and comments on Crantastic. Please make it constructive so we have a chance to improve.

I have a question. I know the r-help posting guide tells me to contact the maintainer (not r-help), but is there a larger group of people I can ask?

Yes, there are two options. You can post to datatable-help. It's like r-help, but just for this package. Or the [data.table] tag on Stack Overflow. Feel free to answer questions in those places, too.

Where are the datatable-help archives?

The homepage contains links to the archives in several formats.

I'd prefer not to post on the Issues page, can I mail just one or two people privately?

Sure. You're more likely to get a faster answer from the Issues page or Stack Overflow, though. Further, asking publicly in those places helps build the general knowledge base.

I have created a package that uses data.table. How do I ensure my package is data.table-aware so that inheritance from data.frame works?

Please see this answer.

data.table/inst/doc/datatable-faq.Rmd0000644000175100001440000014626013172210047017151 0ustar hornikusers--- title: "Frequently Asked Questions about data.table" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true number_sections: true vignette: > %\VignetteIndexEntry{Frequently asked questions} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} library(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` The first section, Beginner FAQs, is intended to be read in order, from start to finish. It's just written in a FAQ style to be digested more easily. It isn't really the most frequently asked questions. A better measure for that is looking on Stack Overflow. This FAQ is required reading and considered core documentation. Please do not ask questions on Stack Overflow or raise issues on GitHub until you have read it. We can all tell when you ask that you haven't read it. So if you do ask and haven't read it, don't use your real name. This document has been quickly revised given the changes in v1.9.8 released Nov 2016. Please do submit pull requests to fix mistakes or improvements. If anyone knows why the table of contents comes out so narrow and squashed when displayed by CRAN, please let us know. This document used to be a PDF and we changed it recently to HTML. # Beginner FAQs ## Why do `DT[ , 5]` and `DT[2, 5]` return a 1-column data.table rather than vectors like `data.frame`? {#j-num} For consistency so that when you use data.table in functions that accept varying inputs, you can rely on `DT[...]` returning a data.table. You don't have to remember to include `drop=FALSE` like you do in data.frame. data.table was first released in 2006 and this difference to data.frame has been a feature since the very beginning. You may have heard that it is generally bad practice to refer to columns by number rather than name, though. 
If your colleague comes along and reads your code later they may have to hunt around to find out which column is number 5. If you or they change the column ordering higher up in your R program, you may produce wrong results with no warning or error if you forget to change all the places in your code which refer to column number 5. That is your fault not R's or data.table's. It's really really bad. Please don't do it. It's the same mantra as professional SQL developers have: never use `select *`, always explicitly select by column name to at least try to be robust to future changes. Say column 5 is named `"region"` and you really must extract that column as a vector not a data.table. It is more robust to use the column name and write `DT$region` or `DT[["region"]]`; i.e., the same as base R. Using base R's `$` and `[[` on data.table is encouraged. Not when combined with `<-` to assign (use `:=` instead for that) but just to select a single column by name they are encouraged. There are some circumstances where referring to a column by number seems like the only way, such as a sequence of columns. In these situations just like data.frame, you can write `DT[, 5:10]` and `DT[,c(1,4,10)]`. However, again, it is more robust (to future changes in your data's number of and ordering of columns) to use a named range such as `DT[,columnRed:columnViolet]` or name each one `DT[,c("columnRed","columnOrange","columnYellow")]`. It is harder work up front, but you will probably thank yourself and your colleagues might thank you in the future. At least you can say you tried your best to write robust code if something does go wrong. However, what we really want you to do is `DT[,.(columnRed,columnOrange,columnYellow)]`; i.e., use column names as if they are variables directly inside `DT[...]`. You don't have to prefix each column with `DT$` like you do in data.frame. The `.()` part is just an alias for `list()` and you can use `list()` instead if you prefer. 
You can place any R expression of column names, using any R package, returning different types of different lengths, right there. We wanted to encourage you to do that so strongly in the past that we deliberately didn't make `DT[,5]` work at all. Before v1.9.8 released Nov 2016, `DT[,5]` used to just return `5`. The thinking was that we could more simply teach one fact that the parts inside `DT[...]` get evaluated within the frame of DT always (they see column names as if they are variables). And `5` evaluates to `5` so that behaviour was consistent with the single rule. We asked you to go through an extra deliberate hurdle `DT[,5,with=FALSE]` if you really wanted to select a column by name or number. Going forward from Nov 2016, you don't need to use `with=FALSE` and we'll see how greater consistency with data.frame in this regard will help or hinder both new and long-time users. The new users who don't read this FAQ, not even this very first entry, will hopefully not stumble as soon with data.table as they did before if they had expected it to work like data.frame. Hopefully they will not miss out on understanding our intent and recommendation to place expressions of columns inside `DT[i, j, by]`. If they use data.table like data.frame they won't gain any benefits. If you know anyone like that, please give them a friendly nudge to read this document like you are. Reminder: you can place _any_ R expression inside `DT[...]` using column names as if they are variables; e.g., try `DT[, colA*colB/2]`. That does return a vector because you used column names as if they are variables. Wrap with `.()` to return a data.table; i.e. `DT[,.(colA*colB/2)]`. Name it: `DT[,.(myResult = colA*colB/2)]`. And we'll leave it to you to guess how to return two things from this query. It's also quite common to do a bunch of things inside an anonymous body: `DT[, { x<-colA+10; x*x/2 }]` or call another package's function: `DT[ , fitdistr(columnA, "normal")]`. 
## Why does `DT[,"region"]` return a 1-column data.table rather than a vector? See the [answer above](#j-num). Try `DT$region` instead. Or `DT[["region"]]`. ## Why does `DT[, region]` return a vector for the "region" column? I'd like a 1-column data.table. Try `DT[ , .(region)]` instead. `.()` is an alias for `list()` and ensures a data.table is returned. Also continue reading and see the FAQ after next. Skim whole documents before getting stuck in one part. ## Why does `DT[ , x, y, z]` not work? I wanted the 3 columns `x`,`y` and `z`. The `j` expression is the 2nd argument. Try `DT[ , c("x","y","z")]` or `DT[ , .(x,y,z)]`. ## I assigned a variable `mycol = "x"` but then `DT[ , mycol]` returns `"x"`. How do I get it to look up the column name contained in the `mycol` variable? In v1.9.8 released Nov 2016 there is an ability to turn on new behaviour: `options(datatable.WhenJisSymbolThenCallingScope=TRUE)`. It will then work as you expected, just like data.frame. If you are a new user of data.table, you should probably do this. You can place this command in your .Rprofile file so you don't have to remember again. See the long item in release notes about this. The release notes are linked at the top of the data.table homepage: [NEWS](https://github.com/Rdatatable/data.table/blob/master/NEWS.md). Without turning on that new behavior, what's happening is that the `j` expression sees objects in the calling scope. The variable `mycol` does not exist as a column name of `DT` so data.table then looked in the calling scope and found `mycol` there and returned its value `"x"`. This is correct behaviour currently. Had `mycol` been a column name, then that column's data would have been returned. What has been done to date has been `DT[ , mycol, with = FALSE]` which will return the `x` column's data as required. That will still work in the future, too. Alternatively, since a data.table _is_ a `list`, too, you have been and still will be able to write and rely on `DT[[mycol]]`. 
## What are the benefits of being able to use column names as if they are variables inside `DT[...]`? `j` doesn't have to be just column names. You can write any R _expression_ of column names directly in `j`, _e.g._, `DT[ , mean(x*y/z)]`. The same applies to `i`, _e.g._, `DT[x>1000, sum(y*z)]`. This runs the `j` expression on the set of rows where the `i` expression is true. You don't even need to return data, _e.g._, `DT[x>1000, plot(y, z)]`. You can do `j` by group simply by adding `by = `; e.g., `DT[x>1000, sum(y*z), by = w]`. This runs `j` for each group in column `w` but just over the rows where `x>1000`. By placing the 3 parts of the query (i=where, j=select and by=group by) inside the square brackets, data.table sees this query as a whole before any part of it is evaluated. Thus it can optimize the combined query for performance. It can do this because the R language uniquely has lazy evaluation (Python and Julia do not). data.table sees the expressions inside `DT[...]` before they are evaluated and optimizes them before evaluation. For example, if data.table sees that you're only using 2 columns out of 100, it won't bother to subset the 98 that aren't needed by your j expression. ## OK, I'm starting to see what data.table is about, but why didn't you just enhance `data.frame` in R? Why does it have to be a new package? As [highlighted above](#j-num), `j` in `[.data.table` is fundamentally different from `j` in `[.data.frame`. Even if something as simple as `DF[ , 1]` was changed in base R to return a data.frame rather than a vector, that would break existing code in many 1000's of CRAN packages and user code. As soon as we took the step to create a new class that inherited from data.frame, we had the opportunity to change a few things and we did. We want data.table to be slightly different and to work this way for more complicated syntax to work. There are other differences, too (see [below](#SmallerDiffs) ). 
Furthermore, data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table can be passed to any package that only accepts `data.frame` and that package can use `[.data.frame` syntax on the data.table. See [this answer](http://stackoverflow.com/a/10529888/403310) for how that is achieved. We _have_ proposed enhancements to R wherever possible, too. One of these was accepted as a new feature in R 2.12.0 : > `unique()` and `match()` are now faster on character vectors where all elements are in the global CHARSXP cache and have unmarked encoding (ASCII). Thanks to Matt Dowle for suggesting improvements to the way the hash code is generated in unique.c. A second proposal was to use `memcpy` in duplicate.c, which is much faster than a for loop in C. This would improve the _way_ that R copies data internally (on some measures by 13 times). The thread on r-devel is [here](http://tolstoy.newcastle.edu.au/R/e10/devel/10/04/0148.html). A third more significant proposal that was accepted is that R now uses data.table's radix sort code as from R 3.3.0 : > The radix sort algorithm and implementation from data.table (forder) replaces the previous radix (counting) sort and adds a new method for order(). Contributed by Matt Dowle and Arun Srinivasan, the new algorithm supports logical, integer (even with large values), real, and character vectors. It outperforms all other methods, but there are some caveats (see ?sort). This was a big event for us and we celebrated until the cows came home. (Not really.) ## Why are the defaults the way they are? Why does it work the way it does? The simple answer is because the main author originally designed it for his own use. He wanted it that way. He finds it a more natural, faster way to write code, which also executes more quickly. ## Isn't this already done by `with()` and `subset()` in `base`? Some of the features discussed so far are, yes. The package builds upon base functionality. 
It does the same sorts of things but with less code required and executes many times faster if used correctly. ## Why does `X[Y]` return all the columns from `Y` too? Shouldn't it return a subset of `X`? This was changed in v1.5.3 (Feb 2011). Since then `X[Y]` includes `Y`'s non-join columns. We refer to this feature as _join inherited scope_ because not only are `X` columns available to the `j` expression, so are `Y` columns. The downside is that `X[Y]` is less efficient since every item of `Y`'s non-join columns are duplicated to match the (likely large) number of rows in `X` that match. We therefore strongly encourage `X[Y, j]` instead of `X[Y]`. See [next FAQ](#MergeDiff). ## What is the difference between `X[Y]` and `merge(X, Y)`? {#MergeDiff} `X[Y]` is a join, looking up `X`'s rows using `Y` (or `Y`'s key if it has one) as an index. `Y[X]` is a join, looking up `Y`'s rows using `X` (or `X`'s key if it has one) as an index. `merge(X,Y)`[^1] does both ways at the same time. The number of rows of `X[Y]` and `Y[X]` usually differ, whereas the number of rows returned by `merge(X, Y)` and `merge(Y, X)` is the same. _BUT_ that misses the main point. Most tasks require something to be done on the data after a join or merge. Why merge all the columns of data, only to use a small subset of them afterwards? You may suggest `merge(X[ , ColsNeeded1], Y[ , ColsNeeded2])`, but that requires the programmer to work out which columns are needed. `X[Y, j]` in data.table does all that in one step for you. When you write `X[Y, sum(foo*bar)]`, data.table automatically inspects the `j` expression to see which columns it uses. It will subset those columns only; the others are ignored. Memory is only created for the columns `j` uses and `Y` columns enjoy standard R recycling rules within the context of each group. Let's say `foo` is in `X` and `bar` is in `Y` (along with 20 other columns in `Y`). 
Isn't `X[Y, sum(foo*bar)]` quicker to program and quicker to run than a `merge` of everything wastefully followed by a `subset`? [^1]: Here we mean either the `merge` _method_ for data.table or the `merge` method for `data.frame` since both methods work in the same way in this respect. See `?merge.data.table` and [below](#r-dispatch) for more information about method dispatch. ## Anything else about `X[Y, sum(foo*bar)]`? This behaviour changed in v1.9.4 (Sep 2014). It now does the `X[Y]` join and then runs `sum(foo*bar)` over all the rows; i.e., `X[Y][ , sum(foo*bar)]`. It used to run `j` for each _group_ of `X` that each row of `Y` matches to. That can still be done as it's very useful but you now need to be explicit and specify `by = .EACHI`, _i.e._, `X[Y, sum(foo*bar), by = .EACHI]`. We call this _grouping by each `i`_. For example, (further complicating it by using _join inherited scope_, too): ```{r} X = data.table(grp = c("a", "a", "b", "b", "b", "c", "c"), foo = 1:7) setkey(X, grp) Y = data.table(c("b", "c"), bar = c(4, 2)) X Y X[Y, sum(foo*bar)] X[Y, sum(foo*bar), by = .EACHI] ``` ## That's nice. How did you manage to change it given that users depended on the old behaviour? The request to change came from users. The feeling was that if a query is doing grouping then an explicit `by=` should be present for code readability reasons. An option was provided to return the old behaviour: `options(datatable.old.bywithoutby)`, by default `FALSE`. This enabled upgrading to test the other new features / bug fixes in v1.9.4, with later migration of any by-without-by queries when ready by adding `by=.EACHI` to them. We retained 47 pre-change tests and added them back as new tests, tested under `options(datatable.old.bywithoutby=TRUE)`. We added a startup message about the change and how to revert to the old behaviour. After 1 year the option was deprecated with warning when used. After 2 years the option to revert to old behaviour was removed. 
Of the 66 packages on CRAN or Bioconductor that depended on or import data.table at the time of releasing v1.9.4 (it is now over 300), only one was affected by the change. That could be because many packages don't have comprehensive tests, or just that grouping by each row in `i` wasn't being used much by downstream packages. We always test the new version with all dependent packages before release and coordinate any changes with those maintainers. So this release was quite straightforward in that regard. Another compelling reason to make the change was that previously, there was no efficient way to achieve what `X[Y, sum(foo*bar)]` does now. You had to write `X[Y][ , sum(foo*bar)]`. That was suboptimal because `X[Y]` joined all the columns and passed them all to the second compound query without knowing that only `foo` and `bar` are needed. To solve that efficiency problem, extra programming effort was required: `X[Y, list(foo, bar)][ , sum(foo*bar)]`. The change to `by = .EACHI` has simplified this by allowing both queries to be expressed inside a single `DT[...]` query for efficiency. # General Syntax ## How can I avoid writing a really long `j` expression? You've said that I should use the column _names_, but I've got a lot of columns. When grouping, the `j` expression can use column names as variables, as you know, but it can also use a reserved symbol `.SD` which refers to the **S**ubset of the **D**ata.table for each group (excluding the grouping columns). So to sum up all your columns it's just `DT[ , lapply(.SD, sum), by = grp]`. It might seem tricky, but it's fast to write and fast to run. Notice you don't have to create an anonymous function. The `.SD` object is efficiently implemented internally and more efficient than passing an argument to a function. But if the `.SD` symbol appears in `j` then data.table has to populate `.SD` fully for each group even if `j` doesn't use all of it. 
So please don't do, for example, `DT[ , sum(.SD[["sales"]]), by = grp]`. That works but is inefficient and inelegant. `DT[ , sum(sales), by = grp]` is what was intended, and it could be 100s of times faster. If you use _all_ of the data in `.SD` for each group (such as in `DT[ , lapply(.SD, sum), by = grp]`) then that's very good usage of `.SD`. If you're using _several_ but not _all_ of the columns, you can combine `.SD` with `.SDcols`; see `?data.table`. ## Why is the default for `mult` now `"all"`? In v1.5.3 the default was changed to `"all"`. When `i` (or `i`'s key if it has one) has fewer columns than `x`'s key, `mult` was already set to `"all"` automatically. Changing the default makes this clearer and easier for users as it came up quite often. In versions up to v1.3, `"all"` was slower. Internally, `"all"` was implemented by joining using `"first"`, then again from scratch using `"last"`, after which a diff between them was performed to work out the span of the matches in `x` for each row in `i`. Most often we join to single rows, though, where `"first"`,`"last"` and `"all"` return the same result. We preferred maximum performance for the majority of situations so the default chosen was `"first"`. When working with a non-unique key (generally a single column containing a grouping variable), `DT["A"]` returned the first row of that group so `DT["A", mult = "all"]` was needed to return all the rows in that group. In v1.4 the binary search in C was changed to branch at the deepest level to find first and last. That branch will likely occur within the same final pages of RAM so there should no longer be a speed disadvantage in defaulting `mult` to `"all"`. We warned that the default might change and made the change in v1.5.3. A future version of data.table may allow a distinction between a key and a _unique key_. Internally `mult = "all"` would perform more like `mult = "first"` when all `x`'s key columns were joined to and `x`'s key was a unique key. 
data.table would need checks on insert and update to make sure a unique key is maintained. An advantage of specifying a unique key would be that data.table would ensure no duplicates could be inserted, in addition to performance. ## I'm using `c()` in `j` and getting strange results. This is a common source of confusion. In `data.frame` you are used to, for example: ```{r} DF = data.frame(x = 1:3, y = 4:6, z = 7:9) DF DF[ , c("y", "z")] ``` which returns the two columns. In data.table you know you can use the column names directly and might try: ```{r} DT = data.table(DF) DT[ , c(y, z)] ``` but this returns one vector. Remember that the `j` expression is evaluated within the environment of `DT` and `c()` returns a vector. If 2 or more columns are required, use `list()` or `.()` instead: ```{r} DT[ , .(y, z)] ``` `c()` can be useful in a data.table too, but its behaviour is different from that in `[.data.frame`. ## I have built up a complex table with many columns. I want to use it as a template for a new table; _i.e._, create a new table with no rows, but with the column names and types copied from my table. Can I do that easily? Yes. If your complex table is called `DT`, try `NEWDT = DT[0]`. ## Is a null data.table the same as `DT[0]`? No. By "null data.table" we mean the result of `data.table(NULL)` or `as.data.table(NULL)`; _i.e._, ```{r} data.table(NULL) data.frame(NULL) as.data.table(NULL) as.data.frame(NULL) is.null(data.table(NULL)) is.null(data.frame(NULL)) ``` The null data.table|`frame` is `NULL` with some attributes attached, which means it's no longer `NULL`. In R only pure `NULL` is `NULL` as tested by `is.null()`. When referring to the "null data.table" we use lower case null to help distinguish from upper case `NULL`. To test for the null data.table, use `length(DT) == 0` or `ncol(DT) == 0` (`length` is slightly faster as it's a primitive function). An _empty_ data.table (`DT[0]`) has one or more columns, all of which are empty. 
Those empty columns still have names and types. ```{r} DT = data.table(a = 1:3, b = c(4, 5, 6), d = c(7L,8L,9L)) DT[0] sapply(DT[0], class) ``` ## Why has the `DT()` alias been removed? {#DTremove1} `DT` was introduced originally as a wrapper for a list of `j` expressions. Since `DT` was an alias for data.table, this was a convenient way to take care of silent recycling in cases where each item of the `j` list evaluated to different lengths. The alias was one reason grouping was slow, though. As of v1.3, `list()` or `.()` should be passed instead to the `j` argument. These are much faster, especially when there are many groups. Internally, this was a nontrivial change. Vector recycling is now done internally, along with several other speed enhancements for grouping. ## But my code uses `j = DT(...)` and it works. The previous FAQ says that `DT()` has been removed. {#DTremove2} Then you are using a version prior to 1.5.3. Prior to 1.5.3 `[.data.table` detected use of `DT()` in the `j` and automatically replaced it with a call to `list()`. This was to help the transition for existing users. ## What are the scoping rules for `j` expressions? Think of the subset as an environment where all the column names are variables. When a variable `foo` is used in the `j` of a query such as `X[Y, sum(foo)]`, `foo` is looked for in the following order : 1. The scope of `X`'s subset; _i.e._, `X`'s column names. 2. The scope of each row of `Y`; _i.e._, `Y`'s column names (_join inherited scope_) 3. The scope of the calling frame; _e.g._, the line that appears before the data.table query. 4. Exercise for reader: does it then ripple up the calling frames, or go straight to `globalenv()`? 5. The global environment This is _lexical scoping_ as explained in [R FAQ 3.3.1](https://cran.r-project.org/doc/FAQ/R-FAQ.html#Lexical-scoping). The environment in which the function was created is not relevant, though, because there is _no function_. No anonymous _function_ is passed to `j`. 
Instead, an anonymous _body_ is passed to `j`; for example, ```{r} DT = data.table(x = rep(c("a", "b"), c(2, 3)), y = 1:5) DT DT[ , {z = sum(y); z + 3}, by = x] ``` Some programming languages call this a _lambda_. ## Can I trace the `j` expression as it runs through the groups? {#j-trace} Try something like this: ```{r} DT[ , { cat("Objects:", paste(objects(), collapse = ","), "\n") cat("Trace: x=", as.character(x), " y=", y, "\n") sum(y)}, by = x] ``` ## Inside each group, why are the group variables length-1? [Above](#j-trace), `x` is a grouping variable and (as from v1.6.1) has `length` 1 (if inspected or used in `j`). It's for efficiency and convenience. Therefore, there is no difference between the following two statements: ```{r} DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x, sum(y)), by = x] DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x[1], sum(y)), by = x] ``` If you need the size of the current group, use `.N` rather than calling `length()` on any column. ## Only the first 10 rows are printed, how do I print more? There are two things happening here. First, if the number of rows in a data.table are large (`> 100` by default), then a summary of the data.table is printed to the console by default. Second, the summary of a large data.table is printed by taking the top and bottom `n` (`= 5` by default) rows of the data.table and only printing those. Both of these parameters (when to trigger a summary and how much of a table to use as a summary) are configurable by R's `options` mechanism, or by calling the `print` function directly. For instance, to enforce the summary of a data.table to only happen when a data.table is greater than 50 rows, you could `options(datatable.print.nrows = 50)`. To disable the summary-by-default completely, you could `options(datatable.print.nrows = Inf)`. You could also call `print` directly, as in `print(your.data.table, nrows = Inf)`. 
If you want to show more than just the top (and bottom) 10 rows of a data.table summary (say you like 20), set `options(datatable.print.topn = 20)`, for example. Again, you could also just call `print` directly, as in `print(your.data.table, topn = 20)`. ## With an `X[Y]` join, what if `X` contains a column called `"Y"`? When `i` is a single name such as `Y` it is evaluated in the calling frame. In all other cases such as calls to `.()` or other expressions, `i` is evaluated within the scope of `X`. This facilitates easy _self-joins_ such as `X[J(unique(colA)), mult = "first"]`. ## `X[Z[Y]]` is failing because `X` contains a column `"Y"`. I'd like it to use the table `Y` in calling scope. The `Z[Y]` part is not a single name so that is evaluated within the frame of `X` and the problem occurs. Try `tmp = Z[Y]; X[tmp]`. This is robust to `X` containing a column `"tmp"` because `tmp` is a single name. If you often encounter conflicts of this type, one simple solution may be to name all tables in uppercase and all column names in lowercase, or some similar scheme. ## Can you explain further why data.table is inspired by `A[B]` syntax in `base`? Consider `A[B]` syntax using an example matrix `A` : ```{r} A = matrix(1:12, nrow = 4) A ``` To obtain cells `(1, 2) = 5` and `(3, 3) = 11` many users (we believe) may try this first : ```{r} A[c(1, 3), c(2, 3)] ``` However, this returns the union of those rows and columns. To reference the cells, a 2-column matrix is required. `?Extract` says : > When indexing arrays by `[` a single argument `i` can be a matrix with as many columns as there are dimensions of `x`; the result is then a vector with elements corresponding to the sets of indices in each row of `i`. Let's try again. ```{r} B = cbind(c(1, 3), c(2, 3)) B A[B] ``` A matrix is a 2-dimensional structure with row names and column names. Can we do the same with names? 
```{r} rownames(A) = letters[1:4] colnames(A) = LETTERS[1:3] A B = cbind(c("a", "c"), c("B", "C")) A[B] ``` So yes, we can. Can we do the same with a `data.frame`? ```{r} A = data.frame(A = 1:4, B = letters[11:14], C = pi*1:4) rownames(A) = letters[1:4] A B A[B] ``` But, notice that the result was coerced to `character.` R coerced `A` to `matrix` first so that the syntax could work, but the result isn't ideal. Let's try making `B` a `data.frame`. ```{r} B = data.frame(c("a", "c"), c("B", "C")) cat(try(A[B], silent = TRUE)) ``` So we can't subset a `data.frame` by a `data.frame` in base R. What if we want row names and column names that aren't `character` but `integer` or `float`? What if we want more than 2 dimensions of mixed types? Enter data.table. Furthermore, matrices, especially sparse matrices, are often stored in a 3-column tuple: `(i, j, value)`. This can be thought of as a key-value pair where `i` and `j` form a 2-column key. If we have more than one value, perhaps of different types, it might look like `(i, j, val1, val2, val3, ...)`. This looks very much like a `data.frame`. Hence data.table extends `data.frame` so that a `data.frame` `X` can be subset by a `data.frame` `Y`, leading to the `X[Y]` syntax. ## Can base be changed to do this then, rather than a new package? `data.frame` is used _everywhere_ and so it is very difficult to make _any_ changes to it. data.table _inherits_ from `data.frame`. It _is_ a `data.frame`, too. A data.table _can_ be passed to any package that _only_ accepts `data.frame`. When that package uses `[.data.frame` syntax on the data.table, it works. It works because `[.data.table` looks to see where it was called from. If it was called from such a package, `[.data.table` diverts to `[.data.frame`. ## I've heard that data.table syntax is analogous to SQL. 
Yes : - `i` $\Leftrightarrow$ where - `j` $\Leftrightarrow$ select - `:=` $\Leftrightarrow$ update - `by` $\Leftrightarrow$ group by - `i` $\Leftrightarrow$ order by (in compound syntax) - `i` $\Leftrightarrow$ having (in compound syntax) - `nomatch = NA` $\Leftrightarrow$ outer join - `nomatch = 0L` $\Leftrightarrow$ inner join - `mult = "first"|"last"` $\Leftrightarrow$ N/A because SQL is inherently unordered - `roll = TRUE` $\Leftrightarrow$ N/A because SQL is inherently unordered The general form is : ```{r, eval = FALSE} DT[where, select|update, group by][order by][...] ... [...] ``` A key advantage of column vectors in R is that they are _ordered_, unlike SQL[^2]. We can use ordered functions in data.table queries such as `diff()` and we can use _any_ R function from any package, not just the functions that are defined in SQL. A disadvantage is that R objects must fit in memory, but with several R packages such as ff, bigmemory, mmap and indexing, this is changing. [^2]: It may be a surprise to learn that `select top 10 * from ...` does _not_ reliably return the same rows over time in SQL. You do need to include an `order by` clause, or use a clustered index to guarantee row order; _i.e._, SQL is inherently unordered. ## What are the smaller syntax differences between `data.frame` and data.table? {#SmallerDiffs} - `DT[3]` refers to the 3rd _row_, but `DF[3]` refers to the 3rd _column_ - `DT[3, ] == DT[3]`, but `DF[ , 3] == DF[3]` (somewhat confusingly in data.frame, whereas data.table is consistent) - For this reason we say the comma is _optional_ in `DT`, but not optional in `DF` - `DT[[3]] == DF[3] == DF[[3]]` - `DT[i, ]`, where `i` is a single integer, returns a single row, just like `DF[i, ]`, but unlike a matrix single-row subset which returns a vector. - `DT[ , j]` where `j` is a single integer returns a one-column data.table, unlike `DF[, j]` which returns a vector by default - `DT[ , "colA"][[1]] == DF[ , "colA"]`. 
- `DT[ , colA] == DF[ , "colA"]` (currently in data.table v1.9.8 but is about to change, see release notes) - `DT[ , list(colA)] == DF[ , "colA", drop = FALSE]` - `DT[NA]` returns 1 row of `NA`, but `DF[NA]` returns an entire copy of `DF` containing `NA` throughout. The symbol `NA` is type `logical` in R and is therefore recycled by `[.data.frame`. The user's intention was probably `DF[NA_integer_]`. `[.data.table` diverts to this probable intention automatically, for convenience. - `DT[c(TRUE, NA, FALSE)]` treats the `NA` as `FALSE`, but `DF[c(TRUE, NA, FALSE)]` returns `NA` rows for each `NA` - `DT[ColA == ColB]` is simpler than `DF[!is.na(ColA) & !is.na(ColB) & ColA == ColB, ]` - `data.frame(list(1:2, "k", 1:4))` creates 3 columns, data.table creates one `list` column. - `check.names` is by default `TRUE` in `data.frame` but `FALSE` in data.table, for convenience. - `stringsAsFactors` is by default `TRUE` in `data.frame` but `FALSE` in data.table, for efficiency. Since a global string cache was added to R, character items are pointers to the single cached string and there is no longer a performance benefit of converting to `factor`. - Atomic vectors in `list` columns are collapsed when printed using `", "` in `data.frame`, but `","` in data.table with a trailing comma after the 6th item to avoid accidental printing of large embedded objects. In `[.data.frame` we very often set `drop = FALSE`. When we forget, bugs can arise in edge cases where single columns are selected and all of a sudden a vector is returned rather than a single column `data.frame`. In `[.data.table` we took the opportunity to make it consistent and dropped `drop`. When a data.table is passed to a data.table-unaware package, that package is not concerned with any of these differences; it just works. ## I'm using `j` for its side effect only, but I'm still getting data returned. How do I stop that? 
In this case `j` can be wrapped with `invisible()`; e.g., `DT[ , invisible(hist(colB)), by = colA]`[^3] [^3]: _e.g._, `hist()` returns the breakpoints in addition to plotting to the graphics device. ## Why does `[.data.table` now have a `drop` argument from v1.5? So that data.table can inherit from `data.frame` without using `...`. If we used `...` then invalid argument names would not be caught. The `drop` argument is never used by `[.data.table`. It is a placeholder for non-data.table-aware packages when they use the `[.data.frame` syntax directly on a data.table. ## Rolling joins are cool and very fast! Was that hard to program? The prevailing row on or before the `i` row is the final row the binary search tests anyway. So `roll = TRUE` is essentially just a switch in the binary search C code to return that row. ## Why does `DT[i, col := value]` return the whole of `DT`? I expected either no visible value (consistent with `<-`), or a message or return value containing how many rows were updated. It isn't obvious that the data has indeed been updated by reference. This has changed in v1.8.3 to meet your expectations. Please upgrade. The whole of `DT` is returned (now invisibly) so that compound syntax can work; _e.g._, `DT[i, done := TRUE][ , sum(done)]`. The number of rows updated is returned when `verbose` is `TRUE`, either on a per-query basis or globally using `options(datatable.verbose = TRUE)`. ## OK, thanks. What was so difficult about the result of `DT[i, col := value]` being returned invisibly? R internally forces visibility on for `[`. The value of FunTab's eval column (see [src/main/names.c](https://github.com/wch/r-source/blob/trunk/src/main/names.c)) for `[` is `0` meaning "force `R_Visible` on" (see [R-Internals section 1.6](https://cran.r-project.org/doc/manuals/r-release/R-ints.html#Autoprinting) ). 
Therefore, when we tried `invisible()` or setting `R_Visible` to `0` directly ourselves, `eval` in [src/main/eval.c](https://github.com/wch/r-source/blob/trunk/src/main/eval.c) would force it on again. To solve this problem, the key was to stop trying to stop the print method running after a `:=`. Instead, inside `:=` we now (from v1.8.3) set a global flag which the print method uses to know whether to actually print or not. ## Why do I have to type `DT` sometimes twice after using `:=` to print the result to console? This is an unfortunate downside to get [#869](https://github.com/Rdatatable/data.table/issues/869) to work. If a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` is typed at the prompt, nothing will be printed. A repeated `DT` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `print(DT)` and `DT[]` at the prompt are guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g., `DT[,foo:=3L][]`. ## I've noticed that `base::cbind.data.frame` (and `base::rbind.data.frame`) appear to be changed by data.table. How is this possible? Why? It is a temporary, last resort solution until we discover a better way to solve the problems listed below. Essentially, the issue is that data.table inherits from `data.frame`, _and_ `base::cbind` and `base::rbind` (uniquely) do their own S3 dispatch internally as documented by `?cbind`. The change is adding one `for` loop to the start of each function directly in `base`; _e.g._, ```{r} base::cbind.data.frame ``` That modification is made dynamically, _i.e._, the `base` definition of `cbind.data.frame` is fetched, the `for` loop added to the beginning and then assigned back to `base`. 
This solution is intended to be robust to different definitions of `base::cbind.data.frame` in different versions of R, including unknown future changes. Again, it is a last resort until a better solution is known or made available. The competing requirements are: - `cbind(DT, DF)` needs to work. Defining `cbind.data.table` doesn't work because `base::cbind` does its own S3 dispatch and requires that the _first_ `cbind` method for each object it is passed is _identical_. This is not true in `cbind(DT, DF)` because the first method for `DT` is `cbind.data.table` but the first method for `DF` is `cbind.data.frame`. `base::cbind` then falls through to its internal `bind` code which appears to treat `DT` as a regular `list` and returns very odd looking and unusable `matrix` output. See [below](#cbinderror). We cannot just advise users not to call `cbind(DT, DF)` because packages such as `ggplot2` make such a call ([test 167.2](https://github.com/Rdatatable/data.table/blob/master/inst/tests/tests.Rraw#L444-L447)). - This naturally leads to trying to mask `cbind.data.frame` instead. Since a data.table is a `data.frame`, `cbind` would find the same method for both `DT` and `DF`. However, this doesn't work either because `base::cbind` appears to find methods in `base` first; _i.e._, `base::cbind.data.frame` isn't maskable. This is reproducible as follows : ```{r} foo = data.frame(a = 1:3) cbind.data.frame = function(...) cat("Not printed\n") cbind(foo) rm("cbind.data.frame") ``` - Finally, we tried masking `cbind` itself (v1.6.5 and v1.6.6). This allowed `cbind(DT, DF)` to work, but introduced compatibility issues with package `IRanges`, since `IRanges` also masks `cbind`. It worked if `IRanges` was lower on the `search()` path than data.table, but if `IRanges` was higher then data.table's, `cbind` would never be called and the strange-looking `matrix` output occurs again (see [below](#cbinderror)). 
If you know of a better solution that still solves all the issues above, then please let us know and we'll gladly change it. ## I've read about method dispatch (_e.g._ `merge` may or may not dispatch to `merge.data.table`) but _how_ does R know how to dispatch? Are dots significant or special? How on earth does R know which function to dispatch and when? {#r-dispatch} This comes up quite a lot but it's really earth-shatteringly simple. A function such as `merge` is _generic_ if it consists of a call to `UseMethod`. When you see people talking about whether or not functions are _generic_ functions they are merely typing the function without `()` afterwards, looking at the program code inside it and if they see a call to `UseMethod` then it is _generic_. What does `UseMethod` do? It literally slaps the function name together with the class of the first argument, separated by period (`.`) and then calls that function, passing along the same arguments. It's that simple. For example, `merge(X, Y)` contains a `UseMethod` call which means it then _dispatches_ (i.e. calls) `paste("merge", class(X), sep = ".")`. Functions with dots in their name may or may not be methods. The dot is irrelevant really, other than dot being the separator that `UseMethod` uses. Knowing this background should now highlight why, for example, it is obvious to R folk that `as.data.table.data.frame` is the `data.frame` method for the `as.data.table` generic function. Further, it may help to elucidate that, yes, you are correct, it is not obvious from its name alone that `ls.fit` is not the fit method of the `ls` generic function. You only know that by typing `ls` (not `ls()`) and observing it isn't a single call to `UseMethod`. You might now ask: where is this documented in R? 
Answer: it's quite clear, but, you need to first know to look in `?UseMethod` and _that_ help file contains : > When a function calling `UseMethod('fun')` is applied to an object with class attribute `c('first', 'second')`, the system searches for a function called `fun.first` and, if it finds it, applies it to the object. If no such function is found a function called `fun.second` is tried. If no class name produces a suitable function, the function `fun.default` is used, if it exists, or an error results. Happily, an internet search for "How does R method dispatch work" (at the time of this writing) returns the `?UseMethod` help page in the top few links. Admittedly, other links rapidly descend into the intricacies of S3 vs S4, internal generics and so on. However, features like basic S3 dispatch (pasting the function name together with the class name) is why some R folk love R. It's so simple. No complicated registration or signature is required. There isn't much needed to learn. To create the `merge` method for data.table all that was required, literally, was to merely create a function called `merge.data.table`. # Questions relating to compute time ## I have 20 columns and a large number of rows. Why is an expression of one column so quick? Several reasons: - Only that column is grouped, the other 19 are ignored because data.table inspects the `j` expression and realises it doesn't use the other columns. - One memory allocation is made for the largest group only, then that memory is re-used for the other groups. There is very little garbage to collect. - R is an in-memory column store; i.e., the columns are contiguous in RAM. Page fetches from RAM into L2 cache are minimised. ## I don't have a `key` on a large table, but grouping is still really quick. Why is that? data.table uses radix sorting. This is significantly faster than other sort algorithms. 
See [our presentations](http://user2015.math.aau.dk/presentations/234.pdf) on [our homepage](https://github.com/Rdatatable/data.table/wiki) for more information. This is also one reason why `setkey()` is quick. When no `key` is set, or we group in a different order from that of the key, we call it an _ad hoc_ `by`. ## Why is grouping by columns in the key faster than an _ad hoc_ `by`? Because each group is contiguous in RAM, thereby minimising page fetches and memory can be copied in bulk (`memcpy` in C) rather than looping in C. ## What are primary and secondary indexes in data.table? Manual: [`?setkey`](https://www.rdocumentation.org/packages/data.table/functions/setkey) S.O. : [What is the purpose of setting a key in data.table?](https://stackoverflow.com/questions/20039335/what-is-the-purpose-of-setting-a-key-in-data-table/20057411#20057411) `setkey(DT, col1, col2)` orders the rows by column `col1` then within each group of `col1` it orders by `col2`. This is a _primary index_. The row order is changed _by reference_ in RAM. Subsequent joins and groups on those key columns then take advantage of the sort order for efficiency. (Imagine how difficult looking for a phone number in a printed telephone directory would be if it wasn't sorted by surname then forename. That's literally all `setkey` does. It sorts the rows by the columns you specify.) The index doesn't use any RAM. It simply changes the row order in RAM and marks the key columns. Analogous to a _clustered index_ in SQL. However, you can only have one primary key because data can only be physically sorted in RAM in one way at a time. Choose the primary index to be the one you use most often (e.g. `[id,date]`). Sometimes there isn't an obvious choice for the primary key or you need to join and group many different columns in different orders. Enter a secondary index. 
This does use memory (`4*nrow` bytes regardless of the number of columns in the index) to store the order of the rows by the columns you specify, but doesn't actually reorder the rows in RAM. Subsequent joins and groups take advantage of the secondary key's order but need to _hop_ via that index so aren't as efficient as primary indexes. But still, a lot faster than a full vector scan. There is no limit to the number of secondary indexes since each one is just a different ordering vector. Typically you don't need to create secondary indexes. They are created automatically and used for you automatically by using data.table normally; _e.g._ `DT[someCol == someVal, ]` and `DT[someCol %in% someVals, ]` will create, attach and then use the secondary index. This is faster in data.table than a vector scan so automatic indexing is on by default since there is no up-front penalty. There is an option to turn off automatic indexing; _e.g._, if somehow many indexes are being created and even the relatively small amount of extra memory becomes too large. We use the words _index_ and _key_ interchangeably. # Error messages ## "Could not find function `DT`" See above [here](#DTremove1) and [here](#DTremove2). ## "unused argument(s) (`MySum = sum(v)`)" This error is generated by `DT[ , MySum = sum(v)]`. `DT[ , .(MySum = sum(v))]` was intended, or `DT[ , j = .(MySum = sum(v))]`. ## "`translateCharUTF8` must be called on a `CHARSXP`" This error (and similar, _e.g._, "`getCharCE` must be called on a `CHARSXP`") may have nothing to do with character data or locale. Instead, this can be a symptom of an earlier memory corruption. To date these have been reproducible and fixed (quickly). Please report it to our [issues tracker](https://github.com/Rdatatable/data.table/issues). ## `cbind(DT, DF)` returns a strange format, _e.g._ `Integer,5` {#cbinderror} This occurs prior to v1.6.5, for `rbind(DT, DF)` too. Please upgrade to v1.6.7 or later. 
## "cannot change value of locked binding for `.SD`" `.SD` is locked by design. See `?data.table`. If you'd like to manipulate `.SD` before using it, or returning it, and don't wish to modify `DT` using `:=`, then take a copy first (see `?copy`), _e.g._, ```{r} DT = data.table(a = rep(1:3, 1:3), b = 1:6, c = 7:12) DT DT[ , { mySD = copy(.SD) mySD[1, b := 99L] mySD}, by = a] ``` ## "cannot change value of locked binding for `.N`" Please upgrade to v1.8.1 or later. From this version, if `.N` is returned by `j` it is renamed to `N` to avoid any ambiguity in any subsequent grouping between the `.N` special variable and a column called `".N"`. The old behaviour can be reproduced by forcing `.N` to be called `.N`, like this : ```{r} DT = data.table(a = c(1,1,2,2,2), b = c(1,2,2,2,1)) DT DT[ , list(.N = .N), list(a, b)] # show intermediate result for exposition cat(try( DT[ , list(.N = .N), by = list(a, b)][ , unique(.N), by = a] # compound query more typical , silent = TRUE)) ``` If you are already running v1.8.1 or later then the error message is now more helpful than the "cannot change value of locked binding" error, as you can see above, since this vignette was produced using v1.8.1 or later. The more natural syntax now works : ```{r} if (packageVersion("data.table") >= "1.8.1") { DT[ , .N, by = list(a, b)][ , unique(N), by = a] } if (packageVersion("data.table") >= "1.9.3") { DT[ , .N, by = .(a, b)][ , unique(N), by = a] # same } ``` # Warning messages ## "The following object(s) are masked from `package:base`: `cbind`, `rbind`" This warning was present in v1.6.5 and v.1.6.6 only, when loading the package. The motivation was to allow `cbind(DT, DF)` to work, but as it transpired, this broke (full) compatibility with package `IRanges`. Please upgrade to v1.6.7 or later. ## "Coerced numeric RHS to integer to match the column's type" Hopefully, this is self explanatory. 
The full message is: Coerced numeric RHS to integer to match the column's type; may have truncated precision. Either change the column to numeric first by creating a new numeric vector length 5 (nrows of entire table) yourself and assigning that (i.e. 'replace' column), or coerce RHS to integer yourself (e.g. 1L or as.integer) to make your intent clear (and for speed). Or, set the column type correctly up front when you create the table and stick to it, please. To generate it, try : ```{r} DT = data.table(a = 1:5, b = 1:5) suppressWarnings( DT[2, b := 6] # works (slower) with warning ) class(6) # numeric not integer DT[2, b := 7L] # works (faster) without warning class(7L) # L makes it an integer DT[ , b := rnorm(5)] # 'replace' integer column with a numeric column ``` ## Reading data.table from RDS or RData file `*.RDS` and `*.RData` are file types which can store in-memory R objects on disk efficiently. However, storing data.table into the binary file loses its column over-allocation. This isn't a big deal -- your data.table will be copied in memory on the next _by reference_ operation and throw a warning. Therefore it is recommended to call `alloc.col()` on each data.table loaded with `readRDS()` or `load()` calls. # General questions about the package ## v1.3 appears to be missing from the CRAN archive? That is correct. v1.3 was available on R-Forge only. There were several large changes internally and these took some time to test in development. ## Is data.table compatible with S-plus? Not currently. - A few core parts of the package are written in C and use internal R functions and R structures. - The package uses lexical scoping which is one of the differences between R and **S-plus** explained by [R FAQ 3.3.1](https://cran.r-project.org/doc/FAQ/R-FAQ.html#Lexical-scoping) ## Is it available for Linux, Mac and Windows? Yes, for both 32-bit and 64-bit on all platforms. Thanks to CRAN. There are no special or OS-specific libraries used. ## I think it's great. 
What can I do? Please file suggestions, bug reports and enhancement requests on our [issues tracker](https://github.com/Rdatatable/data.table/issues). This helps make the package better. Please do star the package on [GitHub](https://github.com/Rdatatable/data.table/wiki). This helps encourage the developers and helps other R users find the package. You can submit pull requests to change the code and/or documentation yourself; see our [Contribution Guidelines](https://github.com/Rdatatable/data.table/blob/master/CONTRIBUTING.md). ## I think it's not great. How do I warn others about my experience? Please put your vote and comments on [Crantastic](http://crantastic.org/packages/data-table). Please make it constructive so we have a chance to improve. ## I have a question. I know the r-help posting guide tells me to contact the maintainer (not r-help), but is there a larger group of people I can ask? Yes, there are two options. You can post to [datatable-help](mailto:datatable-help@lists.r-forge.r-project.org). It's like r-help, but just for this package. Or the [`[data.table]` tag](https://stackoverflow.com/tags/data.table/info) on [Stack Overflow](https://stackoverflow.com/). Feel free to answer questions in those places, too. ## Where are the datatable-help archives? The [homepage](https://github.com/Rdatatable/data.table/wiki) contains links to the archives in several formats. ## I'd prefer not to post on the Issues page, can I mail just one or two people privately? Sure. You're more likely to get a faster answer from the Issues page or Stack Overflow, though. Further, asking publicly in those places helps build the general knowledge base. ## I have created a package that uses data.table. How do I ensure my package is data.table-aware so that inheritance from `data.frame` works? Please see [this answer](http://stackoverflow.com/a/10529888/403310). 
data.table/inst/doc/datatable-secondary-indices-and-auto-indexing.html0000644000175100001440000010377413172212367025552 0ustar hornikusers Data {#data}

This vignette assumes that the reader is familiar with data.table's [i, j, by] syntax, and how to perform fast key based subsets. If you're not familiar with these concepts, please read the “Introduction to data.table”, “Reference semantics” and “Keys and fast binary search based subset” vignettes first.


Data {#data}

We will use the same flights data as in the “Introduction to data.table” vignette.

flights <- fread("flights14.csv")
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18
dim(flights)
# [1] 253316     11

Introduction

In this vignette, we will

  • discuss secondary indices and provide rationale as to why we need them by citing cases where setting keys is not necessarily ideal,

  • perform fast subsetting, once again, but using the new on argument, which computes secondary indices internally for the task (temporarily), and reuses one if it already exists,

  • and finally look at auto indexing which goes a step further and creates secondary indices automatically, but does so on native R syntax for subsetting.

1. Secondary indices

a) What are secondary indices?

Secondary indices are similar to keys in data.table, except for two major differences:

  • It doesn't physically reorder the entire data.table in RAM. Instead, it only computes the order for the set of columns provided and stores that order vector in an additional attribute called index.

  • There can be more than one secondary index for a data.table (as we will see below).

b) Set and get secondary indices

– How can we set the column origin as a secondary index in the data.table flights?

setindex(flights, origin)
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18

## alternatively we can provide character vectors to the function 'setindexv()'
# setindexv(flights, "origin") # useful to program with

# 'index' attribute added
names(attributes(flights))
# [1] "names"             "row.names"         "class"             ".internal.selfref"
# [5] "index"
  • setindex() and setindexv() allow adding a secondary index to the data.table.

  • Note that flights is not physically reordered in increasing order of origin, as would have been the case with setkey().

  • Also note that the attribute index has been added to flights.

  • setindex(flights, NULL) would remove all secondary indices.

– How can we get all the secondary indices set so far in flights?

indices(flights)
# [1] "origin"

setindex(flights, origin, dest)
indices(flights)
# [1] "origin"       "origin__dest"
  • The function indices() returns all current secondary indices in the data.table. If none exists, NULL is returned.

  • Note that by creating another index on the columns origin, dest, we do not lose the first index created on the column origin, i.e., we can have multiple secondary indices.

c) Why do we need secondary indices?

– Reordering a data.table can be expensive and not always ideal

Consider the case where you would like to perform a fast key based subset on origin column for the value “JFK”. We'd do this as:

## not run
setkey(flights, origin)
flights["JFK"] # or flights[.("JFK")]

setkey() requires: {.bs-callout .bs-callout-info}

a) computing the order vector for the column(s) provided, here, origin, and

b) reordering the entire data.table, by reference, based on the order vector computed.

Computing the order isn't the time consuming part, since data.table uses true radix sorting on integer, character and numeric vectors. However reordering the data.table could be time consuming (depending on the number of rows and columns).

Unless our task involves repeated subsetting on the same column, fast key based subsetting could effectively be nullified by the time to reorder, depending on our data.table dimensions.

– There can be only one key at the most

Now if we would like to repeat the same operation but on dest column instead, for the value “LAX”, then we have to setkey(), again.

## not run
setkey(flights, dest)
flights["LAX"]

And this reorders flights by dest, again. What we would really like is to be able to perform the fast subsetting by eliminating the reordering step.

And this is precisely what secondary indices allow for!

– Secondary indices can be reused

Since there can be multiple secondary indices, and creating an index is as simple as storing the order vector as an attribute, this allows us to even eliminate the time to recompute the order vector if an index already exists.

– The new on argument allows for cleaner syntax and automatic creation and reuse of secondary indices

As we will see in the next section, the on argument provides several advantages:

on argument {.bs-callout .bs-callout-info}

  • enables subsetting by computing secondary indices on the fly. This eliminates having to do setindex() every time.

  • allows easy reuse of existing indices by just checking the attributes.

  • allows for a cleaner syntax by having the columns on which the subset is performed as part of the syntax. This makes the code easier to follow when looking at it at a later point.

    Note that the on argument can also be used on keyed subsets. In fact, we encourage providing the on argument even when subsetting using keys, for better readability.

2. Fast subsetting using on argument and secondary indices

a) Fast subsets in i

– Subset all rows where the origin airport matches “JFK” using on

flights["JFK", on = "origin"]
#        year month day dep_delay arr_delay carrier origin dest air_time distance hour
#     1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
#     2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
#     3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
#     4: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
#     5: 2014     1   1        -2       -18      AA    JFK  LAX      338     2475   21
#    ---                                                                              
# 81479: 2014    10  31        -4       -21      UA    JFK  SFO      337     2586   17
# 81480: 2014    10  31        -2       -37      UA    JFK  SFO      344     2586   18
# 81481: 2014    10  31         0       -33      UA    JFK  LAX      320     2475   17
# 81482: 2014    10  31        -6       -38      UA    JFK  SFO      343     2586    9
# 81483: 2014    10  31        -6       -38      UA    JFK  LAX      323     2475   11

## alternatively
# flights[.("JFK"), on = "origin"] (or) 
# flights[list("JFK"), on = "origin"]
  • This statement performs a fast binary search based subset as well, by computing the index on the fly. However, note that it doesn't save the index as an attribute automatically. This may change in the future.

  • If we had already created a secondary index, using setindex(), then on would reuse it instead of (re)computing it. We can see that by using verbose = TRUE:

    setindex(flights, origin)
    flights["JFK", on = "origin", verbose = TRUE][1:5]
    # on= matches existing index, using index
    # Starting bmerge ...done in 0 secs
    #    year month day dep_delay arr_delay carrier origin dest air_time distance hour
    # 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
    # 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
    # 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
    # 4: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
    # 5: 2014     1   1        -2       -18      AA    JFK  LAX      338     2475   21
    

– How can I subset based on origin and dest columns?

For example, if we want to subset "JFK", "LAX" combination, then:

flights[.("JFK", "LAX"), on = c("origin", "dest")][1:5]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
# 4: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
# 5: 2014     1   1        -2       -18      AA    JFK  LAX      338     2475   21
  • on argument accepts a character vector of column names corresponding to the order provided to i-argument.

  • Since the time to compute the secondary index is quite small, we don't have to use setindex(), unless, once again, the task involves repeated subsetting on the same column.

b) Select in j

All the operations we will discuss below are no different to the ones we already saw in the Keys and fast binary search based subset vignette. Except we'll be using the on argument instead of setting keys.

– Return arr_delay column alone as a data.table corresponding to origin = "LGA" and dest = "TPA"

flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")]
#       arr_delay
#    1:         1
#    2:        14
#    3:       -17
#    4:        -4
#    5:       -12
#   ---          
# 1848:        39
# 1849:       -24
# 1850:       -12
# 1851:        21
# 1852:       -11

c) Chaining

– On the result obtained above, use chaining to order the column in decreasing order.

flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")][order(-arr_delay)]
#       arr_delay
#    1:       486
#    2:       380
#    3:       351
#    4:       318
#    5:       300
#   ---          
# 1848:       -40
# 1849:       -43
# 1850:       -46
# 1851:       -48
# 1852:       -49

d) Compute or do in j

– Find the maximum arrival delay corresponding to origin = "LGA" and dest = "TPA".

flights[.("LGA", "TPA"), max(arr_delay), on = c("origin", "dest")]
# [1] 486

e) sub-assign by reference using := in j

We have seen this example already in the Reference semantics and Keys and fast binary search based subset vignette. Let's take a look at all the hours available in the flights data.table:

# get all 'hours' in flights
flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

We see that there are a total of 25 unique values in the data. Both 0 and 24 hours seem to be present. Let's go ahead and replace 24 with 0, but this time using on instead of setting keys.

flights[.(24L), hour := 0L, on = "hour"]

Now, let's check if 24 is replaced with 0 in the hour column.

flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  • This is a particularly huge advantage of secondary indices. Previously, just to update a few rows of hour, we had to setkey() on it, which inevitably reorders the entire data.table. With on, the order is preserved, and the operation is much faster! Looking at the code, the task we wanted to perform is also quite clear.

f) Aggregation using by

– Get the maximum departure delay for each month corresponding to origin = "JFK". Order the result by month

ans <- flights["JFK", max(dep_delay), keyby = month, on = "origin"]
head(ans)
#    month   V1
# 1:     1  881
# 2:     1 1014
# 3:     1  920
# 4:     1 1241
# 5:     1  853
# 6:     1  798
  • We would have had to set the key back to origin, dest again, if we did not use on which internally builds secondary indices on the fly.

g) The mult argument

The other arguments including mult work exactly the same way as we saw in the Keys and fast binary search based subset vignette. The default value for mult is “all”. Instead, we can choose to return only the “first” or “last” matching rows.

– Subset only the first matching row where dest matches “BOS” and “DAY”

flights[c("BOS", "DAY"), on = "dest", mult = "first"]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1         3         1      AA    JFK  BOS       39      187   12
# 2: 2014     1   1        25        35      EV    EWR  DAY      102      533   17

– Subset only the last matching row where origin matches “LGA”, “JFK”, “EWR” and dest matches “XNA”

flights[.(c("LGA", "JFK", "EWR"), "XNA"), on = c("origin", "dest"), mult = "last"]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014    10  31        -5       -11      MQ    LGA  XNA      165     1147    6
# 2:   NA    NA  NA        NA        NA      NA    JFK  XNA       NA       NA   NA
# 3: 2014    10  31        -2       -25      EV    EWR  XNA      160     1131    6

h) The nomatch argument

We can choose if queries that do not match should return NA or be skipped altogether using the nomatch argument.

– From the previous example, subset all rows only if there's a match

flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", on = c("origin", "dest"), nomatch = 0L]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014    10  31        -5       -11      MQ    LGA  XNA      165     1147    6
# 2: 2014    10  31        -2       -25      EV    EWR  XNA      160     1131    6
  • There are no flights connecting “JFK” and “XNA”. Therefore, that row is skipped in the result.

3. Auto indexing

First we looked at how to fast subset using binary search using keys. Then we figured out that we could improve performance even further and have cleaner syntax by using secondary indices. What could be better than that? The answer is to optimise native R syntax to use secondary indices internally so that we can have the same performance without having to use newer syntax.

That is what auto indexing does. At the moment, it is only implemented for binary operators == and %in%. And it only works with a single column at the moment as well. An index is automatically created and saved as an attribute. That is, unlike the on argument which computes the index on the fly each time, a secondary index is created here.

Let's start by creating a data.table big enough to highlight the advantage.

set.seed(1L)
dt = data.table(x = sample(1e5L, 1e7L, TRUE), y = runif(100L))
print(object.size(dt), units = "Mb")
# 114.4 Mb

When we use == or %in% on a single column for the first time, a secondary index is created automatically, and it is used to perform the subset.

## have a look at all the attribute names
names(attributes(dt))
# [1] "names"             "row.names"         "class"             ".internal.selfref"

## run the first time
(t1 <- system.time(ans <- dt[x == 989L]))
#    user  system elapsed 
#   0.164   0.004   0.169
head(ans)
#      x         y
# 1: 989 0.5372007
# 2: 989 0.5642786
# 3: 989 0.7151100
# 4: 989 0.3920405
# 5: 989 0.9547465
# 6: 989 0.2914710

## secondary index is created
names(attributes(dt))
# [1] "names"             "row.names"         "class"             ".internal.selfref"
# [5] "index"

indices(dt)
# [1] "x"

The time to subset the first time is the time to create the index + the time to subset. Since creating a secondary index involves only creating the order vector, this combined operation is faster than vector scans in many cases. But the real advantage comes in successive subsets. They are extremely fast.

## successive subsets
(t2 <- system.time(dt[x == 989L]))
#    user  system elapsed 
#       0       0       0
system.time(dt[x %in% 1989:2012])
#    user  system elapsed 
#   0.004   0.000   0.001
  • Running the first time took 0.169 seconds whereas the second time took 0.000 seconds.

  • Auto indexing can be disabled by setting the global argument options(datatable.auto.index = FALSE).

  • Disabling auto indexing still allows the use of indices created explicitly with setindex or setindexv. You can disable indices fully by setting the global argument options(datatable.use.index = FALSE).

In the future, we plan to extend auto indexing to expressions involving more than one column. Also we are working on extending binary search to work with more binary operators like <, <=, > and >=. Once done, it would be straightforward to extend it to these operators as well.

We will extend fast subsets using keys and secondary indices to joins in the next vignette, “Joins and rolling joins”.


data.table/inst/doc/datatable-reshape.R0000644000175100001440000000512713172212366017512 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ----echo = FALSE--------------------------------------------------------------------------------- options(width = 100L) ## ------------------------------------------------------------------------------------------------- DT = fread("melt_default.csv") DT ## dob stands for date of birth. str(DT) ## ------------------------------------------------------------------------------------------------- DT.m1 = melt(DT, id.vars = c("family_id", "age_mother"), measure.vars = c("dob_child1", "dob_child2", "dob_child3")) DT.m1 str(DT.m1) ## ------------------------------------------------------------------------------------------------- DT.m1 = melt(DT, measure.vars = c("dob_child1", "dob_child2", "dob_child3"), variable.name = "child", value.name = "dob") DT.m1 ## ------------------------------------------------------------------------------------------------- dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob") ## ------------------------------------------------------------------------------------------------- dcast(DT.m1, family_id ~ ., fun.agg = function(x) sum(!is.na(x)), value.var = "dob") ## ------------------------------------------------------------------------------------------------- DT = fread("melt_enhanced.csv") DT ## 1 = female, 2 = male ## ------------------------------------------------------------------------------------------------- DT.m1 = melt(DT, id = c("family_id", "age_mother")) DT.m1[, c("variable", "child") := tstrsplit(variable, "_", fixed = TRUE)] DT.c1 = dcast(DT.m1, family_id + age_mother + child ~ variable, value.var = "value") DT.c1 str(DT.c1) ## gender column is character type now! 
## ------------------------------------------------------------------------------------------------- colA = paste("dob_child", 1:3, sep = "") colB = paste("gender_child", 1:3, sep = "") DT.m2 = melt(DT, measure = list(colA, colB), value.name = c("dob", "gender")) DT.m2 str(DT.m2) ## col type is preserved ## ------------------------------------------------------------------------------------------------- DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender")) DT.m2 ## ------------------------------------------------------------------------------------------------- ## new 'cast' functionality - multiple value.vars DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "gender")) DT.c2 data.table/inst/doc/datatable-reference-semantics.Rmd0000644000175100001440000003544113172210047022322 0ustar hornikusers--- title: "Reference semantics" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Reference semantics} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` This vignette discusses *data.table*'s reference semantics which allows to *add/update/delete* columns of a *data.table by reference*, and also combine them with `i` and `by`. It is aimed at those who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, and perform aggregations by group. If you're not familiar with these concepts, please read the *"Introduction to data.table"* vignette first. *** ## Data {#data} We will use the same `flights` data as in the *"Introduction to data.table"* vignette. ```{r echo = FALSE} options(width = 100L) ``` ```{r} flights <- fread("flights14.csv") flights dim(flights) ``` ## Introduction In this vignette, we will 1. 
first discuss reference semantics briefly and look at the two different forms in which the `:=` operator can be used 2. then see how we can *add/update/delete* columns *by reference* in `j` using the `:=` operator and how to combine with `i` and `by`. 3. and finally we will look at using `:=` for its *side-effect* and how we can avoid the side effects using `copy()`. ## 1. Reference semantics All the operations we have seen so far in the previous vignette resulted in a new data set. We will see how to *add* new column(s), *update* or *delete* existing column(s) on the original data. ### a) Background Before we look at *reference semantics*, consider the *data.frame* shown below: ```{r} DF = data.frame(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18) DF ``` When we did: ```{r eval = FALSE} DF$c <- 18:13 # (1) -- replace entire column # or DF$c[DF$ID == "b"] <- 15:13 # (2) -- subassign in column 'c' ``` both (1) and (2) resulted in [*deep* copy of the entire *data.frame*](http://r.789695.n4.nabble.com/speeding-up-perception-td3640920.html#a3646694) in versions of `R` versions `< 3.1`. [It copied more than once](http://stackoverflow.com/q/23898969/559784). To improve performance by avoiding these redundant copies, *data.table* utilised the [available but unused `:=` operator in R](http://stackoverflow.com/q/7033106/559784). Great performance improvements were made in `R v3.1` as a result of which only a *shallow* copy is made for (1) and not *deep* copy. However, for (2) still, the entire column is *deep* copied even in `R v3.1+`. This means the more columns one subassigns to in the *same query*, the more *deep* copies R does. #### *shallow* vs *deep* copy {.bs-callout .bs-callout-info} A *shallow* copy is just a copy of the vector of column pointers (corresponding to the columns in a *data.frame* or *data.table*). The actual data is not physically copied in memory. A *deep* copy on the other hand copies the entire data to another location in memory. 
# With *data.table's* `:=` operator, absolutely no copies are made in *both* (1) and (2), irrespective of R version you are using. This is because `:=` operator updates *data.table* columns *in-place* (by reference). ### b) The `:=` operator It can be used in `j` in two ways: (a) The `LHS := RHS` form ```{r eval = FALSE} DT[, c("colA", "colB", ...) := list(valA, valB, ...)] # when you have only one column to assign to you # can drop the quotes and list(), for convenience DT[, colA := valA] ``` (b) The functional form ```{r eval = FALSE} DT[, `:=`(colA = valA, # valA is assigned to colA colB = valB, # valB is assigned to colB ... )] ``` #### {.bs-callout .bs-callout-warning} Note that the code above explains how `:=` can be used. They are not working examples. We will start using them on `flights` *data.table* from the next section. # #### {.bs-callout .bs-callout-info} * In (a), `LHS` takes a character vector of column names and `RHS` a *list of values*. `RHS` just needs to be a `list`, irrespective of how its generated (e.g., using `lapply()`, `list()`, `mget()`, `mapply()` etc.). This form is usually easy to program with and is particularly useful when you don't know the columns to assign values to in advance. * On the other hand, (b) is handy if you would like to jot some comments down for later. * The result is returned *invisibly*. * Since `:=` is available in `j`, we can combine it with `i` and `by` operations just like the aggregation operations we saw in the previous vignette. # In the two forms of `:=` shown above, note that we don't assign the result back to a variable. Because we don't need to. The input *data.table* is modified by reference. Let's go through examples to understand what we mean by this. For the rest of the vignette, we will work with `flights` *data.table*. ## 2. 
Add/update/delete columns *by reference* ### a) Add columns by reference {#ref-j} #### -- How can we add columns *speed* and *total delay* of each flight to `flights` *data.table*? ```{r} flights[, `:=`(speed = distance / (air_time/60), # speed in mph (mi/h) delay = arr_delay + dep_delay)] # delay in minutes head(flights) ## alternatively, using the 'LHS := RHS' form # flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)] ``` #### Note that {.bs-callout .bs-callout-info} * We did not have to assign the result back to `flights`. * The `flights` *data.table* now contains the two newly added columns. This is what we mean by *added by reference*. * We used the functional form so that we could add comments on the side to explain what the computation does. You can also see the `LHS := RHS` form (commented). ### b) Update some rows of columns by reference - *sub-assign* by reference {#ref-i-j} Let's take a look at all the `hours` available in the `flights` *data.table*: ```{r} # get all 'hours' in flights flights[, sort(unique(hour))] ``` We see that there are totally `25` unique values in the data. Both *0* and *24* hours seem to be present. Let's go ahead and replace *24* with *0*. #### -- Replace those rows where `hour == 24` with the value `0` ```{r} # subassign by reference flights[hour == 24L, hour := 0L] ``` #### {.bs-callout .bs-callout-info} * We can use `i` along with `:=` in `j` the very same way as we have already seen in the *"Introduction to data.table"* vignette. * Column `hour` is replaced with `0` only on those *row indices* where the condition `hour == 24L` specified in `i` evaluates to `TRUE`. * `:=` returns the result invisibly. Sometimes it might be necessary to see the result after the assignment. We can accomplish that by adding an empty `[]` at the end of the query as shown below: ```{r} flights[hour == 24L, hour := 0L][] ``` # Let's look at all the `hours` to verify. 
```{r} # check again for '24' flights[, sort(unique(hour))] ``` #### Exercise: {.bs-callout .bs-callout-warning #update-by-reference-question} What is the difference between `flights[hour == 24L, hour := 0L]` and `flights[hour == 24L][, hour := 0L]`? Hint: The latter needs an assignment (`<-`) if you would want to use the result later. If you can't figure it out, have a look at the `Note` section of `?":="`. ### c) Delete column by reference #### -- Remove `delay` column ```{r} flights[, c("delay") := NULL] head(flights) ## or using the functional form # flights[, `:=`(delay = NULL)] ``` #### {.bs-callout .bs-callout-info #delete-convenience} * Assigning `NULL` to a column *deletes* that column. And it happens *instantly*. * We can also pass column numbers instead of names in the `LHS`, although it is good programming practice to use column names. * When there is just one column to delete, we can drop the `c()` and double quotes and just use the column name *unquoted*, for convenience. That is: ```{r eval = FALSE} flights[, delay := NULL] ``` is equivalent to the code above. ### d) `:=` along with grouping using `by` {#ref-j-by} We have already seen the use of `i` along with `:=` in [Section 2b](#ref-i-j). Let's now see how we can use `:=` along with `by`. #### -- How can we add a new column which contains for each `orig,dest` pair the maximum speed? ```{r} flights[, max_speed := max(speed), by = .(origin, dest)] head(flights) ``` #### {.bs-callout .bs-callout-info} * We add a new column `max_speed` using the `:=` operator by reference. * We provide the columns to group by the same way as shown in the *Introduction to data.table* vignette. For each group, `max(speed)` is computed, which returns a single value. That value is recycled to fit the length of the group. Once again, no copies are being made at all. `flights` *data.table* is modified *in-place*. 
* We could have also provided `by` with a *character vector* as we saw in the *Introduction to data.table* vignette, e.g., `by = c("origin", "dest")`. # ### e) Multiple columns and `:=` #### -- How can we add two more columns computing `max()` of `dep_delay` and `arr_delay` for each month, using `.SD`? ```{r} in_cols = c("dep_delay", "arr_delay") out_cols = c("max_dep_delay", "max_arr_delay") flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols] head(flights) ``` #### {.bs-callout .bs-callout-info} * We use the `LHS := RHS` form. We store the input column names and the new columns to add in separate variables and provide them to `.SDcols` and for `LHS` (for better readability). * Note that since we allow assignment by reference without quoting column names when there is only one column as explained in [Section 2c](#delete-convenience), we can not do `out_cols := lapply(.SD, max)`. That would result in adding one new column named `out_cols`. Instead we should do either `c(out_cols)` or simply `(out_cols)`. Wrapping the variable name with `(` is enough to differentiate between the two cases. * The `LHS := RHS` form allows us to operate on multiple columns. In the RHS, to compute the `max` on columns specified in `.SDcols`, we make use of the base function `lapply()` along with `.SD` in the same way as we have seen before in the *"Introduction to data.table"* vignette. It returns a list of two elements, containing the maximum value corresponding to `dep_delay` and `arr_delay` for each group. # Before moving on to the next section, let's clean up the newly created columns `speed`, `max_speed`, `max_dep_delay` and `max_arr_delay`. ```{r} # RHS gets automatically recycled to length of LHS flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL] head(flights) ``` ## 3) `:=` and `copy()` `:=` modifies the input object by reference. 
Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use `copy()` function, as we will see in a moment. ### a) `:=` for its side effect Let's say we would like to create a function that would return the *maximum speed* for each month. But at the same time, we would also like to add the column `speed` to *flights*. We could write a simple function as follows: ```{r} foo <- function(DT) { DT[, speed := distance / (air_time/60)] DT[, .(max_speed = max(speed)), by = month] } ans = foo(flights) head(flights) head(ans) ``` #### {.bs-callout .bs-callout-info} * Note that the new column `speed` has been added to `flights` *data.table*. This is because `:=` performs operations by reference. Since `DT` (the function argument) and `flights` refer to the same object in memory, modifying `DT` also reflects on `flights`. * And `ans` contains the maximum speed for each month. ### b) The `copy()` function In the previous section, we used `:=` for its side effect. But of course this may not be always desirable. Sometimes, we would like to pass a *data.table* object to a function, and might want to use the `:=` operator, but *wouldn't* want to update the original object. We can accomplish this using the function `copy()`. #### {.bs-callout .bs-callout-info} The `copy()` function *deep* copies the input object and therefore any subsequent update by reference operations performed on the copied object will not affect the original object. # There are two particular places where `copy()` function is essential: 1. Contrary to the situation we have seen in the previous point, we may not want the input data.table to a function to be modified *by reference*. As an example, let's consider the task in the previous section, except we don't want to modify `flights` by reference. 
Let's first delete the `speed` column we generated in the previous section. ```{r} flights[, speed := NULL] ``` Now, we could accomplish the task as follows: ```{r} foo <- function(DT) { DT <- copy(DT) ## deep copy DT[, speed := distance / (air_time/60)] ## doesn't affect 'flights' DT[, .(max_speed = max(speed)), by = month] } ans <- foo(flights) head(flights) head(ans) ``` #### {.bs-callout .bs-callout-info} * Using `copy()` function did not update `flights` *data.table* by reference. It doesn't contain the column `speed`. * And `ans` contains the maximum speed corresponding to each month. However we could improve this functionality further by *shallow* copying instead of *deep* copying. In fact, we would very much like to [provide this functionality for `v1.9.8`](https://github.com/Rdatatable/data.table/issues/617). We will touch up on this again in the *data.table design* vignette. # 2. When we store the column names on to a variable, e.g., `DT_n = names(DT)`, and then *add/update/delete* column(s) *by reference*. It would also modify `DT_n`, unless we do `copy(names(DT))`. ```{r} DT = data.table(x = 1L, y = 2L) DT_n = names(DT) DT_n ## add a new column by reference DT[, z := 3L] ## DT_n also gets updated DT_n ## use `copy()` DT_n = copy(names(DT)) DT[, w := 4L] ## DT_n doesn't get updated DT_n ``` ## Summary #### The `:=` operator {.bs-callout .bs-callout-info} * It is used to *add/update/delete* columns by reference. * We have also seen how to use `:=` along with `i` and `by` the same way as we have seen in the *Introduction to data.table* vignette. We can in the same way use `keyby`, chain operations together, and pass expressions to `by` as well all in the same way. The syntax is *consistent*. * We can use `:=` for its side effect or use `copy()` to not modify the original object while updating by reference. # So far we have seen a whole lot in `j`, and how to combine it with `by` and little of `i`. 
Let's turn our attention back to `i` in the next vignette *"Keys and fast binary search based subset"* to perform *blazing fast subsets* by *keying data.tables*. *** data.table/inst/doc/datatable-intro.html0000644000175100001440000016070413172212362017760 0ustar hornikusers Data analysis using data.table

This vignette introduces the data.table syntax, its general form, how to subset rows, select and compute on columns and perform aggregations by group. Familiarity with data.frame data structure from base R is useful, but not essential to follow this vignette.


Data analysis using data.table

Data manipulation operations such as subset, group, update, join etc., are all inherently related. Keeping these related operations together allows for:

  • concise and consistent syntax irrespective of the set of operations you would like to perform to achieve your end goal.

  • performing analysis fluidly without the cognitive burden of having to map each operation to a particular function from a set of functions available before to perform the analysis.

  • automatically optimising operations internally, and very effectively, by knowing precisely the data required for each operation and therefore very fast and memory efficient.

Briefly, if you are interested in reducing programming and compute time tremendously, then this package is for you. The philosophy that data.table adheres to makes this possible. Our goal is to illustrate it through this series of vignettes.

Data {#data}

In this vignette, we will use NYC-flights14 data. It contains On-Time flights data from the Bureau of Transportation Statistics for all the flights that departed from New York City airports in 2014 (inspired by nycflights13). The data is available only for Jan-Oct'14.

We can use data.table's fast file reader fread to load flights directly as follows:

flights <- fread("flights14.csv")
flights
#         year month day dep_delay arr_delay carrier origin dest air_time distance hour
#      1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
#      2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
#      3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
#      4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
#      5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
#     ---                                                                              
# 253312: 2014    10  31         1       -30      UA    LGA  IAH      201     1416   14
# 253313: 2014    10  31        -5       -14      UA    EWR  IAH      189     1400    8
# 253314: 2014    10  31        -8        16      MQ    LGA  RDU       83      431   11
# 253315: 2014    10  31        -4        15      MQ    LGA  DTW       75      502   11
# 253316: 2014    10  31        -5         1      MQ    LGA  SDF      110      659    8
dim(flights)
# [1] 253316     11

Aside: fread accepts http and https URLs directly as well as operating system commands such as sed and awk output. See ?fread for examples.

Introduction

In this vignette, we will

  1. start with basics - what is a data.table, its general form, how to subset rows, select and compute on columns

  2. and then we will look at performing data aggregations by group,

1. Basics {#basics-1}

a) What is data.table? {#what-is-datatable-1a}

data.table is an R package that provides an enhanced version of data.frames. In the Data section, we already created a data.table using fread(). We can also create one using the data.table() function. Here is an example:

DT = data.table(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18)
DT
#    ID a  b  c
# 1:  b 1  7 13
# 2:  b 2  8 14
# 3:  b 3  9 15
# 4:  a 4 10 16
# 5:  a 5 11 17
# 6:  c 6 12 18
class(DT$ID)
# [1] "character"

You can also convert existing objects to a data.table using as.data.table().

Note that: {.bs-callout .bs-callout-info}

  • Unlike data.frames, columns of character type are never converted to factors by default.

  • Row numbers are printed with a : in order to visually separate the row number from the first column.

  • When the number of rows to print exceeds the global option datatable.print.nrows (default = 100), it automatically prints only the top 5 and bottom 5 rows (as can be seen in the Data section).

    getOption("datatable.print.nrows")
    
  • data.table doesn't set or use row names, ever. We will see why in the “Keys and fast binary search based subset” vignette.

b) General form - in what way is a data.table enhanced? {#enhanced-1b}

In contrast to a data.frame, you can do a lot more than just subsetting rows and selecting columns within the frame of a data.table, i.e., within [ ... ]. To understand it we will have to first look at the general form of data.table syntax, as shown below:

DT[i, j, by]

##   R:      i                 j        by
## SQL:  where   select | update  group by

Users who have a SQL background might perhaps immediately relate to this syntax.

The way to read it (out loud) is: {.bs-callout .bs-callout-info}

Take DT, subset rows using i, then calculate j, grouped by by.

#

Let's begin by looking at i and j first - subsetting rows and operating on columns.

c) Subset rows in i {#subset-i-1c}

– Get all the flights with “JFK” as the origin airport in the month of June.

ans <- flights[origin == "JFK" & month == 6L]
head(ans)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     6   1        -9        -5      AA    JFK  LAX      324     2475    8
# 2: 2014     6   1       -10       -13      AA    JFK  LAX      329     2475   12
# 3: 2014     6   1        18        -1      AA    JFK  LAX      326     2475    7
# 4: 2014     6   1        -6       -16      AA    JFK  LAX      320     2475   10
# 5: 2014     6   1        -4       -45      AA    JFK  LAX      326     2475   18
# 6: 2014     6   1        -6       -23      AA    JFK  LAX      329     2475   14

{.bs-callout .bs-callout-info}

  • Within the frame of a data.table, columns can be referred to as if they are variables. Therefore, we simply refer to origin and month as if they are variables. We do not need to add the prefix flights$ each time. However using flights$origin and flights$month would work just fine.

  • The row indices that satisfy the condition origin == "JFK" & month == 6L are computed, and since there is nothing else left to do, a data.table with all columns from flights corresponding to those row indices is simply returned.

  • A comma after the condition is also not required in i. But flights[origin == "JFK" & month == 6L, ] would work just fine. In data.frames however, the comma is necessary.

– Get the first two rows from flights. {#subset-rows-integer}

ans <- flights[1:2]
ans
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11

{.bs-callout .bs-callout-info}

  • In this case, there is no condition. The row indices are already provided in i. We therefore return a data.table with all columns from flights for those row indices.

– Sort flights first by column origin in ascending order, and then by dest in descending order:

We can use the base R function order() to accomplish this.

ans <- flights[order(origin, -dest)]
head(ans)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   5         6        49      EV    EWR  XNA      195     1131    8
# 2: 2014     1   6         7        13      EV    EWR  XNA      190     1131    8
# 3: 2014     1   7        -6       -13      EV    EWR  XNA      179     1131    8
# 4: 2014     1   8        -7       -12      EV    EWR  XNA      184     1131    8
# 5: 2014     1   9        16         7      EV    EWR  XNA      181     1131    8
# 6: 2014     1  13        66        66      EV    EWR  XNA      188     1131    9

order() is internally optimised {.bs-callout .bs-callout-info}

  • We can use “-” on character columns within the frame of a data.table to sort in decreasing order.

  • In addition, order(...) within the frame of a data.table uses data.table's internal fast radix order forder(), which is much faster than base::order. Here's a small example to highlight the difference.

    odt = data.table(col = sample(1e7))
    (t1 <- system.time(ans1 <- odt[base::order(col)]))  ## uses order from base R
    #    user  system elapsed 
    #   0.384   0.000   0.384
    (t2 <- system.time(ans2 <- odt[order(col)]))        ## uses data.table's forder
    #    user  system elapsed 
    #   0.360   0.004   0.364
    (identical(ans1, ans2))
    # [1] TRUE
    

The speedup here is ~1x. We will discuss data.table's fast order in more detail in the data.table internals vignette.

  • This is so that you can improve performance tremendously while using already familiar functions.

#

d) Select column(s) in j {#select-j-1d}

– Select arr_delay column, but return it as a vector.

ans <- flights[, arr_delay]
head(ans)
# [1]  13  13   9 -26   1   0

{.bs-callout .bs-callout-info}

  • Since columns can be referred to as if they are variables within the frame of data.tables, we directly refer to the variable we want to subset. Since we want all the rows, we simply skip i.

  • It returns all the rows for the column arr_delay.

– Select arr_delay column, but return as a data.table instead.

ans <- flights[, list(arr_delay)]
head(ans)
#    arr_delay
# 1:        13
# 2:        13
# 3:         9
# 4:       -26
# 5:         1
# 6:         0

{.bs-callout .bs-callout-info}

  • We wrap the variables (column names) within list(), which ensures that a data.table is returned. In case of a single column name, not wrapping with list() returns a vector instead, as seen in the previous example.

  • data.table also allows using .() to wrap columns with. It is an alias to list(); they both mean the same. Feel free to use whichever you prefer.

    We will continue to use .() from here on.

#

data.tables (and data.frames) are internally lists as well, but with all its columns of equal length and with a class attribute. Allowing j to return a list enables converting and returning a data.table very efficiently.

Tip: {.bs-callout .bs-callout-warning #tip-1}

As long as j-expression returns a list, each element of the list will be converted to a column in the resulting data.table. This makes j quite powerful, as we will see shortly.

– Select both arr_delay and dep_delay columns.

ans <- flights[, .(arr_delay, dep_delay)]
head(ans)
#    arr_delay dep_delay
# 1:        13        14
# 2:        13        -3
# 3:         9         2
# 4:       -26        -8
# 5:         1         2
# 6:         0         4

## alternatively
# ans <- flights[, list(arr_delay, dep_delay)]

{.bs-callout .bs-callout-info}

  • Wrap both columns within .(), or list(). That's it.

#

– Select both arr_delay and dep_delay columns and rename them to delay_arr and delay_dep.

Since .() is just an alias for list(), we can name columns as we would while creating a list.

ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)]
head(ans)
#    delay_arr delay_dep
# 1:        13        14
# 2:        13        -3
# 3:         9         2
# 4:       -26        -8
# 5:         1         2
# 6:         0         4

That's it.

e) Compute or do in j

– How many trips have had total delay < 0?

ans <- flights[, sum((arr_delay + dep_delay) < 0)]
ans
# [1] 141814

What's happening here? {.bs-callout .bs-callout-info}

  • data.table's j can handle more than just selecting columns - it can handle expressions, i.e., compute on columns. This shouldn't be surprising, as columns can be referred to as if they are variables. Then we should be able to compute by calling functions on those variables. And that's precisely what happens here.

f) Subset in i and do in j

– Calculate the average arrival and departure delay for all flights with “JFK” as the origin airport in the month of June.

ans <- flights[origin == "JFK" & month == 6L,
               .(m_arr = mean(arr_delay), m_dep = mean(dep_delay))]
ans
#       m_arr    m_dep
# 1: 5.839349 9.807884

{.bs-callout .bs-callout-info}

  • We first subset in i to find matching row indices where origin airport equals “JFK”, and month equals 6. At this point, we do not subset the entire data.table corresponding to those rows.

  • Now, we look at j and find that it uses only two columns. And what we have to do is to compute their mean(). Therefore we subset just those columns corresponding to the matching rows, and compute their mean().

Because the three main components of the query (i, j and by) are together inside [...], data.table can see all three and optimise the query altogether before evaluation, not each separately. We are able to therefore avoid the entire subset, for both speed and memory efficiency.

– How many trips have been made in 2014 from “JFK” airport in the month of June?

ans <- flights[origin == "JFK" & month == 6L, length(dest)]
ans
# [1] 8422

The function length() requires an input argument. We just needed to compute the number of rows in the subset. We could have used any other column as input argument to length() really.

This type of operation occurs so frequently, especially while grouping (as we will see in the next section), that data.table provides a special symbol .N for it.

Special symbol .N: {.bs-callout .bs-callout-info #special-N}

.N is a special in-built variable that holds the number of observations in the current group. It is particularly useful when combined with by as we'll see in the next section. In the absence of group by operations, it simply returns the number of rows in the subset.

# So we can now accomplish the same task by using .N as follows:

ans <- flights[origin == "JFK" & month == 6L, .N]
ans
# [1] 8422

{.bs-callout .bs-callout-info}

  • Once again, we subset in i to get the row indices where origin airport equals “JFK”, and month equals 6.

  • We see that j uses only .N and no other columns. Therefore the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices).

  • Note that we did not wrap .N with list() or .(). Therefore a vector is returned.

We could have accomplished the same operation by doing nrow(flights[origin == "JFK" & month == 6L]). However, it would have to subset the entire data.table first corresponding to the row indices in i and then return the rows using nrow(), which is unnecessary and inefficient. We will cover this and other optimisation aspects in detail under the data.table design vignette.

g) Great! But how can I refer to columns by names in j (like in a data.frame)?

You can refer to column names the data.frame way using with = FALSE.

– Select both arr_delay and dep_delay columns the data.frame way.

ans <- flights[, c("arr_delay", "dep_delay"), with = FALSE]
head(ans)
#    arr_delay dep_delay
# 1:        13        14
# 2:        13        -3
# 3:         9         2
# 4:       -26        -8
# 5:         1         2
# 6:         0         4

The argument is named with after the R function with() because of similar functionality. Suppose you've a data.frame DF and you'd like to subset all rows where x > 1.

DF = data.frame(x = c(1,1,1,2,2,3,3,3), y = 1:8)

## (1) normal way
DF[DF$x > 1, ] # data.frame needs that ',' as well
#   x y
# 4 2 4
# 5 2 5
# 6 3 6
# 7 3 7
# 8 3 8

## (2) using with
DF[with(DF, x > 1), ]
#   x y
# 4 2 4
# 5 2 5
# 6 3 6
# 7 3 7
# 8 3 8

{.bs-callout .bs-callout-info #with_false}

  • Using with() in (2) allows using DF's column x as if it were a variable.

    Hence the argument name with in data.table. Setting with = FALSE disables the ability to refer to columns as if they are variables, thereby restoring the “data.frame mode”.

  • We can also deselect columns using - or !. For example:

    ## not run
    
    # returns all columns except arr_delay and dep_delay
    ans <- flights[, !c("arr_delay", "dep_delay"), with = FALSE]
    # or
    ans <- flights[, -c("arr_delay", "dep_delay"), with = FALSE]
    
  • From v1.9.5+, we can also select by specifying start and end column names, for e.g, year:day to select the first three columns.

    ## not run
    
    # returns year,month and day
    ans <- flights[, year:day, with = FALSE]
    # returns day, month and year
    ans <- flights[, day:year, with = FALSE]
    # returns all columns except year, month and day
    ans <- flights[, -(year:day), with = FALSE]
    ans <- flights[, !(year:day), with = FALSE]
    

    This is particularly handy while working interactively.

#

with = TRUE is default in data.table because we can do much more by allowing j to handle expressions - especially when combined with by as we'll see in a moment.

2. Aggregations

We've already seen i and j from data.table's general form in the previous section. In this section, we'll see how they can be combined together with by to perform operations by group. Let's look at some examples.

a) Grouping using by

– How can we get the number of trips corresponding to each origin airport?

ans <- flights[, .(.N), by = .(origin)]
ans
#    origin     N
# 1:    JFK 81483
# 2:    LGA 84433
# 3:    EWR 87400

## or equivalently using a character vector in 'by'
# ans <- flights[, .(.N), by = "origin"]

{.bs-callout .bs-callout-info}

  • We know .N is a special variable that holds the number of rows in the current group. Grouping by origin obtains the number of rows, .N, for each group.

  • By doing head(flights) you can see that the origin airports occur in the order “JFK”, “LGA” and “EWR”. The original order of grouping variables is preserved in the result.

  • Since we did not provide a name for the column returned in j, it was named N automatically by recognising the special symbol .N.

  • by also accepts a character vector of column names. It is particularly useful to program with, for e.g., designing a function with the columns to group by as a function argument.

  • When there's only one column or expression to refer to in j and by, we can drop the .() notation. This is purely for convenience. We could instead do:

    ans <- flights[, .N, by = origin]
    ans
    #    origin     N
    # 1:    JFK 81483
    # 2:    LGA 84433
    # 3:    EWR 87400
    

    We'll use this convenient form wherever applicable hereafter.

#

– How can we calculate the number of trips for each origin airport for carrier code “AA”? {#origin-.N}

The unique carrier code “AA” corresponds to American Airlines Inc.

ans <- flights[carrier == "AA", .N, by = origin]
ans
#    origin     N
# 1:    JFK 11923
# 2:    LGA 11730
# 3:    EWR  2649

{.bs-callout .bs-callout-info}

  • We first obtain the row indices for the expression carrier == "AA" from i.

  • Using those row indices, we obtain the number of rows while grouped by origin. Once again no columns are actually materialised here, because the j-expression does not require any columns to be actually subsetted and is therefore fast and memory efficient.

– How can we get the total number of trips for each origin, dest pair for carrier code “AA”? {#origin-dest-.N}

ans <- flights[carrier == "AA", .N, by = .(origin,dest)]
head(ans)
#    origin dest    N
# 1:    JFK  LAX 3387
# 2:    LGA  PBI  245
# 3:    EWR  LAX   62
# 4:    JFK  MIA 1876
# 5:    JFK  SEA  298
# 6:    EWR  MIA  848

## or equivalently using a character vector in 'by'
# ans <- flights[carrier == "AA", .N, by = c("origin", "dest")]

{.bs-callout .bs-callout-info}

  • by accepts multiple columns. We just provide all the columns by which to group.

– How can we get the average arrival and departure delay for each origin, dest pair for each month for carrier code “AA”? {#origin-dest-month}

ans <- flights[carrier == "AA",
        .(mean(arr_delay), mean(dep_delay)),
        by = .(origin, dest, month)]
ans
#      origin dest month         V1         V2
#   1:    JFK  LAX     1   6.590361 14.2289157
#   2:    LGA  PBI     1  -7.758621  0.3103448
#   3:    EWR  LAX     1   1.366667  7.5000000
#   4:    JFK  MIA     1  15.720670 18.7430168
#   5:    JFK  SEA     1  14.357143 30.7500000
#  ---                                        
# 196:    LGA  MIA    10  -6.251799 -1.4208633
# 197:    JFK  MIA    10  -1.880184  6.6774194
# 198:    EWR  PHX    10  -3.032258 -4.2903226
# 199:    JFK  MCO    10 -10.048387 -1.6129032
# 200:    JFK  DCA    10  16.483871 15.5161290

{.bs-callout .bs-callout-info}

  • We did not provide column names for expressions in j, they were automatically generated (V1, V2).

  • Once again, note that the input order of grouping columns is preserved in the result.

#

Now what if we would like to order the result by those grouping columns origin, dest and month?

b) keyby

data.table retaining the original order of groups is intentional and by design. There are cases when preserving the original order is essential. But at times we would like to automatically sort by the variables we grouped by.

– So how can we directly order by all the grouping variables?

ans <- flights[carrier == "AA",
        .(mean(arr_delay), mean(dep_delay)),
        keyby = .(origin, dest, month)]
ans
#      origin dest month         V1         V2
#   1:    EWR  DFW     1   6.427673 10.0125786
#   2:    EWR  DFW     2  10.536765 11.3455882
#   3:    EWR  DFW     3  12.865031  8.0797546
#   4:    EWR  DFW     4  17.792683 12.9207317
#   5:    EWR  DFW     5  18.487805 18.6829268
#  ---                                        
# 196:    LGA  PBI     1  -7.758621  0.3103448
# 197:    LGA  PBI     2  -7.865385  2.4038462
# 198:    LGA  PBI     3  -5.754098  3.0327869
# 199:    LGA  PBI     4 -13.966667 -4.7333333
# 200:    LGA  PBI     5 -10.357143 -6.8571429

{.bs-callout .bs-callout-info}

  • All we did was to change by to keyby. This automatically orders the result by the grouping variables in increasing order. Note that keyby is applied after performing the operation, i.e., on the computed result.

Keys: Actually keyby does a little more than just ordering. It also sets a key after ordering by setting an attribute called sorted. But we'll learn more about keys in the next vignette.

For now, all you've to know is you can use keyby to automatically order by the columns specified in by.

c) Chaining

Let's reconsider the task of getting the total number of trips for each origin, dest pair for carrier “AA”.

ans <- flights[carrier == "AA", .N, by = .(origin, dest)]

– How can we order ans using the columns origin in ascending order, and dest in descending order?

We can store the intermediate result in a variable, and then use order(origin, -dest) on that variable. It seems fairly straightforward.

ans <- ans[order(origin, -dest)]
head(ans)
#    origin dest    N
# 1:    EWR  PHX  121
# 2:    EWR  MIA  848
# 3:    EWR  LAX   62
# 4:    EWR  DFW 1618
# 5:    JFK  STT  229
# 6:    JFK  SJU  690

{.bs-callout .bs-callout-info}

  • Recall that we can use “-” on a character column in order() within the frame of a data.table. This is possible due to data.table's internal query optimisation.

  • Also recall that order(...) within the frame of a data.table is automatically optimised to use data.table's internal fast radix order forder() for speed. So you can keep using the already familiar base R functions without compromising on the speed or memory efficiency that data.table offers. We will cover this in more detail in the data.table internals vignette.

#

But this requires having to assign the intermediate result and then overwriting that result. We can do one better and avoid this intermediate assignment on to a variable altogether by chaining expressions.

ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)]
head(ans, 10)
#     origin dest    N
#  1:    EWR  PHX  121
#  2:    EWR  MIA  848
#  3:    EWR  LAX   62
#  4:    EWR  DFW 1618
#  5:    JFK  STT  229
#  6:    JFK  SJU  690
#  7:    JFK  SFO 1312
#  8:    JFK  SEA  298
#  9:    JFK  SAN  299
# 10:    JFK  ORD  432

{.bs-callout .bs-callout-info}

  • We can tack expressions one after another, forming a chain of operations, i.e., DT[ ... ][ ... ][ ... ].

  • Or you can also chain them vertically:

    DT[ ...
     ][ ...
     ][ ...
     ]
    

d) Expressions in by

– Can by accept expressions as well or just take columns?

Yes it does. As an example, if we would like to find out how many flights started late but arrived early (or on time), started and arrived late etc…

ans <- flights[, .N, .(dep_delay>0, arr_delay>0)]
ans
#    dep_delay arr_delay      N
# 1:      TRUE      TRUE  72836
# 2:     FALSE      TRUE  34583
# 3:     FALSE     FALSE 119304
# 4:      TRUE     FALSE  26593

{.bs-callout .bs-callout-info}

  • The last row corresponds to dep_delay > 0 = TRUE and arr_delay > 0 = FALSE. We can see that 26593 flights started late but arrived early (or on time).

  • Note that we did not provide any names to by-expression. And names have been automatically assigned in the result.

  • You can provide other columns along with expressions, for example: DT[, .N, by = .(a, b>0)].

e) Multiple columns in j - .SD

– Do we have to compute mean() for each column individually?

It is of course not practical to have to type mean(myCol) for every column one by one. What if you had 100 columns to compute mean() of?

How can we do this efficiently? To get there, refresh on this tip - “As long as j-expression returns a list, each element of the list will be converted to a column in the resulting data.table”. Suppose we can refer to the data subset for each group as a variable while grouping, then we can loop through all the columns of that variable using the already familiar base function lapply(). We don't have to learn any new function.

Special symbol .SD: {.bs-callout .bs-callout-info #special-SD}

data.table provides a special symbol, called .SD. It stands for Subset of Data. It by itself is a data.table that holds the data for the current group defined using by.

Recall that a data.table is internally a list as well with all its columns of equal length.

#

Let's use the data.table DT from before to get a glimpse of what .SD looks like.

DT
#    ID a  b  c
# 1:  b 1  7 13
# 2:  b 2  8 14
# 3:  b 3  9 15
# 4:  a 4 10 16
# 5:  a 5 11 17
# 6:  c 6 12 18

DT[, print(.SD), by = ID]
#    a b  c
# 1: 1 7 13
# 2: 2 8 14
# 3: 3 9 15
#    a  b  c
# 1: 4 10 16
# 2: 5 11 17
#    a  b  c
# 1: 6 12 18
# Empty data.table (0 rows) of 1 col: ID

{.bs-callout .bs-callout-info}

  • .SD contains all the columns except the grouping columns by default.

  • It is also generated by preserving the original order - data corresponding to ID = "b", then ID = "a", and then ID = "c".

#

To compute on (multiple) columns, we can then simply use the base R function lapply().

DT[, lapply(.SD, mean), by = ID]
#    ID   a    b    c
# 1:  b 2.0  8.0 14.0
# 2:  a 4.5 10.5 16.5
# 3:  c 6.0 12.0 18.0

{.bs-callout .bs-callout-info}

  • .SD holds the rows corresponding to columns a, b and c for that group. We compute the mean() on each of these columns using the already familiar base function lapply().

  • Each group returns a list of three elements containing the mean value which will become the columns of the resulting data.table.

  • Since lapply() returns a list, there is no need to wrap it with an additional .() (if necessary, refer to this tip).

#

We are almost there. There is one little thing left to address. In our flights data.table, we only wanted to calculate the mean() of two columns arr_delay and dep_delay. But .SD would contain all the columns other than the grouping variables by default.

– How can we specify just the columns we would like to compute the mean() on?

.SDcols {.bs-callout .bs-callout-info}

Using the argument .SDcols. It accepts either column names or column indices. For example, .SDcols = c("arr_delay", "dep_delay") ensures that .SD contains only these two columns for each group.

Similar to the with = FALSE section, you can also provide the columns to remove instead of columns to keep using - or ! sign as well as select consecutive columns as colA:colB and deselect consecutive columns as !(colA:colB) or -(colA:colB).

# Now let us try to use .SD along with .SDcols to get the mean() of arr_delay and dep_delay columns grouped by origin, dest and month.

flights[carrier == "AA",                       ## Only on trips with carrier "AA"
        lapply(.SD, mean),                     ## compute the mean
        by = .(origin, dest, month),           ## for every 'origin,dest,month'
        .SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols
#      origin dest month  arr_delay  dep_delay
#   1:    JFK  LAX     1   6.590361 14.2289157
#   2:    LGA  PBI     1  -7.758621  0.3103448
#   3:    EWR  LAX     1   1.366667  7.5000000
#   4:    JFK  MIA     1  15.720670 18.7430168
#   5:    JFK  SEA     1  14.357143 30.7500000
#  ---                                        
# 196:    LGA  MIA    10  -6.251799 -1.4208633
# 197:    JFK  MIA    10  -1.880184  6.6774194
# 198:    EWR  PHX    10  -3.032258 -4.2903226
# 199:    JFK  MCO    10 -10.048387 -1.6129032
# 200:    JFK  DCA    10  16.483871 15.5161290

f) Subset .SD for each group:

– How can we return the first two rows for each month?

ans <- flights[, head(.SD, 2), by = month]
head(ans)
#    month year day dep_delay arr_delay carrier origin dest air_time distance hour
# 1:     1 2014   1        14        13      AA    JFK  LAX      359     2475    9
# 2:     1 2014   1        -3        13      AA    JFK  LAX      363     2475   11
# 3:     2 2014   1        -1         1      AA    JFK  LAX      358     2475    8
# 4:     2 2014   1        -5         3      AA    JFK  LAX      358     2475   11
# 5:     3 2014   1       -11        36      AA    JFK  LAX      375     2475    8
# 6:     3 2014   1        -3        14      AA    JFK  LAX      368     2475   11

{.bs-callout .bs-callout-info}

  • .SD is a data.table that holds all the rows for that group. We simply subset the first two rows as we have seen here already.

  • For each group, head(.SD, 2) returns the first two rows as a data.table which is also a list. So we do not have to wrap it with .().

g) Why keep j so flexible?

So that we have a consistent syntax and keep using already existing (and familiar) base functions instead of learning new functions. To illustrate, let us use the data.table DT we created at the very beginning under What is a data.table? section.

– How can we concatenate columns a and b for each group in ID?

DT[, .(val = c(a,b)), by = ID]
#     ID val
#  1:  b   1
#  2:  b   2
#  3:  b   3
#  4:  b   7
#  5:  b   8
#  6:  b   9
#  7:  a   4
#  8:  a   5
#  9:  a  10
# 10:  a  11
# 11:  c   6
# 12:  c  12

{.bs-callout .bs-callout-info}

  • That's it. There is no special syntax required. All we need to know is the base function c() which concatenates vectors and the tip from before.

– What if we would like to have all the values of column a and b concatenated, but returned as a list column?

DT[, .(val = list(c(a,b))), by = ID]
#    ID         val
# 1:  b 1,2,3,7,8,9
# 2:  a  4, 5,10,11
# 3:  c        6,12

{.bs-callout .bs-callout-info}

  • Here, we first concatenate the values with c(a,b) for each group, and wrap that with list(). So for each group, we return a list of all concatenated values.

  • Note those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others.

# Once you start internalising usage in j, you will realise how powerful the syntax can be. A very useful way to understand it is by playing around, with the help of print().

For example:

## (1) look at the difference between
DT[, print(c(a,b)), by = ID]
# [1] 1 2 3 7 8 9
# [1]  4  5 10 11
# [1]  6 12
# Empty data.table (0 rows) of 1 col: ID

## (2) and
DT[, print(list(c(a,b))), by = ID]
# [[1]]
# [1] 1 2 3 7 8 9
# 
# [[1]]
# [1]  4  5 10 11
# 
# [[1]]
# [1]  6 12
# Empty data.table (0 rows) of 1 col: ID

In (1), for each group, a vector is returned, with length = 6,4,2 here. However (2) returns a list of length 1 for each group, with its first element holding vectors of length 6,4,2. Therefore (1) results in a total length of 6+4+2 = 12, whereas (2) results in a total length of 1+1+1 = 3.

Summary

The general form of data.table syntax is:

DT[i, j, by]

We have seen so far that,

Using i: {.bs-callout .bs-callout-info}

  • We can subset rows similar to a data.frame - except you don't have to use DT$ repetitively since columns within the frame of a data.table are seen as if they are variables.

  • We can also sort a data.table using order(), which internally uses data.table's fast order for performance.

We can do much more in i by keying a data.table, which allows blazing fast subsets and joins. We will see this in the “Keys and fast binary search based subsets” and “Joins and rolling joins” vignettes.

Using j: {.bs-callout .bs-callout-info}

  1. Select columns the data.table way: DT[, .(colA, colB)].

  2. Select columns the data.frame way: DT[, c("colA", "colB"), with = FALSE].

  3. Compute on columns: DT[, .(sum(colA), mean(colB))].

  4. Provide names if necessary: DT[, .(sA = sum(colA), mB = mean(colB))].

  5. Combine with i: DT[colA > value, sum(colB)].

#

Using by: {.bs-callout .bs-callout-info}

  • Using by, we can group by columns by specifying a list of columns or a character vector of column names or even expressions. The flexibility of j, combined with by and i makes for a very powerful syntax.

  • by can handle multiple columns and also expressions.

  • We can keyby grouping columns to automatically sort the grouped result.

  • We can use .SD and .SDcols in j to operate on multiple columns using already familiar base functions. Here are some examples:

1. `DT[, lapply(.SD, fun), by = ..., .SDcols = ...]` - applies `fun` to all columns specified in `.SDcols` while grouping by the columns specified in `by`.

2. `DT[, head(.SD, 2), by = ...]` - return the first two rows for each group.

3. `DT[col > val, head(.SD, 1), by = ...]` - combine `i` along with `j` and `by`.

#

And remember the tip: {.bs-callout .bs-callout-warning}

As long as j returns a list, each element of the list will become a column in the resulting data.table.

#

We will see how to add/update/delete columns by reference and how to combine them with i and by in the next vignette.


data.table/inst/doc/datatable-intro.Rmd0000644000175100001440000007142113172210047017531 0ustar hornikusers--- title: "Introduction to data.table" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to data.table} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` This vignette introduces the *data.table* syntax, its general form, how to *subset* rows, *select and compute* on columns and perform aggregations *by group*. Familiarity with *data.frame* data structure from base R is useful, but not essential to follow this vignette. *** ## Data analysis using data.table Data manipulation operations such as *subset*, *group*, *update*, *join* etc., are all inherently related. Keeping these *related operations together* allows for: * *concise* and *consistent* syntax irrespective of the set of operations you would like to perform to achieve your end goal. * performing analysis *fluidly* without the cognitive burden of having to map each operation to a particular function from a set of functions available before to perform the analysis. * *automatically* optimising operations internally, and very effectively, by knowing precisely the data required for each operation and therefore very fast and memory efficient. Briefly, if you are interested in reducing *programming* and *compute* time tremendously, then this package is for you. The philosophy that *data.table* adheres to makes this possible. Our goal is to illustrate it through this series of vignettes. ## Data {#data} In this vignette, we will use [NYC-flights14](https://github.com/arunsrinivasan/flights/wiki/NYC-Flights-2014-data) data. 
It contains On-Time flights data from the [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236) for all the flights that departed from New York City airports in 2014 (inspired by [nycflights13](https://github.com/hadley/nycflights13)). The data is available only for Jan-Oct'14. We can use *data.table's* fast file reader `fread` to load *flights* directly as follows: ```{r echo = FALSE} options(width = 100L) ``` ```{r} flights <- fread("flights14.csv") flights dim(flights) ``` Aside: `fread` accepts `http` and `https` URLs directly as well as operating system commands such as `sed` and `awk` output. See `?fread` for examples. ## Introduction In this vignette, we will 1. start with basics - what is a *data.table*, its general form, how to *subset* rows, *select and compute* on columns 2. and then we will look at performing data aggregations by group. ## 1. Basics {#basics-1} ### a) What is data.table? {#what-is-datatable-1a} *data.table* is an R package that provides **an enhanced version** of *data.frames*. In the [Data](#data) section, we already created a *data.table* using `fread()`. We can also create one using the `data.table()` function. Here is an example: ```{r} DT = data.table(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18) DT class(DT$ID) ``` You can also convert existing objects to a *data.table* using `as.data.table()`. #### Note that: {.bs-callout .bs-callout-info} * Unlike *data.frames*, columns of `character` type are *never* converted to `factors` by default. * Row numbers are printed with a `:` in order to visually separate the row number from the first column. * When the number of rows to print exceeds the global option `datatable.print.nrows` (default = `r getOption("datatable.print.nrows")`), it automatically prints only the top 5 and bottom 5 rows (as can be seen in the [Data](#data) section). 
```{.r} getOption("datatable.print.nrows") ``` * *data.table* doesn't set or use *row names*, ever. We will see why in the *"Keys and fast binary search based subset"* vignette. ### b) General form - in what way is a data.table *enhanced*? {#enhanced-1b} In contrast to a *data.frame*, you can do *a lot more* than just subsetting rows and selecting columns within the frame of a *data.table*, i.e., within `[ ... ]`. To understand it we will have to first look at the *general form* of *data.table* syntax, as shown below: ```{r eval = FALSE} DT[i, j, by] ## R: i j by ## SQL: where select | update group by ``` Users who have a SQL background might perhaps immediately relate to this syntax. #### The way to read it (out loud) is: {.bs-callout .bs-callout-info} Take `DT`, subset rows using `i`, then calculate `j`, grouped by `by`. # Let's begin by looking at `i` and `j` first - subsetting rows and operating on columns. ### c) Subset rows in `i` {#subset-i-1c} #### -- Get all the flights with "JFK" as the origin airport in the month of June. ```{r} ans <- flights[origin == "JFK" & month == 6L] head(ans) ``` #### {.bs-callout .bs-callout-info} * Within the frame of a *data.table*, columns can be referred to *as if they are variables*. Therefore, we simply refer to `origin` and `month` as if they are variables. We do not need to add the prefix `flights$` each time. However using `flights$origin` and `flights$month` would work just fine. * The *row indices* that satisfy the condition `origin == "JFK" & month == 6L` are computed, and since there is nothing else left to do, a *data.table* with all columns from `flights` corresponding to those *row indices* is simply returned. * A comma after the condition is also not required in `i`. But `flights[origin == "JFK" & month == 6L, ]` would work just fine. In *data.frames* however, the comma is necessary. #### -- Get the first two rows from `flights`. 
{#subset-rows-integer} ```{r} ans <- flights[1:2] ans ``` #### {.bs-callout .bs-callout-info} * In this case, there is no condition. The row indices are already provided in `i`. We therefore return a *data.table* with all columns from `flights` for those *row indices*. #### -- Sort `flights` first by column `origin` in *ascending* order, and then by `dest` in *descending* order: We can use the base R function `order()` to accomplish this. ```{r} ans <- flights[order(origin, -dest)] head(ans) ``` #### `order()` is internally optimised {.bs-callout .bs-callout-info} * We can use "-" on *character* columns within the frame of a *data.table* to sort in decreasing order. * In addition, `order(...)` within the frame of a *data.table* uses *data.table*'s internal fast radix order `forder()`, which is much faster than `base::order`. Here's a small example to highlight the difference. ```{r} odt = data.table(col = sample(1e7)) (t1 <- system.time(ans1 <- odt[base::order(col)])) ## uses order from base R (t2 <- system.time(ans2 <- odt[order(col)])) ## uses data.table's forder (identical(ans1, ans2)) ``` The speedup here is **~`r round(t1[3]/t2[3])`x**. We will discuss *data.table*'s fast order in more detail in the *data.table internals* vignette. * This is so that you can improve performance tremendously while using already familiar functions. # ```{r echo = FALSE} rm(odt); rm(ans1); rm(ans2); rm(t1); rm(t2) ``` ### d) Select column(s) in `j` {#select-j-1d} #### -- Select `arr_delay` column, but return it as a *vector*. ```{r} ans <- flights[, arr_delay] head(ans) ``` #### {.bs-callout .bs-callout-info} * Since columns can be referred to as if they are variables within the frame of data.tables, we directly refer to the *variable* we want to subset. Since we want *all the rows*, we simply skip `i`. * It returns *all* the rows for the column `arr_delay`. #### -- Select `arr_delay` column, but return as a *data.table* instead. 
```{r} ans <- flights[, list(arr_delay)] head(ans) ``` #### {.bs-callout .bs-callout-info} * We wrap the *variables* (column names) within `list()`, which ensures that a *data.table* is returned. In case of a single column name, not wrapping with `list()` returns a vector instead, as seen in the [previous example](#select-j-1d). * *data.table* also allows using `.()` to wrap columns with. It is an *alias* to `list()`; they both mean the same. Feel free to use whichever you prefer. We will continue to use `.()` from here on. # *data.tables* (and *data.frames*) are internally *lists* as well, but with all its columns of equal length and with a *class* attribute. Allowing `j` to return a *list* enables converting and returning a *data.table* very efficiently. #### Tip: {.bs-callout .bs-callout-warning #tip-1} As long as `j-expression` returns a *list*, each element of the list will be converted to a column in the resulting *data.table*. This makes `j` quite powerful, as we will see shortly. #### -- Select both `arr_delay` and `dep_delay` columns. ```{r} ans <- flights[, .(arr_delay, dep_delay)] head(ans) ## alternatively # ans <- flights[, list(arr_delay, dep_delay)] ``` #### {.bs-callout .bs-callout-info} * Wrap both columns within `.()`, or `list()`. That's it. # #### -- Select both `arr_delay` and `dep_delay` columns *and* rename them to `delay_arr` and `delay_dep`. Since `.()` is just an alias for `list()`, we can name columns as we would while creating a *list*. ```{r} ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)] head(ans) ``` That's it. ### e) Compute or *do* in `j` #### -- How many trips have had total delay < 0? ```{r} ans <- flights[, sum((arr_delay + dep_delay) < 0)] ans ``` #### What's happening here? {.bs-callout .bs-callout-info} * *data.table*'s `j` can handle more than just *selecting columns* - it can handle *expressions*, i.e., *compute on columns*. 
This shouldn't be surprising, as columns can be referred to as if they are variables. Then we should be able to *compute* by calling functions on those variables. And that's what precisely happens here. ### f) Subset in `i` *and* do in `j` #### -- Calculate the average arrival and departure delay for all flights with "JFK" as the origin airport in the month of June. ```{r} ans <- flights[origin == "JFK" & month == 6L, .(m_arr = mean(arr_delay), m_dep = mean(dep_delay))] ans ``` #### {.bs-callout .bs-callout-info} * We first subset in `i` to find matching *row indices* where `origin` airport equals *"JFK"*, and `month` equals *6*. At this point, we *do not* subset the entire *data.table* corresponding to those rows. * Now, we look at `j` and find that it uses only *two columns*. And what we have to do is to compute their `mean()`. Therefore we subset just those columns corresponding to the matching rows, and compute their `mean()`. Because the three main components of the query (`i`, `j` and `by`) are *together* inside `[...]`, *data.table* can see all three and optimise the query altogether *before evaluation*, not each separately. We are able to therefore avoid the entire subset, for both speed and memory efficiency. #### -- How many trips have been made in 2014 from "JFK" airport in the month of June? ```{r} ans <- flights[origin == "JFK" & month == 6L, length(dest)] ans ``` The function `length()` requires an input argument. We just needed to compute the number of rows in the subset. We could have used any other column as input argument to `length()` really. This type of operation occurs quite frequently, especially while grouping as we will see in the next section, that *data.table* provides a *special symbol* `.N` for it. #### Special symbol `.N`: {.bs-callout .bs-callout-info #special-N} `.N` is a special in-built variable that holds the number of observations in the current group. 
It is particularly useful when combined with `by` as we'll see in the next section. In the absence of group by operations, it simply returns the number of rows in the subset. # So we can now accomplish the same task by using `.N` as follows: ```{r} ans <- flights[origin == "JFK" & month == 6L, .N] ans ``` #### {.bs-callout .bs-callout-info} * Once again, we subset in `i` to get the *row indices* where `origin` airport equals *"JFK"*, and `month` equals *6*. * We see that `j` uses only `.N` and no other columns. Therefore the entire subset is not materialised. We simply return the number of rows in the subset (which is just the length of row indices). * Note that we did not wrap `.N` with `list()` or `.()`. Therefore a vector is returned. We could have accomplished the same operation by doing `nrow(flights[origin == "JFK" & month == 6L])`. However, it would have to subset the entire *data.table* first corresponding to the *row indices* in `i` *and then* return the rows using `nrow()`, which is unnecessary and inefficient. We will cover this and other optimisation aspects in detail under the *data.table design* vignette. ### g) Great! But how can I refer to columns by names in `j` (like in a *data.frame*)? You can refer to column names the *data.frame* way using `with = FALSE`. #### -- Select both `arr_delay` and `dep_delay` columns the *data.frame* way. ```{r} ans <- flights[, c("arr_delay", "dep_delay"), with = FALSE] head(ans) ``` The argument is named `with` after the R function `with()` because of similar functionality. Suppose you've a *data.frame* `DF` and you'd like to subset all rows where `x > 1`. ```{r} DF = data.frame(x = c(1,1,1,2,2,3,3,3), y = 1:8) ## (1) normal way DF[DF$x > 1, ] # data.frame needs that ',' as well ## (2) using with DF[with(DF, x > 1), ] ``` #### {.bs-callout .bs-callout-info #with_false} * Using `with()` in (2) allows using `DF`'s column `x` as if it were a variable. Hence the argument name `with` in *data.table*. 
Setting `with = FALSE` disables the ability to refer to columns as if they are variables, thereby restoring the "*data.frame* mode". * We can also *deselect* columns using `-` or `!`. For example: ```{r eval = FALSE} ## not run # returns all columns except arr_delay and dep_delay ans <- flights[, !c("arr_delay", "dep_delay"), with = FALSE] # or ans <- flights[, -c("arr_delay", "dep_delay"), with = FALSE] ``` * From `v1.9.5+`, we can also select by specifying start and end column names, for e.g, `year:day` to select the first three columns. ```{r eval = FALSE} ## not run # returns year,month and day ans <- flights[, year:day, with = FALSE] # returns day, month and year ans <- flights[, day:year, with = FALSE] # returns all columns except year, month and day ans <- flights[, -(year:day), with = FALSE] ans <- flights[, !(year:day), with = FALSE] ``` This is particularly handy while working interactively. # `with = TRUE` is default in *data.table* because we can do much more by allowing `j` to handle expressions - especially when combined with `by` as we'll see in a moment. ## 2. Aggregations We've already seen `i` and `j` from *data.table*'s general form in the previous section. In this section, we'll see how they can be combined together with `by` to perform operations *by group*. Let's look at some examples. ### a) Grouping using `by` #### -- How can we get the number of trips corresponding to each origin airport? ```{r} ans <- flights[, .(.N), by = .(origin)] ans ## or equivalently using a character vector in 'by' # ans <- flights[, .(.N), by = "origin"] ``` #### {.bs-callout .bs-callout-info} * We know `.N` [is a special variable](#special-N) that holds the number of rows in the current group. Grouping by `origin` obtains the number of rows, `.N`, for each group. * By doing `head(flights)` you can see that the origin airports occur in the order *"JFK"*, *"LGA"* and *"EWR"*. The original order of grouping variables is preserved in the result. 
* Since we did not provide a name for the column returned in `j`, it was named `N` automatically by recognising the special symbol `.N`. * `by` also accepts a character vector of column names. It is particularly useful to program with, e.g., designing a function with the columns to group by as a function argument. * When there's only one column or expression to refer to in `j` and `by`, we can drop the `.()` notation. This is purely for convenience. We could instead do: ```{r} ans <- flights[, .N, by = origin] ans ``` We'll use this convenient form wherever applicable hereafter. # #### -- How can we calculate the number of trips for each origin airport for carrier code *"AA"*? {#origin-.N} The unique carrier code *"AA"* corresponds to *American Airlines Inc.* ```{r} ans <- flights[carrier == "AA", .N, by = origin] ans ``` #### {.bs-callout .bs-callout-info} * We first obtain the row indices for the expression `carrier == "AA"` from `i`. * Using those *row indices*, we obtain the number of rows while grouped by `origin`. Once again no columns are actually materialised here, because the `j-expression` does not require any columns to be actually subsetted and is therefore fast and memory efficient. #### -- How can we get the total number of trips for each `origin, dest` pair for carrier code *"AA"*? {#origin-dest-.N} ```{r} ans <- flights[carrier == "AA", .N, by = .(origin,dest)] head(ans) ## or equivalently using a character vector in 'by' # ans <- flights[carrier == "AA", .N, by = c("origin", "dest")] ``` #### {.bs-callout .bs-callout-info} * `by` accepts multiple columns. We just provide all the columns by which to group. #### -- How can we get the average arrival and departure delay for each `orig,dest` pair for each month for carrier code *"AA"*? 
{#origin-dest-month} ```{r} ans <- flights[carrier == "AA", .(mean(arr_delay), mean(dep_delay)), by = .(origin, dest, month)] ans ``` #### {.bs-callout .bs-callout-info} * We did not provide column names for expressions in `j`, they were automatically generated (`V1`, `V2`). * Once again, note that the input order of grouping columns is preserved in the result. # Now what if we would like to order the result by those grouping columns `origin`, `dest` and `month`? ### b) keyby *data.table* retaining the original order of groups is intentional and by design. There are cases when preserving the original order is essential. But at times we would like to automatically sort by the variables we grouped by. #### -- So how can we directly order by all the grouping variables? ```{r} ans <- flights[carrier == "AA", .(mean(arr_delay), mean(dep_delay)), keyby = .(origin, dest, month)] ans ``` #### {.bs-callout .bs-callout-info} * All we did was to change `by` to `keyby`. This automatically orders the result by the grouping variables in increasing order. Note that `keyby()` is applied after performing the operation, i.e., on the computed result. **Keys:** Actually `keyby` does a little more than *just ordering*. It also *sets a key* after ordering by setting an *attribute* called `sorted`. But we'll learn more about `keys` in the next vignette. For now, all you've to know is you can use `keyby` to automatically order by the columns specified in `by`. ### c) Chaining Let's reconsider the task of [getting the total number of trips for each `origin, dest` pair for carrier *"AA"*](#origin-dest-.N). ```{r} ans <- flights[carrier == "AA", .N, by = .(origin, dest)] ``` #### -- How can we order `ans` using the columns `origin` in ascending order, and `dest` in descending order? We can store the intermediate result in a variable, and then use `order(origin, -dest)` on that variable. It seems fairly straightforward. 
```{r} ans <- ans[order(origin, -dest)] head(ans) ``` #### {.bs-callout .bs-callout-info} * Recall that we can use "-" on a *character* column in `order()` within the frame of a *data.table*. This is possible due to *data.table*'s internal query optimisation. * Also recall that `order(...)` within the frame of a *data.table* is *automatically optimised* to use *data.table*'s internal fast radix order `forder()` for speed. So you can keep using the already *familiar* base R functions without compromising on the speed or memory efficiency that *data.table* offers. We will cover this in more detail in the *data.table internals* vignette. # But this requires having to assign the intermediate result and then overwriting that result. We can do one better and avoid this intermediate assignment on to a variable altogether by `chaining` expressions. ```{r} ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)] head(ans, 10) ``` #### {.bs-callout .bs-callout-info} * We can tack expressions one after another, *forming a chain* of operations, i.e., `DT[ ... ][ ... ][ ... ]`. * Or you can also chain them vertically: ```{r eval = FALSE} DT[ ... ][ ... ][ ... ] ``` ### d) Expressions in `by` #### -- Can `by` accept *expressions* as well or just take columns? Yes it does. As an example, if we would like to find out how many flights started late but arrived early (or on time), started and arrived late etc... ```{r} ans <- flights[, .N, .(dep_delay>0, arr_delay>0)] ans ``` #### {.bs-callout .bs-callout-info} * The last row corresponds to `dep_delay > 0 = TRUE` and `arr_delay > 0 = FALSE`. We can see that `r flights[!is.na(arr_delay) & !is.na(dep_delay), .N, .(dep_delay>0, arr_delay>0)][, N[4L]]` flights started late but arrived early (or on time). * Note that we did not provide any names to `by-expression`. And names have been automatically assigned in the result. * You can provide other columns along with expressions, for example: `DT[, .N, by = .(a, b>0)]`. 
### e) Multiple columns in `j` - `.SD` #### -- Do we have to compute `mean()` for each column individually? It is of course not practical to have to type `mean(myCol)` for every column one by one. What if you had a 100 columns to compute `mean()` of? How can we do this efficiently? To get there, refresh on [this tip](#tip-1) - *"As long as j-expression returns a list, each element of the list will be converted to a column in the resulting data.table"*. Suppose we can refer to the *data subset for each group* as a variable *while grouping*, then we can loop through all the columns of that variable using the already familiar base function `lapply()`. We don't have to learn any new function. #### Special symbol `.SD`: {.bs-callout .bs-callout-info #special-SD} *data.table* provides a *special* symbol, called `.SD`. It stands for **S**ubset of **D**ata. It by itself is a *data.table* that holds the data for *the current group* defined using `by`. Recall that a *data.table* is internally a list as well with all its columns of equal length. # Let's use the [*data.table* `DT` from before](#what-is-datatable-1a) to get a glimpse of what `.SD` looks like. ```{r} DT DT[, print(.SD), by = ID] ``` #### {.bs-callout .bs-callout-info} * `.SD` contains all the columns *except the grouping columns* by default. * It is also generated by preserving the original order - data corresponding to `ID = "b"`, then `ID = "a"`, and then `ID = "c"`. # To compute on (multiple) columns, we can then simply use the base R function `lapply()`. ```{r} DT[, lapply(.SD, mean), by = ID] ``` #### {.bs-callout .bs-callout-info} * `.SD` holds the rows corresponding to columns *a*, *b* and *c* for that group. We compute the `mean()` on each of these columns using the already familiar base function `lapply()`. * Each group returns a list of three elements containing the mean value which will become the columns of the resulting `data.table`. 
* Since `lapply()` returns a *list*, there is no need to wrap it with an additional `.()` (if necessary, refer to [this tip](#tip-1)). # We are almost there. There is one little thing left to address. In our `flights` *data.table*, we only wanted to calculate the `mean()` of two columns `arr_delay` and `dep_delay`. But `.SD` would contain all the columns other than the grouping variables by default. #### -- How can we specify just the columns we would like to compute the `mean()` on? #### .SDcols {.bs-callout .bs-callout-info} Using the argument `.SDcols`. It accepts either column names or column indices. For example, `.SDcols = c("arr_delay", "dep_delay")` ensures that `.SD` contains only these two columns for each group. Similar to the [with = FALSE section](#with_false), you can also provide the columns to remove instead of columns to keep using the `-` or `!` sign, as well as select consecutive columns as `colA:colB` and deselect consecutive columns as `!(colA:colB)` or `-(colA:colB)`. # Now let us try to use `.SD` along with `.SDcols` to get the `mean()` of `arr_delay` and `dep_delay` columns grouped by `origin`, `dest` and `month`. ```{r} flights[carrier == "AA", ## Only on trips with carrier "AA" lapply(.SD, mean), ## compute the mean by = .(origin, dest, month), ## for every 'origin,dest,month' .SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols ``` ### f) Subset `.SD` for each group: #### -- How can we return the first two rows for each `month`? ```{r} ans <- flights[, head(.SD, 2), by = month] head(ans) ``` #### {.bs-callout .bs-callout-info} * `.SD` is a *data.table* that holds all the rows for *that group*. We simply subset the first two rows as we have seen [here](#subset-rows-integer) already. * For each group, `head(.SD, 2)` returns the first two rows as a *data.table* which is also a list. So we do not have to wrap it with `.()`. ### g) Why keep `j` so flexible? 
So that we have a consistent syntax and keep using already existing (and familiar) base functions instead of learning new functions. To illustrate, let us use the *data.table* `DT` we created at the very beginning under [What is a data.table?](#what-is-datatable-1a) section. #### -- How can we concatenate columns `a` and `b` for each group in `ID`? ```{r} DT[, .(val = c(a,b)), by = ID] ``` #### {.bs-callout .bs-callout-info} * That's it. There is no special syntax required. All we need to know is the base function `c()` which concatenates vectors and [the tip from before](#tip-1). #### -- What if we would like to have all the values of column `a` and `b` concatenated, but returned as a list column? ```{r} DT[, .(val = list(c(a,b))), by = ID] ``` #### {.bs-callout .bs-callout-info} * Here, we first concatenate the values with `c(a,b)` for each group, and wrap that with `list()`. So for each group, we return a list of all concatenated values. * Note those commas are for display only. A list column can contain any object in each cell, and in this example, each cell is itself a vector and some cells contain longer vectors than others. # Once you start internalising usage in `j`, you will realise how powerful the syntax can be. A very useful way to understand it is by playing around, with the help of `print()`. For example: ```{r} ## (1) look at the difference between DT[, print(c(a,b)), by = ID] ## (2) and DT[, print(list(c(a,b))), by = ID] ``` In (1), for each group, a vector is returned, with length = 6,4,2 here. However (2) returns a list of length 1 for each group, with its first element holding vectors of length 6,4,2. Therefore (1) results in a length of ` 6+4+2 = `r 6+4+2``, whereas (2) returns `1+1+1=`r 1+1+1``. 
## Summary The general form of *data.table* syntax is: ```{r eval = FALSE} DT[i, j, by] ``` We have seen so far that, #### Using `i`: {.bs-callout .bs-callout-info} * We can subset rows similar to a *data.frame* - except you don't have to use `DT$` repetitively since columns within the frame of a *data.table* are seen as if they are *variables*. * We can also sort a *data.table* using `order()`, which internally uses *data.table*'s fast order for performance. We can do much more in `i` by keying a *data.table*, which allows blazing fast subsets and joins. We will see this in the *"Keys and fast binary search based subsets"* and *"Joins and rolling joins"* vignette. #### Using `j`: {.bs-callout .bs-callout-info} 1. Select columns the *data.table* way: `DT[, .(colA, colB)]`. 2. Select columns the *data.frame* way: `DT[, c("colA", "colB"), with = FALSE]`. 3. Compute on columns: `DT[, .(sum(colA), mean(colB))]`. 4. Provide names if necessary: `DT[, .(sA =sum(colA), mB = mean(colB))]`. 5. Combine with `i`: `DT[colA > value, sum(colB)]`. # #### Using `by`: {.bs-callout .bs-callout-info} * Using `by`, we can group by columns by specifying a *list of columns* or a *character vector of column names* or even *expressions*. The flexibility of `j`, combined with `by` and `i` makes for a very powerful syntax. * `by` can handle multiple columns and also *expressions*. * We can `keyby` grouping columns to automatically sort the grouped result. * We can use `.SD` and `.SDcols` in `j` to operate on multiple columns using already familiar base functions. Here are some examples: 1. `DT[, lapply(.SD, fun), by = ..., .SDcols = ...]` - applies `fun` to all columns specified in `.SDcols` while grouping by the columns specified in `by`. 2. `DT[, head(.SD, 2), by = ...]` - return the first two rows for each group. 3. `DT[col > val, head(.SD, 1), by = ...]` - combine `i` along with `j` and `by`. 
# #### And remember the tip: {.bs-callout .bs-callout-warning} As long as `j` returns a *list*, each element of the list will become a column in the resulting *data.table*. # We will see how to *add/update/delete* columns *by reference* and how to combine them with `i` and `by` in the next vignette. *** data.table/inst/doc/datatable-reference-semantics.R0000644000175100001440000001066013172212366022003 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ----echo = FALSE--------------------------------------------------------------------------------- options(width = 100L) ## ------------------------------------------------------------------------------------------------- flights <- fread("flights14.csv") flights dim(flights) ## ------------------------------------------------------------------------------------------------- DF = data.frame(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18) DF ## ----eval = FALSE--------------------------------------------------------------------------------- # DF$c <- 18:13 # (1) -- replace entire column # # or # DF$c[DF$ID == "b"] <- 15:13 # (2) -- subassign in column 'c' ## ----eval = FALSE--------------------------------------------------------------------------------- # DT[, c("colA", "colB", ...) := list(valA, valB, ...)] # # # when you have only one column to assign to you # # can drop the quotes and list(), for convenience # DT[, colA := valA] ## ----eval = FALSE--------------------------------------------------------------------------------- # DT[, `:=`(colA = valA, # valA is assigned to colA # colB = valB, # valB is assigned to colB # ... 
# )] ## ------------------------------------------------------------------------------------------------- flights[, `:=`(speed = distance / (air_time/60), # speed in mph (mi/h) delay = arr_delay + dep_delay)] # delay in minutes head(flights) ## alternatively, using the 'LHS := RHS' form # flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)] ## ------------------------------------------------------------------------------------------------- # get all 'hours' in flights flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- # subassign by reference flights[hour == 24L, hour := 0L] ## ------------------------------------------------------------------------------------------------- flights[hour == 24L, hour := 0L][] ## ------------------------------------------------------------------------------------------------- # check again for '24' flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- flights[, c("delay") := NULL] head(flights) ## or using the functional form # flights[, `:=`(delay = NULL)] ## ----eval = FALSE--------------------------------------------------------------------------------- # flights[, delay := NULL] ## ------------------------------------------------------------------------------------------------- flights[, max_speed := max(speed), by = .(origin, dest)] head(flights) ## ------------------------------------------------------------------------------------------------- in_cols = c("dep_delay", "arr_delay") out_cols = c("max_dep_delay", "max_arr_delay") flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols] head(flights) ## ------------------------------------------------------------------------------------------------- # RHS gets automatically recycled to length of LHS flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL] 
head(flights) ## ------------------------------------------------------------------------------------------------- foo <- function(DT) { DT[, speed := distance / (air_time/60)] DT[, .(max_speed = max(speed)), by = month] } ans = foo(flights) head(flights) head(ans) ## ------------------------------------------------------------------------------------------------- flights[, speed := NULL] ## ------------------------------------------------------------------------------------------------- foo <- function(DT) { DT <- copy(DT) ## deep copy DT[, speed := distance / (air_time/60)] ## doesn't affect 'flights' DT[, .(max_speed = max(speed)), by = month] } ans <- foo(flights) head(flights) head(ans) ## ------------------------------------------------------------------------------------------------- DT = data.table(x = 1L, y = 2L) DT_n = names(DT) DT_n ## add a new column by reference DT[, z := 3L] ## DT_n also gets updated DT_n ## use `copy()` DT_n = copy(names(DT)) DT[, w := 4L] ## DT_n doesn't get updated DT_n data.table/inst/doc/datatable-faq.R0000644000175100001440000001027713172212361016627 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- library(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ------------------------------------------------------------------------ X = data.table(grp = c("a", "a", "b", "b", "b", "c", "c"), foo = 1:7) setkey(X, grp) Y = data.table(c("b", "c"), bar = c(4, 2)) X Y X[Y, sum(foo*bar)] X[Y, sum(foo*bar), by = .EACHI] ## ------------------------------------------------------------------------ DF = data.frame(x = 1:3, y = 4:6, z = 7:9) DF DF[ , c("y", "z")] ## ------------------------------------------------------------------------ DT = data.table(DF) DT[ , c(y, z)] ## ------------------------------------------------------------------------ DT[ , .(y, z)] ## ------------------------------------------------------------------------ 
data.table(NULL) data.frame(NULL) as.data.table(NULL) as.data.frame(NULL) is.null(data.table(NULL)) is.null(data.frame(NULL)) ## ------------------------------------------------------------------------ DT = data.table(a = 1:3, b = c(4, 5, 6), d = c(7L,8L,9L)) DT[0] sapply(DT[0], class) ## ------------------------------------------------------------------------ DT = data.table(x = rep(c("a", "b"), c(2, 3)), y = 1:5) DT DT[ , {z = sum(y); z + 3}, by = x] ## ------------------------------------------------------------------------ DT[ , { cat("Objects:", paste(objects(), collapse = ","), "\n") cat("Trace: x=", as.character(x), " y=", y, "\n") sum(y)}, by = x] ## ------------------------------------------------------------------------ DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x, sum(y)), by = x] DT[ , .(g = 1, h = 2, i = 3, j = 4, repeatgroupname = x[1], sum(y)), by = x] ## ------------------------------------------------------------------------ A = matrix(1:12, nrow = 4) A ## ------------------------------------------------------------------------ A[c(1, 3), c(2, 3)] ## ------------------------------------------------------------------------ B = cbind(c(1, 3), c(2, 3)) B A[B] ## ------------------------------------------------------------------------ rownames(A) = letters[1:4] colnames(A) = LETTERS[1:3] A B = cbind(c("a", "c"), c("B", "C")) A[B] ## ------------------------------------------------------------------------ A = data.frame(A = 1:4, B = letters[11:14], C = pi*1:4) rownames(A) = letters[1:4] A B A[B] ## ------------------------------------------------------------------------ B = data.frame(c("a", "c"), c("B", "C")) cat(try(A[B], silent = TRUE)) ## ---- eval = FALSE------------------------------------------------------- # DT[where, select|update, group by][order by][...] ... [...] 
## ------------------------------------------------------------------------ base::cbind.data.frame ## ------------------------------------------------------------------------ foo = data.frame(a = 1:3) cbind.data.frame = function(...) cat("Not printed\n") cbind(foo) rm("cbind.data.frame") ## ------------------------------------------------------------------------ DT = data.table(a = rep(1:3, 1:3), b = 1:6, c = 7:12) DT DT[ , { mySD = copy(.SD) mySD[1, b := 99L] mySD}, by = a] ## ------------------------------------------------------------------------ DT = data.table(a = c(1,1,2,2,2), b = c(1,2,2,2,1)) DT DT[ , list(.N = .N), list(a, b)] # show intermediate result for exposition cat(try( DT[ , list(.N = .N), by = list(a, b)][ , unique(.N), by = a] # compound query more typical , silent = TRUE)) ## ------------------------------------------------------------------------ if (packageVersion("data.table") >= "1.8.1") { DT[ , .N, by = list(a, b)][ , unique(N), by = a] } if (packageVersion("data.table") >= "1.9.3") { DT[ , .N, by = .(a, b)][ , unique(N), by = a] # same } ## ------------------------------------------------------------------------ DT = data.table(a = 1:5, b = 1:5) suppressWarnings( DT[2, b := 6] # works (slower) with warning ) class(6) # numeric not integer DT[2, b := 7L] # works (faster) without warning class(7L) # L makes it an integer DT[ , b := rnorm(5)] # 'replace' integer column with a numeric column data.table/inst/doc/datatable-keys-fast-subset.Rmd0000644000175100001440000004720013172210047021605 0ustar hornikusers--- title: "Keys and fast binary search based subset" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Keys and fast binary search based subset} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` This vignette is aimed at those 
who are already familiar with *data.table* syntax, its general form, how to subset rows in `i`, select and compute on columns, add/modify/delete columns *by reference* in `j` and group by using `by`. If you're not familiar with these concepts, please read the *"Introduction to data.table"* and *"Reference semantics"* vignettes first. *** ## Data {#data} We will use the same `flights` data as in the *"Introduction to data.table"* vignette. ```{r echo = FALSE} options(width = 100L) ``` ```{r} flights <- fread("flights14.csv") head(flights) dim(flights) ``` ## Introduction In this vignette, we will * first introduce the concept of `key` in *data.table*, and set and use keys to perform *fast binary search* based subsets in `i`, * see that we can combine key based subsets along with `j` and `by` in the exact same way as before, * look at other additional useful arguments - `mult` and `nomatch`, * and finally conclude by looking at the advantage of setting keys - perform *fast binary search based subsets* and compare with the traditional vector scan approach. ## 1. Keys ### a) What is a *key*? In the *"Introduction to data.table"* vignette, we saw how to subset rows in `i` using logical expressions, row numbers and using `order()`. In this section, we will look at another way of subsetting incredibly fast - using *keys*. But first, let's start by looking at *data.frames*. All *data.frames* have a row names attribute. Consider the *data.frame* `DF` below. ```{r} set.seed(1L) DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE), ID2 = sample(1:3, 10, TRUE), val = sample(10), stringsAsFactors = FALSE, row.names = sample(LETTERS[1:10])) DF rownames(DF) ``` We can *subset* a particular row using its row name as shown below: ```{r} DF["C", ] ``` i.e., row names are more or less *an index* to rows of a *data.frame*. However, 1. Each row is limited to *exactly one* row name. But, a person (for example) has at least two names - a *first* and a *second* name. 
It is useful to organise a telephone directory by *surname* then *first name*. 2. And row names should be *unique*. ```{r eval = FALSE} rownames(DF) = sample(LETTERS[1:5], 10, TRUE) # Warning: non-unique values when setting 'row.names': 'C', 'D' # Error in `row.names<-.data.frame`(`*tmp*`, value = value): duplicate 'row.names' are not allowed ``` Now let's convert it to a *data.table*. ```{r} DT = as.data.table(DF) DT rownames(DT) ``` * Note that row names have been reset. * *data.tables* never uses row names. Since *data.tables* **inherit** from *data.frames*, it still has the row names attribute. But it never uses them. We'll see in a moment as to why. If you would like to preserve the row names, use `keep.rownames = TRUE` in `as.data.table()` - this will create a new column called `rn` and assign row names to this column. Instead, in *data.tables* we set and use `keys`. Think of a `key` as **supercharged rownames**. #### Keys and their properties {.bs-callout .bs-callout-info #key-properties} 1. We can set keys on *multiple columns* and the column can be of *different types* -- *integer*, *numeric*, *character*, *factor*, *integer64* etc. *list* and *complex* types are not supported yet. 2. Uniqueness is not enforced, i.e., duplicate key values are allowed. Since rows are sorted by key, any duplicates in the key columns will appear consecutively. 3. Setting a `key` does *two* things: a. physically reorders the rows of the *data.table* by the column(s) provided *by reference*, always in *increasing* order. b. marks those columns as *key* columns by setting an attribute called `sorted` to the *data.table*. Since the rows are reordered, a *data.table* can have at most one key because it can not be sorted in more than one way. # For the rest of the vignette, we will work with `flights` data set. ### b) Set, get and use keys on a *data.table* #### -- How can we set the column `origin` as key in the *data.table* `flights`? 
```{r} setkey(flights, origin) head(flights) ## alternatively we can provide character vectors to the function 'setkeyv()' # setkeyv(flights, "origin") # useful to program with ``` #### {.bs-callout .bs-callout-info} * You can use the function `setkey()` and provide the column names (without quoting them). This is helpful during interactive use. * Alternatively you can pass a character vector of column names to the function `setkeyv()`. This is particularly useful while designing functions to pass columns to set key on as function arguments. * Note that we did not have to assign the result back to a variable. This is because like the `:=` function we saw in the *"Introduction to data.table"* vignette, `setkey()` and `setkeyv()` modify the input *data.table* *by reference*. They return the result invisibly. * The *data.table* is now reordered (or sorted) by the column we provided - `origin`. Since we reorder by reference, we only require additional memory of one column of length equal to the number of rows in the *data.table*, and is therefore very memory efficient. * You can also set keys directly when creating *data.tables* using the `data.table()` function using `key` argument. It takes a character vector of column names. #### set* and `:=`: {.bs-callout .bs-callout-info} In *data.table*, the `:=` operator and all the `set*` (e.g., `setkey`, `setorder`, `setnames` etc..) functions are the only ones which modify the input object *by reference*. # Once you *key* a *data.table* by certain columns, you can subset by querying those key columns using the `.()` notation in `i`. Recall that `.()` is an *alias to* `list()`. #### -- Use the key column `origin` to subset all rows where the origin airport matches *"JFK"* ```{r} flights[.("JFK")] ## alternatively # flights[J("JFK")] (or) # flights[list("JFK")] ``` #### {.bs-callout .bs-callout-info} * The *key* column has already been set to `origin`. So it is sufficient to provide the value, here *"JFK"*, directly. 
The `.()` syntax helps identify that the task requires looking up the value *"JFK"* in the key column of *data.table* (here column `origin` of `flights` *data.table*). * The *row indices* corresponding to the value *"JFK"* in `origin` are obtained first. And since there is no expression in `j`, all columns corresponding to those row indices are returned. * On single column key of *character* type, you can drop the `.()` notation and use the values directly when subsetting, like subset using row names on *data.frames*. ```{r eval = FALSE} flights["JFK"] ## same as flights[.("JFK")] ``` * We can subset any number of values as required ```{r eval = FALSE} flights[c("JFK", "LGA")] ## same as flights[.(c("JFK", "LGA"))] ``` This returns all columns corresponding to those rows where `origin` column matches either *"JFK"* or *"LGA"*. #### -- How can we get the column(s) a *data.table* is keyed by? Using the function `key()`. ```{r} key(flights) ``` #### {.bs-callout .bs-callout-info} * It returns a character vector of all the key columns. * If no key is set, it returns `NULL`. ### c) Keys and multiple columns To refresh, *keys* are like *supercharged* row names. We can set key on multiple columns and they can be of multiple types. #### -- How can I set keys on both `origin` *and* `dest` columns? ```{r} setkey(flights, origin, dest) head(flights) ## or alternatively # setkeyv(flights, c("origin", "dest")) # provide a character vector of column names key(flights) ``` #### {.bs-callout .bs-callout-info} * It sorts the *data.table* first by the column `origin` and then by `dest` *by reference*. #### -- Subset all rows using key columns where first key column `origin` matches *"JFK"* and second key column `dest` matches *"MIA"* ```{r} flights[.("JFK", "MIA")] ``` #### How does the subset work here? {.bs-callout .bs-callout-info #multiple-key-point} * It is important to understand how this works internally. *"JFK"* is first matched against the first key column `origin`. 
And *within those matching rows*, *"MIA"* is matched against the second key column `dest` to obtain *row indices* where both `origin` and `dest` match the given values. * Since no `j` is provided, we simply return *all columns* corresponding to those row indices. # #### -- Subset all rows where just the first key column `origin` matches *"JFK"* ```{r} key(flights) flights[.("JFK")] ## or in this case simply flights["JFK"], for convenience ``` #### {.bs-callout .bs-callout-info} * Since we did not provide any values for the second key column `dest`, it just matches *"JFK"* against the first key column `origin` and returns all the matched rows. #### -- Subset all rows where just the second key column `dest` matches *"MIA"* ```{r} flights[.(unique(origin), "MIA")] ``` #### What's happening here? {.bs-callout .bs-callout-info} * Read [this](#multiple-key-point) again. The value provided for the second key column *"MIA"* has to find the matching values in `dest` key column *on the matching rows provided by the first key column `origin`*. We can not skip the values of key columns *before*. Therefore we provide *all* unique values from key column `origin`. * *"MIA"* is automatically recycled to fit the length of `unique(origin)` which is *3*. ## 2) Combining keys with `j` and `by` All we have seen so far is the same concept -- obtaining *row indices* in `i`, but just using a different method -- using `keys`. It shouldn't be surprising that we can do exactly the same things in `j` and `by` as seen from the previous vignettes. We will highlight this with a few examples. ### a) Select in `j` #### -- Return `arr_delay` column as a *data.table* corresponding to `origin = "LGA"` and `dest = "TPA"`. ```{r} key(flights) flights[.("LGA", "TPA"), .(arr_delay)] ``` #### {.bs-callout .bs-callout-info} * The *row indices* corresponding to `origin == "LGA"` and `dest == "TPA"` are obtained using *key based subset*. 
* Once we have the row indices, we look at `j` which requires only the `arr_delay` column. So we simply select the column `arr_delay` for those *row indices* in the exact same way as we have seen in *Introduction to data.table* vignette. * We could have returned the result by using `with = FALSE` as well. ```{r eval = FALSE} flights[.("LGA", "TPA"), "arr_delay", with = FALSE] ``` ### b) Chaining #### -- On the result obtained above, use chaining to order the column in decreasing order. ```{r} flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)] ``` ### c) Compute or *do* in `j` #### -- Find the maximum arrival delay corresponding to `origin = "LGA"` and `dest = "TPA"`. ```{r} flights[.("LGA", "TPA"), max(arr_delay)] ``` #### {.bs-callout .bs-callout-info} * We can verify that the result is identical to the first value (486) from the previous example. ### d) *sub-assign* by reference using `:=` in `j` We have seen this example already in the *Reference semantics* vignette. Let's take a look at all the `hours` available in the `flights` *data.table*: ```{r} # get all 'hours' in flights flights[, sort(unique(hour))] ``` We see that there are `25` unique values in total in the data. Both *0* and *24* hours seem to be present. Let's go ahead and replace *24* with *0*, but this time using *key*. ```{r} setkey(flights, hour) key(flights) flights[.(24), hour := 0L] key(flights) ``` #### {.bs-callout .bs-callout-info} * We first set `key` to `hour`. This reorders `flights` by the column `hour` and marks that column as the `key` column. * Now we can subset on `hour` by using the `.()` notation. We subset for the value *24* and obtain the corresponding *row indices*. * And on those row indices, we replace the `key` column with the value `0`. * Since we have replaced values on the *key* column, the *data.table* `flights` isn't sorted by `hour` anymore. Therefore, the key has been automatically removed by setting to NULL. # Now, there shouldn't be any *24* in the `hour` column. 
```{r} flights[, sort(unique(hour))] ``` ### e) Aggregation using `by` Let's set the key back to `origin, dest` first. ```{r} setkey(flights, origin, dest) key(flights) ``` #### -- Get the maximum departure delay for each `month` corresponding to `origin = "JFK"`. Order the result by `month` ```{r} ans <- flights["JFK", max(dep_delay), keyby = month] head(ans) key(ans) ``` #### {.bs-callout .bs-callout-info} * We subset on the `key` column *origin* to obtain the *row indices* corresponding to *"JFK"*. * Once we obtain the row indices, we only need two columns - `month` to group by and `dep_delay` to obtain `max()` for each group. *data.table's* query optimisation therefore subsets just those two columns corresponding to the *row indices* obtained in `i`, for speed and memory efficiency. * And on that subset, we group by *month* and compute `max(dep_delay)`. * We use `keyby` to automatically key that result by *month*. Now we understand what that means. In addition to ordering, it also sets *month* as the `key` column. ## 3) Additional arguments - `mult` and `nomatch` ### a) The *mult* argument We can choose, for each query, if *"all"* the matching rows should be returned, or just the *"first"* or *"last"* using the `mult` argument. The default value is *"all"* - what we've seen so far. #### -- Subset only the first matching row from all rows where `origin` matches *"JFK"* and `dest` matches *"MIA"* ```{r} flights[.("JFK", "MIA"), mult = "first"] ``` #### -- Subset only the last matching row of all the rows where `origin` matches *"LGA", "JFK", "EWR"* and `dest` matches *"XNA"* ```{r} flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last"] ``` #### {.bs-callout .bs-callout-info} * The query *"JFK", "XNA"* doesn't match any rows in `flights` and therefore returns `NA`. * Once again, the query for second key column `dest`, *"XNA"*, is recycled to fit the length of the query for first key column `origin`, which is of length 3. 
### b) The *nomatch* argument We can choose if queries that do not match should return `NA` or be skipped altogether using the `nomatch` argument. #### -- From the previous example, Subset all rows only if there's a match ```{r} flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = 0L] ``` #### {.bs-callout .bs-callout-info} * Default value for `nomatch` is `NA`. Setting `nomatch = 0L` skips queries with no matches. * The query *"JFK", "XNA"* doesn't match any rows in `flights` and therefore is skipped. ## 4) binary search vs vector scans We have seen so far how we can set and use keys to subset. But what's the advantage? For example, instead of doing: ```{r eval = FALSE} # key by origin,dest columns flights[.("JFK", "MIA")] ``` we could have done: ```{r eval = FALSE} flights[origin == "JFK" & dest == "MIA"] ``` One advantage very likely is shorter syntax. But even more than that, *binary search based subsets* are **incredibly fast**. ### a) Performance of binary search approach To illustrate, let's create a sample *data.table* with 20 million rows and three columns and key it by columns `x` and `y`. ```{r} set.seed(2L) N = 2e7L DT = data.table(x = sample(letters, N, TRUE), y = sample(1000L, N, TRUE), val = runif(N), key = c("x", "y")) print(object.size(DT), units = "Mb") key(DT) ``` `DT` is ~380MB. It is not really huge, but this will do to illustrate the point. From what we have seen in the Introduction to data.table section, we can subset those rows where columns `x = "g"` and `y = 877` as follows: ```{r} ## (1) Usual way of subsetting - vector scan approach t1 <- system.time(ans1 <- DT[x == "g" & y == 877L]) t1 head(ans1) dim(ans1) ``` Now let's try to subset by using keys. ```{r} ## (2) Subsetting using keys t2 <- system.time(ans2 <- DT[.("g", 877L)]) t2 head(ans2) dim(ans2) identical(ans1$val, ans2$val) ``` * The speedup is **~`r round(t1[3]/max(t2[3], .001))`x**! ### b) Why does keying a *data.table* result in blazing fast subsets? 
To understand that, let's first look at what *vector scan approach* (method 1) does. #### Vector scan approach: {.bs-callout .bs-callout-info} * The column `x` is searched for the value *"g"* row by row, on all 20 million of them. This results in a *logical vector* of size 20 million, with values `TRUE, FALSE or NA` corresponding to `x`'s value. * Similarly, the column `y` is searched for `877` on all 20 million rows one by one, and stored in another logical vector. * Element wise `&` operations are performed on the intermediate logical vectors and all the rows where the expression evaluates to `TRUE` are returned. This is what we call a *vector scan approach*. And this is quite inefficient, especially on larger tables and when one needs repeated subsetting, because it has to scan through all the rows each time. # Now let us look at binary search approach (method 2). Recall from [Properties of key](#key-properties) - *setting keys reorders the data.table by key columns*. Since the data is sorted, we don't have to *scan through the entire length of the column*! We can instead use *binary search* to search a value in `O(log n)` as opposed to `O(n)` in case of *vector scan approach*, where `n` is the number of rows in the *data.table*. #### Binary search approach: {.bs-callout .bs-callout-info} Here's a very simple illustration. Let's consider the (sorted) numbers shown below: ```{r eval = FALSE} 1, 5, 10, 19, 22, 23, 30 ``` Suppose we'd like to find the matching position of the value *1*, using binary search, this is how we would proceed - because we know that the data is *sorted*. * Start with the middle value = 19. Is 1 == 19? No. 1 < 19. * Since the value we're looking for is smaller than 19, it should be somewhere before 19. So we can discard the rest of the half that are >= 19. * Our set is now reduced to *1, 5, 10*. Grab the middle value once again = 5. Is 1 == 5? No. 1 < 5. * Our set is reduced to *1*. Is 1 == 1? Yes. The corresponding index is also 1. 
And that's the only match. A vector scan approach on the other hand would have to scan through all the values (here, 7). # It can be seen that with every search we reduce the number of searches by half. This is why *binary search* based subsets are **incredibly fast**. Since rows of each column of *data.tables* have contiguous locations in memory, the operations are performed in a very cache efficient manner (also contributes to *speed*). In addition, since we obtain the matching row indices directly without having to create those huge logical vectors (equal to the number of rows in a *data.table*), it is quite **memory efficient** as well. ## Summary In this vignette, we have learnt another method to subset rows in `i` by keying a *data.table*. Setting keys allows us to perform blazing fast subsets by using *binary search*. In particular, we have seen how to #### {.bs-callout .bs-callout-info} * set key and subset using the key on a *data.table*. * subset using keys which fetches *row indices* in `i`, but much faster. * combine key based subsets with `j` and `by`. Note that the `j` and `by` operations are exactly the same as before. # Key based subsets are **incredibly fast** and are particularly useful when the task involves *repeated subsetting*. But it may not be always desirable to set key and physically reorder the *data.table*. In the next vignette, we will address this using a *new* feature -- *secondary indexes*. *** data.table/inst/doc/datatable-reshape.html0000644000175100001440000010254213172212366020254 0ustar hornikusers Data

This vignette discusses the default usage of reshaping functions melt (wide to long) and dcast (long to wide) for data.tables as well as the new extended functionalities of melting and casting on multiple columns available from v1.9.6.


Data

We will load the data sets directly within sections.

Introduction

The melt and dcast functions for data.tables are extensions of the corresponding functions from the reshape2 package.

In this vignette, we will

  1. first briefly look at the default melting and casting of data.tables to convert them from wide to long format and vice versa,

  2. then look at scenarios where the current functionalities become cumbersome and inefficient,

  3. and finally look at the new improvements to both melt and dcast methods for data.tables to handle multiple columns simultaneously.

The extended functionalities are in line with data.table's philosophy of performing operations efficiently and in a straightforward manner.

Note: {.bs-callout .bs-callout-info}

From v1.9.6 on, you don't have to load the reshape2 package to use these functions for data.tables. You just need to load data.table. If you have to load reshape2 for melting or casting matrices and/or data.frames, then make sure to load it before loading data.table.

1. Default functionality

a) melting data.tables (wide to long)

Suppose we have a data.table (artificial data) as shown below:

DT = fread("melt_default.csv")
DT
#    family_id age_mother dob_child1 dob_child2 dob_child3
# 1:         1         30 1998-11-26 2000-01-29         NA
# 2:         2         27 1996-06-22         NA         NA
# 3:         3         26 2002-07-11 2004-04-05 2007-09-02
# 4:         4         32 2004-10-10 2009-08-27 2012-07-21
# 5:         5         29 2000-12-05 2005-02-28         NA
## dob stands for date of birth.

str(DT)
# Classes 'data.table' and 'data.frame':    5 obs. of  5 variables:
#  $ family_id : int  1 2 3 4 5
#  $ age_mother: int  30 27 26 32 29
#  $ dob_child1: chr  "1998-11-26" "1996-06-22" "2002-07-11" "2004-10-10" ...
#  $ dob_child2: chr  "2000-01-29" NA "2004-04-05" "2009-08-27" ...
#  $ dob_child3: chr  NA NA "2007-09-02" "2012-07-21" ...
#  - attr(*, ".internal.selfref")=<externalptr>

#

- Convert DT to long form where each dob is a separate observation.

We could accomplish this using melt() by specifying id.vars and measure.vars arguments as follows:

DT.m1 = melt(DT, id.vars = c("family_id", "age_mother"),
                measure.vars = c("dob_child1", "dob_child2", "dob_child3"))
DT.m1
#     family_id age_mother   variable      value
#  1:         1         30 dob_child1 1998-11-26
#  2:         2         27 dob_child1 1996-06-22
#  3:         3         26 dob_child1 2002-07-11
#  4:         4         32 dob_child1 2004-10-10
#  5:         5         29 dob_child1 2000-12-05
#  6:         1         30 dob_child2 2000-01-29
#  7:         2         27 dob_child2         NA
#  8:         3         26 dob_child2 2004-04-05
#  9:         4         32 dob_child2 2009-08-27
# 10:         5         29 dob_child2 2005-02-28
# 11:         1         30 dob_child3         NA
# 12:         2         27 dob_child3         NA
# 13:         3         26 dob_child3 2007-09-02
# 14:         4         32 dob_child3 2012-07-21
# 15:         5         29 dob_child3         NA
str(DT.m1)
# Classes 'data.table' and 'data.frame':    15 obs. of  4 variables:
#  $ family_id : int  1 2 3 4 5 1 2 3 4 5 ...
#  $ age_mother: int  30 27 26 32 29 30 27 26 32 29 ...
#  $ variable  : Factor w/ 3 levels "dob_child1","dob_child2",..: 1 1 1 1 1 2 2 2 2 2 ...
#  $ value     : chr  "1998-11-26" "1996-06-22" "2002-07-11" "2004-10-10" ...
#  - attr(*, ".internal.selfref")=<externalptr>

{.bs-callout .bs-callout-info}

  • measure.vars specifies the set of columns we would like to collapse (or combine) together.

  • We can also specify column indices instead of names.

  • By default, the variable column is of type factor. Set the variable.factor argument to FALSE if you'd like to return a character vector instead. The variable.factor argument is only available in melt from data.table and not in the reshape2 package.

  • By default, the molten columns are automatically named variable and value.

  • melt preserves column attributes in result.

#

- Name the variable and value columns to child and dob respectively

DT.m1 = melt(DT, measure.vars = c("dob_child1", "dob_child2", "dob_child3"),
               variable.name = "child", value.name = "dob")
DT.m1
#     family_id age_mother      child        dob
#  1:         1         30 dob_child1 1998-11-26
#  2:         2         27 dob_child1 1996-06-22
#  3:         3         26 dob_child1 2002-07-11
#  4:         4         32 dob_child1 2004-10-10
#  5:         5         29 dob_child1 2000-12-05
#  6:         1         30 dob_child2 2000-01-29
#  7:         2         27 dob_child2         NA
#  8:         3         26 dob_child2 2004-04-05
#  9:         4         32 dob_child2 2009-08-27
# 10:         5         29 dob_child2 2005-02-28
# 11:         1         30 dob_child3         NA
# 12:         2         27 dob_child3         NA
# 13:         3         26 dob_child3 2007-09-02
# 14:         4         32 dob_child3 2012-07-21
# 15:         5         29 dob_child3         NA

{.bs-callout .bs-callout-info}

  • By default, when one of id.vars or measure.vars is missing, the rest of the columns are automatically assigned to the missing argument.

  • When neither id.vars nor measure.vars is specified, as mentioned under ?melt, all columns that are not of type numeric, integer or logical will be assigned to id.vars.

    In addition, a warning message is issued highlighting the columns that are automatically considered to be id.vars.

b) Casting data.tables (long to wide)

In the previous section, we saw how to get from wide form to long form. Let's see the reverse operation in this section.

- How can we get back to the original data table DT from DT.m1?

That is, we'd like to collect all child observations corresponding to each family_id, age_mother together under the same row. We can accomplish it using dcast as follows:

dcast(DT.m1, family_id + age_mother ~ child, value.var = "dob")
#    family_id age_mother dob_child1 dob_child2 dob_child3
# 1:         1         30 1998-11-26 2000-01-29         NA
# 2:         2         27 1996-06-22         NA         NA
# 3:         3         26 2002-07-11 2004-04-05 2007-09-02
# 4:         4         32 2004-10-10 2009-08-27 2012-07-21
# 5:         5         29 2000-12-05 2005-02-28         NA

{.bs-callout .bs-callout-info}

  • dcast uses a formula interface. The variables on the LHS of the formula represent the id vars and those on the RHS the measure vars.

  • value.var denotes the column whose values are used to fill the cells when casting to wide format.

  • dcast also tries to preserve attributes in result wherever possible.

#

- Starting from DT.m1, how can we get the number of children in each family?

You can also pass a function to aggregate by in dcast with the argument fun.aggregate. This is particularly essential when the formula provided does not identify a single observation for each cell.

dcast(DT.m1, family_id ~ ., fun.agg = function(x) sum(!is.na(x)), value.var = "dob")
#    family_id .
# 1:         1 2
# 2:         2 1
# 3:         3 3
# 4:         4 3
# 5:         5 2

Check ?dcast for other useful arguments and additional examples.

2. Limitations in current melt/dcast approaches

So far we've seen features of melt and dcast that are based on the reshape2 package, but implemented efficiently for data.tables, using internal data.table machinery (fast radix ordering, binary search etc.).

However, there are situations we might run into where the desired operation is not expressed in a straightforward manner. For example, consider the data.table shown below:

DT = fread("melt_enhanced.csv")
DT
#    family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
# 1:         1         30 1998-11-26 2000-01-29         NA             1             2            NA
# 2:         2         27 1996-06-22         NA         NA             2            NA            NA
# 3:         3         26 2002-07-11 2004-04-05 2007-09-02             2             2             1
# 4:         4         32 2004-10-10 2009-08-27 2012-07-21             1             1             1
# 5:         5         29 2000-12-05 2005-02-28         NA             2             1            NA
## 1 = female, 2 = male

And you'd like to combine (melt) all the dob columns together, and gender columns together. Using the current functionality, we can do something like this:

DT.m1 = melt(DT, id = c("family_id", "age_mother"))
# Warning in melt.data.table(DT, id = c("family_id", "age_mother")): 'measure.vars' [dob_child1,
# dob_child2, dob_child3, gender_child1, ...] are not all of the same type. By order of hierarchy, the
# molten data value column will be of type 'character'. All measure variables not of type 'character'
# will be coerced to. Check DETAILS in ?melt.data.table for more on coercion.
DT.m1[, c("variable", "child") := tstrsplit(variable, "_", fixed = TRUE)]
DT.c1 = dcast(DT.m1, family_id + age_mother + child ~ variable, value.var = "value")
DT.c1
#     family_id age_mother  child        dob gender
#  1:         1         30 child1 1998-11-26      1
#  2:         1         30 child2 2000-01-29      2
#  3:         1         30 child3         NA     NA
#  4:         2         27 child1 1996-06-22      2
#  5:         2         27 child2         NA     NA
#  6:         2         27 child3         NA     NA
#  7:         3         26 child1 2002-07-11      2
#  8:         3         26 child2 2004-04-05      2
#  9:         3         26 child3 2007-09-02      1
# 10:         4         32 child1 2004-10-10      1
# 11:         4         32 child2 2009-08-27      1
# 12:         4         32 child3 2012-07-21      1
# 13:         5         29 child1 2000-12-05      2
# 14:         5         29 child2 2005-02-28      1
# 15:         5         29 child3         NA     NA

str(DT.c1) ## gender column is character type now!
# Classes 'data.table' and 'data.frame':    15 obs. of  5 variables:
#  $ family_id : int  1 1 1 2 2 2 3 3 3 4 ...
#  $ age_mother: int  30 30 30 27 27 27 26 26 26 32 ...
#  $ child     : chr  "child1" "child2" "child3" "child1" ...
#  $ dob       : chr  "1998-11-26" "2000-01-29" NA "1996-06-22" ...
#  $ gender    : chr  "1" "2" NA "2" ...
#  - attr(*, ".internal.selfref")=<externalptr> 
#  - attr(*, "sorted")= chr  "family_id" "age_mother" "child"

Issues {.bs-callout .bs-callout-info}

  1. What we wanted to do was to combine all the dob and gender type columns together respectively. Instead we are combining everything together, and then splitting them again. I think it's easy to see that it's quite roundabout (and inefficient).

    As an analogy, imagine you've a closet with four shelves of clothes and you'd like to put together the clothes from shelves 1 and 2 together (in 1), and 3 and 4 together (in 3). What we are doing is more or less to combine all the clothes together, and then split them back on to shelves 1 and 3!

  2. The columns to melt may be of different types, as in this case (character and integer types). By melting them all together, the columns will be coerced in result, as explained by the warning message above and shown from output of str(DT.c1), where gender has been converted to character type.

  3. We are generating an additional column by splitting the variable column into two columns, whose purpose is quite cryptic. We do it because we need it for casting in the next step.

  4. Finally, we cast the data set. But the issue is it's a much more computationally involved operation than melt. Specifically, it requires computing the order of the variables in formula, and that's costly.

#

In fact, base::reshape is capable of performing this operation in a very straightforward manner. It is an extremely useful and often underrated function. You should definitely give it a try!

3. Enhanced (new) functionality

a) Enhanced melt

Since we'd like data.tables to perform this operation in a straightforward and efficient manner using the same interface, we went ahead and implemented additional functionality that lets us melt into multiple columns simultaneously.

- melt multiple columns simultaneously

The idea is quite simple. We pass a list of columns to measure.vars, where each element of the list contains the columns that should be combined together.

colA = paste("dob_child", 1:3, sep = "")
colB = paste("gender_child", 1:3, sep = "")
DT.m2 = melt(DT, measure = list(colA, colB), value.name = c("dob", "gender"))
DT.m2
#     family_id age_mother variable        dob gender
#  1:         1         30        1 1998-11-26      1
#  2:         2         27        1 1996-06-22      2
#  3:         3         26        1 2002-07-11      2
#  4:         4         32        1 2004-10-10      1
#  5:         5         29        1 2000-12-05      2
#  6:         1         30        2 2000-01-29      2
#  7:         2         27        2         NA     NA
#  8:         3         26        2 2004-04-05      2
#  9:         4         32        2 2009-08-27      1
# 10:         5         29        2 2005-02-28      1
# 11:         1         30        3         NA     NA
# 12:         2         27        3         NA     NA
# 13:         3         26        3 2007-09-02      1
# 14:         4         32        3 2012-07-21      1
# 15:         5         29        3         NA     NA

str(DT.m2) ## col type is preserved
# Classes 'data.table' and 'data.frame':    15 obs. of  5 variables:
#  $ family_id : int  1 2 3 4 5 1 2 3 4 5 ...
#  $ age_mother: int  30 27 26 32 29 30 27 26 32 29 ...
#  $ variable  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 2 2 2 2 2 ...
#  $ dob       : chr  "1998-11-26" "1996-06-22" "2002-07-11" "2004-10-10" ...
#  $ gender    : int  1 2 2 1 2 2 NA 2 1 1 ...
#  - attr(*, ".internal.selfref")=<externalptr>

- Using patterns()

Usually in these problems, the columns we'd like to melt can be distinguished by a common pattern. We can use the function patterns(), implemented for convenience, to provide regular expressions for the columns to be combined together. The above operation can be rewritten as:

DT.m2 = melt(DT, measure = patterns("^dob", "^gender"), value.name = c("dob", "gender"))
DT.m2
#     family_id age_mother variable        dob gender
#  1:         1         30        1 1998-11-26      1
#  2:         2         27        1 1996-06-22      2
#  3:         3         26        1 2002-07-11      2
#  4:         4         32        1 2004-10-10      1
#  5:         5         29        1 2000-12-05      2
#  6:         1         30        2 2000-01-29      2
#  7:         2         27        2         NA     NA
#  8:         3         26        2 2004-04-05      2
#  9:         4         32        2 2009-08-27      1
# 10:         5         29        2 2005-02-28      1
# 11:         1         30        3         NA     NA
# 12:         2         27        3         NA     NA
# 13:         3         26        3 2007-09-02      1
# 14:         4         32        3 2012-07-21      1
# 15:         5         29        3         NA     NA

That's it!

{.bs-callout .bs-callout-info}

  • We can remove the variable column if necessary.

  • The functionality is implemented entirely in C, and is therefore both fast and memory efficient in addition to being straightforward.

b) Enhanced dcast

Okay great! We can now melt into multiple columns simultaneously. Now given the data set DT.m2 as shown above, how can we get back to the same format as the original data we started with?

If we use the current functionality of dcast, then we'd have to cast twice and bind the results together. But that's once again verbose, not straightforward and is also inefficient.

- Casting multiple value.vars simultaneously

We can now provide multiple value.var columns to dcast for data.tables directly so that the operations are taken care of internally and efficiently.

## new 'cast' functionality - multiple value.vars
DT.c2 = dcast(DT.m2, family_id + age_mother ~ variable, value.var = c("dob", "gender"))
DT.c2
#    family_id age_mother      dob_1      dob_2      dob_3 gender_1 gender_2 gender_3
# 1:         1         30 1998-11-26 2000-01-29         NA        1        2       NA
# 2:         2         27 1996-06-22         NA         NA        2       NA       NA
# 3:         3         26 2002-07-11 2004-04-05 2007-09-02        2        2        1
# 4:         4         32 2004-10-10 2009-08-27 2012-07-21        1        1        1
# 5:         5         29 2000-12-05 2005-02-28         NA        2        1       NA

{.bs-callout .bs-callout-info}

  • Attributes are preserved in result wherever possible.

  • Everything is taken care of internally, and efficiently. In addition to being fast, it is also very memory efficient.

#

Multiple functions to fun.aggregate: {.bs-callout .bs-callout-info}

You can also provide multiple functions to fun.aggregate to dcast for data.tables. Check the examples in ?dcast, which illustrate this functionality.

#


data.table/inst/doc/datatable-reference-semantics.html0000644000175100001440000011554013172212366022551 0ustar hornikusers Data {#data}

This vignette discusses data.table's reference semantics, which allow us to add/update/delete columns of a data.table by reference, and also to combine these operations with i and by. It is aimed at those who are already familiar with data.table syntax, its general form, how to subset rows in i, select and compute on columns, and perform aggregations by group. If you're not familiar with these concepts, please read the “Introduction to data.table” vignette first.


Data {#data}

We will use the same flights data as in the “Introduction to data.table” vignette.

flights <- fread("flights14.csv")
flights
#         year month day dep_delay arr_delay carrier origin dest air_time distance hour
#      1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
#      2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
#      3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
#      4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
#      5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
#     ---                                                                              
# 253312: 2014    10  31         1       -30      UA    LGA  IAH      201     1416   14
# 253313: 2014    10  31        -5       -14      UA    EWR  IAH      189     1400    8
# 253314: 2014    10  31        -8        16      MQ    LGA  RDU       83      431   11
# 253315: 2014    10  31        -4        15      MQ    LGA  DTW       75      502   11
# 253316: 2014    10  31        -5         1      MQ    LGA  SDF      110      659    8
dim(flights)
# [1] 253316     11

Introduction

In this vignette, we will

  1. first discuss reference semantics briefly and look at the two different forms in which the := operator can be used

  2. then see how we can add/update/delete columns by reference in j using the := operator and how to combine with i and by.

  3. and finally we will look at using := for its side-effect and how we can avoid the side effects using copy().

1. Reference semantics

All the operations we have seen so far in the previous vignette resulted in a new data set. We will see how to add new column(s), update or delete existing column(s) on the original data.

a) Background

Before we look at reference semantics, consider the data.frame shown below:

DF = data.frame(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18)
DF
#   ID a  b  c
# 1  b 1  7 13
# 2  b 2  8 14
# 3  b 3  9 15
# 4  a 4 10 16
# 5  a 5 11 17
# 6  c 6 12 18

When we did:

DF$c <- 18:13               # (1) -- replace entire column
# or
DF$c[DF$ID == "b"] <- 15:13 # (2) -- subassign in column 'c'

both (1) and (2) resulted in a deep copy of the entire data.frame in versions of R < 3.1. It was copied more than once. To improve performance by avoiding these redundant copies, data.table utilised the available but unused := operator in R.

Great performance improvements were made in R v3.1 as a result of which only a shallow copy is made for (1) and not deep copy. However, for (2) still, the entire column is deep copied even in R v3.1+. This means the more columns one subassigns to in the same query, the more deep copies R does.

shallow vs deep copy {.bs-callout .bs-callout-info}

A shallow copy is just a copy of the vector of column pointers (corresponding to the columns in a data.frame or data.table). The actual data is not physically copied in memory.

A deep copy on the other hand copies the entire data to another location in memory.

#

With data.table's := operator, absolutely no copies are made in both (1) and (2), irrespective of the R version you are using. This is because the := operator updates data.table columns in-place (by reference).

b) The := operator

It can be used in j in two ways:

(a) The LHS := RHS form

```r
DT[, c("colA", "colB", ...) := list(valA, valB, ...)]

# when you have only one column to assign to you
# can drop the quotes and list(), for convenience
DT[, colA := valA]
```

(b) The functional form

```r
DT[, `:=`(colA = valA, # valA is assigned to colA
          colB = valB, # valB is assigned to colB
          ...
)]
```

{.bs-callout .bs-callout-warning}

Note that the code above explains how := can be used. They are not working examples. We will start using them on flights data.table from the next section.

#

{.bs-callout .bs-callout-info}

  • In (a), LHS takes a character vector of column names and RHS a list of values. RHS just needs to be a list, irrespective of how it's generated (e.g., using lapply(), list(), mget(), mapply() etc.). This form is usually easy to program with and is particularly useful when you don't know the columns to assign values to in advance.

  • On the other hand, (b) is handy if you would like to jot some comments down for later.

  • The result is returned invisibly.

  • Since := is available in j, we can combine it with i and by operations just like the aggregation operations we saw in the previous vignette.

#

In the two forms of := shown above, note that we don't assign the result back to a variable. Because we don't need to. The input data.table is modified by reference. Let's go through examples to understand what we mean by this.

For the rest of the vignette, we will work with flights data.table.

2. Add/update/delete columns by reference

a) Add columns by reference {#ref-j}

– How can we add columns speed and total delay of each flight to flights data.table?

flights[, `:=`(speed = distance / (air_time/60), # speed in mph (mi/h)
               delay = arr_delay + dep_delay)]   # delay in minutes
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed delay
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490    27
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909    10
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769    11
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414   -34
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857     3
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18 434.3363     4

## alternatively, using the 'LHS := RHS' form
# flights[, c("speed", "delay") := list(distance/(air_time/60), arr_delay + dep_delay)]

Note that {.bs-callout .bs-callout-info}

  • We did not have to assign the result back to flights.

  • The flights data.table now contains the two newly added columns. This is what we mean by added by reference.

  • We used the functional form so that we could add comments on the side to explain what the computation does. You can also see the LHS := RHS form (commented).

b) Update some rows of columns by reference - sub-assign by reference {#ref-i-j}

Let's take a look at all the hours available in the flights data.table:

# get all 'hours' in flights
flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

We see that there are a total of 25 unique values in the data. Both 0 and 24 hours seem to be present. Let's go ahead and replace 24 with 0.

– Replace those rows where hour == 24 with the value 0

# subassign by reference
flights[hour == 24L, hour := 0L]

{.bs-callout .bs-callout-info}

  • We can use i along with := in j the very same way as we have already seen in the “Introduction to data.table” vignette.

  • Column hour is replaced with 0 only on those row indices where the condition hour == 24L specified in i evaluates to TRUE.

  • := returns the result invisibly. Sometimes it might be necessary to see the result after the assignment. We can accomplish that by adding an empty [] at the end of the query as shown below:

    flights[hour == 24L, hour := 0L][]
    #         year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed
    #      1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490
    #      2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909
    #      3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769
    #      4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414
    #      5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857
    #     ---                                                                                       
    # 253312: 2014    10  31         1       -30      UA    LGA  IAH      201     1416   14 422.6866
    # 253313: 2014    10  31        -5       -14      UA    EWR  IAH      189     1400    8 444.4444
    # 253314: 2014    10  31        -8        16      MQ    LGA  RDU       83      431   11 311.5663
    # 253315: 2014    10  31        -4        15      MQ    LGA  DTW       75      502   11 401.6000
    # 253316: 2014    10  31        -5         1      MQ    LGA  SDF      110      659    8 359.4545
    #         delay
    #      1:    27
    #      2:    10
    #      3:    11
    #      4:   -34
    #      5:     3
    #     ---      
    # 253312:   -29
    # 253313:   -19
    # 253314:     8
    # 253315:    11
    # 253316:    -4
    

#

Let's look at all the hours to verify.

# check again for '24'
flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

Exercise: {.bs-callout .bs-callout-warning #update-by-reference-question}

What is the difference between flights[hour == 24L, hour := 0L] and flights[hour == 24L][, hour := 0L]? Hint: The latter needs an assignment (<-) if you would want to use the result later.

If you can't figure it out, have a look at the Note section of ?":=".

c) Delete column by reference

– Remove delay column

flights[, c("delay") := NULL]
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18 434.3363

## or using the functional form
# flights[, `:=`(delay = NULL)]

{.bs-callout .bs-callout-info #delete-convenience}

  • Assigning NULL to a column deletes that column. And it happens instantly.

  • We can also pass column numbers instead of names in the LHS, although it is good programming practice to use column names.

  • When there is just one column to delete, we can drop the c() and double quotes and just use the column name unquoted, for convenience. That is:

    flights[, delay := NULL]
    

    is equivalent to the code above.

d) := along with grouping using by {#ref-j-by}

We have already seen the use of i along with := in Section 2b. Let's now see how we can use := along with by.

– How can we add a new column which contains the maximum speed for each origin, dest pair?

flights[, max_speed := max(speed), by = .(origin, dest)]
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed max_speed
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490  526.5957
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909  526.5957
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769  526.5957
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414  517.5000
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857  526.5957
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18 434.3363  518.4507

{.bs-callout .bs-callout-info}

  • We add a new column max_speed using the := operator by reference.

  • We provide the columns to group by the same way as shown in the Introduction to data.table vignette. For each group, max(speed) is computed, which returns a single value. That value is recycled to fit the length of the group. Once again, no copies are being made at all. flights data.table is modified in-place.

  • We could have also provided by with a character vector as we saw in the Introduction to data.table vignette, e.g., by = c("origin", "dest").

#

e) Multiple columns and :=

– How can we add two more columns computing max() of dep_delay and arr_delay for each month, using .SD?

in_cols  = c("dep_delay", "arr_delay")
out_cols = c("max_dep_delay", "max_arr_delay")
flights[, c(out_cols) := lapply(.SD, max), by = month, .SDcols = in_cols]
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed max_speed
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490  526.5957
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909  526.5957
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769  526.5957
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414  517.5000
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857  526.5957
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18 434.3363  518.4507
#    max_dep_delay max_arr_delay
# 1:           973           996
# 2:           973           996
# 3:           973           996
# 4:           973           996
# 5:           973           996
# 6:           973           996

{.bs-callout .bs-callout-info}

  • We use the LHS := RHS form. We store the input column names and the new columns to add in separate variables and provide them to .SDcols and for LHS (for better readability).

  • Note that since we allow assignment by reference without quoting column names when there is only one column as explained in Section 2c, we cannot do out_cols := lapply(.SD, max). That would result in adding one new column named out_cols. Instead we should do either c(out_cols) or simply (out_cols). Wrapping the variable name with ( is enough to differentiate between the two cases.

  • The LHS := RHS form allows us to operate on multiple columns. In the RHS, to compute the max on columns specified in .SDcols, we make use of the base function lapply() along with .SD in the same way as we have seen before in the “Introduction to data.table” vignette. It returns a list of two elements, containing the maximum value corresponding to dep_delay and arr_delay for each group.

#

Before moving on to the next section, let's clean up the newly created columns speed, max_speed, max_dep_delay and max_arr_delay.

# RHS gets automatically recycled to length of LHS
flights[, c("speed", "max_speed", "max_dep_delay", "max_arr_delay") := NULL]
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18

3. := and copy()

:= modifies the input object by reference. Apart from the features we have discussed already, sometimes we might want to use the update by reference feature for its side effect. And at other times it may not be desirable to modify the original object, in which case we can use copy() function, as we will see in a moment.

a) := for its side effect

Let's say we would like to create a function that would return the maximum speed for each month. But at the same time, we would also like to add the column speed to flights. We could write a simple function as follows:

foo <- function(DT) {
  DT[, speed := distance / (air_time/60)]
  DT[, .(max_speed = max(speed)), by = month]
}
ans = foo(flights)
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour    speed
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9 413.6490
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11 409.0909
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19 423.0769
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7 395.5414
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13 424.2857
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18 434.3363
head(ans)
#    month max_speed
# 1:     1  535.6425
# 2:     2  535.6425
# 3:     3  549.0756
# 4:     4  585.6000
# 5:     5  544.2857
# 6:     6  608.5714

{.bs-callout .bs-callout-info}

  • Note that the new column speed has been added to flights data.table. This is because := performs operations by reference. Since DT (the function argument) and flights refer to the same object in memory, modifying DT also reflects on flights.

  • And ans contains the maximum speed for each month.

b) The copy() function

In the previous section, we used := for its side effect. But of course this may not be always desirable. Sometimes, we would like to pass a data.table object to a function, and might want to use the := operator, but wouldn't want to update the original object. We can accomplish this using the function copy().

{.bs-callout .bs-callout-info}

The copy() function deep copies the input object and therefore any subsequent update by reference operations performed on the copied object will not affect the original object.

#

There are two particular places where copy() function is essential:

  1. Contrary to the situation we have seen in the previous point, we may not want the input data.table to a function to be modified by reference. As an example, let's consider the task in the previous section, except we don't want to modify flights by reference.

    Let's first delete the speed column we generated in the previous section.

    flights[, speed := NULL]
    

    Now, we could accomplish the task as follows:

    foo <- function(DT) {
      DT <- copy(DT)                              ## deep copy
      DT[, speed := distance / (air_time/60)]     ## doesn't affect 'flights'
      DT[, .(max_speed = max(speed)), by = month]
    }
    ans <- foo(flights)
    head(flights)
    #    year month day dep_delay arr_delay carrier origin dest air_time distance hour
    # 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
    # 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
    # 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
    # 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
    # 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
    # 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18
    head(ans)
    #    month max_speed
    # 1:     1  535.6425
    # 2:     2  535.6425
    # 3:     3  549.0756
    # 4:     4  585.6000
    # 5:     5  544.2857
    # 6:     6  608.5714
    

{.bs-callout .bs-callout-info}

  • Using copy() function did not update flights data.table by reference. It doesn't contain the column speed.

  • And ans contains the maximum speed corresponding to each month.

However, we could improve this functionality further by shallow copying instead of deep copying. In fact, we would very much like to provide this functionality for v1.9.8. We will touch upon this again in the data.table design vignette.

#

  2. When we store the column names in a variable, e.g., DT_n = names(DT), and then add/update/delete column(s) by reference, DT_n is modified as well, unless we do copy(names(DT)).

    DT = data.table(x = 1L, y = 2L)
    DT_n = names(DT)
    DT_n
    # [1] "x" "y"
    
    ## add a new column by reference
    DT[, z := 3L]
    
    ## DT_n also gets updated
    DT_n
    # [1] "x" "y" "z"
    
    ## use `copy()`
    DT_n = copy(names(DT))
    DT[, w := 4L]
    
    ## DT_n doesn't get updated
    DT_n
    # [1] "x" "y" "z"
    

Summary

The := operator {.bs-callout .bs-callout-info}

  • It is used to add/update/delete columns by reference.

  • We have also seen how to use := along with i and by, in the same way as we have seen in the Introduction to data.table vignette. Likewise, we can use keyby, chain operations together, and pass expressions to by. The syntax is consistent.

  • We can use := for its side effect or use copy() to not modify the original object while updating by reference.

#

So far we have seen a whole lot in j, and how to combine it with by and little of i. Let's turn our attention back to i in the next vignette “Keys and fast binary search based subset” to perform blazing fast subsets by keying data.tables.


data.table/inst/doc/datatable-keys-fast-subset.html0000644000175100001440000013562413172212366022045 0ustar hornikusers Data {#data}

This vignette is aimed at those who are already familiar with data.table syntax, its general form, how to subset rows in i, select and compute on columns, add/modify/delete columns by reference in j and group by using by. If you're not familiar with these concepts, please read the “Introduction to data.table” and “Reference semantics” vignettes first.


Data {#data}

We will use the same flights data as in the “Introduction to data.table” vignette.

flights <- fread("flights14.csv")
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
# 2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
# 3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
# 4: 2014     1   1        -8       -26      AA    LGA  PBI      157     1035    7
# 5: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
# 6: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18
dim(flights)
# [1] 253316     11

Introduction

In this vignette, we will

  • first introduce the concept of key in data.table, and set and use keys to perform fast binary search based subsets in i,

  • see that we can combine key based subsets along with j and by in the exact same way as before,

  • look at other additional useful arguments - mult and nomatch,

  • and finally conclude by looking at the advantage of setting keys - perform fast binary search based subsets and compare with the traditional vector scan approach.

1. Keys

a) What is a key?

In the “Introduction to data.table” vignette, we saw how to subset rows in i using logical expressions, row numbers and using order(). In this section, we will look at another way of subsetting incredibly fast - using keys.

But first, let's start by looking at data.frames. All data.frames have a row names attribute. Consider the data.frame DF below.

set.seed(1L)
DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE),
                ID2 = sample(1:3, 10, TRUE),
                val = sample(10),
                stringsAsFactors = FALSE,
                row.names = sample(LETTERS[1:10]))
DF
#   ID1 ID2 val
# C   a   3   5
# D   a   1   6
# E   b   2   4
# G   a   1   2
# B   b   1  10
# H   a   2   8
# I   b   1   9
# F   b   2   1
# J   a   3   7
# A   b   2   3

rownames(DF)
#  [1] "C" "D" "E" "G" "B" "H" "I" "F" "J" "A"

We can subset a particular row using its row name as shown below:

DF["C", ]
#   ID1 ID2 val
# C   a   3   5

i.e., row names are more or less an index to rows of a data.frame. However,

  1. Each row is limited to exactly one row name.

    But, a person (for example) has at least two names - a first and a second name. It is useful to organise a telephone directory by surname then first name.

  2. And row names should be unique.

    rownames(DF) = sample(LETTERS[1:5], 10, TRUE)
    # Warning: non-unique values when setting 'row.names': 'C', 'D'
    # Error in `row.names<-.data.frame`(`*tmp*`, value = value): duplicate 'row.names' are not allowed
    

Now let's convert it to a data.table.

DT = as.data.table(DF)
DT
#     ID1 ID2 val
#  1:   a   3   5
#  2:   a   1   6
#  3:   b   2   4
#  4:   a   1   2
#  5:   b   1  10
#  6:   a   2   8
#  7:   b   1   9
#  8:   b   2   1
#  9:   a   3   7
# 10:   b   2   3

rownames(DT)
#  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
  • Note that row names have been reset.

  • data.tables never use row names. Since data.tables inherit from data.frames, they still have the row names attribute. But they never use it. We'll see in a moment as to why.

    If you would like to preserve the row names, use keep.rownames = TRUE in as.data.table() - this will create a new column called rn and assign row names to this column.

Instead, in data.tables we set and use keys. Think of a key as supercharged rownames.

Keys and their properties {.bs-callout .bs-callout-info #key-properties}

  1. We can set keys on multiple columns and the columns can be of different types – integer, numeric, character, factor, integer64 etc. list and complex types are not supported yet.

  2. Uniqueness is not enforced, i.e., duplicate key values are allowed. Since rows are sorted by key, any duplicates in the key columns will appear consecutively.

  3. Setting a key does two things:

    a. physically reorders the rows of the data.table by the column(s) provided by reference, always in increasing order.

    b. marks those columns as key columns by setting an attribute called sorted to the data.table.

    Since the rows are reordered, a data.table can have at most one key because it can not be sorted in more than one way.

#

For the rest of the vignette, we will work with flights data set.

b) Set, get and use keys on a data.table

– How can we set the column origin as key in the data.table flights?

setkey(flights, origin)
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1         4         0      AA    EWR  LAX      339     2454   18
# 2: 2014     1   1        -5       -17      AA    EWR  MIA      161     1085   16
# 3: 2014     1   1       191       185      AA    EWR  DFW      214     1372   16
# 4: 2014     1   1        -1        -2      AA    EWR  DFW      214     1372   14
# 5: 2014     1   1        -3       -10      AA    EWR  MIA      154     1085    6
# 6: 2014     1   1         4       -17      AA    EWR  DFW      215     1372    9

## alternatively we can provide character vectors to the function 'setkeyv()'
# setkeyv(flights, "origin") # useful to program with

{.bs-callout .bs-callout-info}

  • You can use the function setkey() and provide the column names (without quoting them). This is helpful during interactive use.

  • Alternatively you can pass a character vector of column names to the function setkeyv(). This is particularly useful while designing functions to pass columns to set key on as function arguments.

  • Note that we did not have to assign the result back to a variable. This is because, like the := function we saw in the “Reference semantics” vignette, setkey() and setkeyv() modify the input data.table by reference. They return the result invisibly.

  • The data.table is now reordered (or sorted) by the column we provided - origin. Since we reorder by reference, we only require additional memory of one column of length equal to the number of rows in the data.table, and the operation is therefore very memory efficient.

  • You can also set keys directly when creating data.tables using the data.table() function using key argument. It takes a character vector of column names.

set* and :=: {.bs-callout .bs-callout-info}

In data.table, the := operator and all the set* (e.g., setkey, setorder, setnames, etc.) functions are the only ones which modify the input object by reference.

#

Once you key a data.table by certain columns, you can subset by querying those key columns using the .() notation in i. Recall that .() is an alias to list().

– Use the key column origin to subset all rows where the origin airport matches “JFK”

flights[.("JFK")]
#        year month day dep_delay arr_delay carrier origin dest air_time distance hour
#     1: 2014     1   1        14        13      AA    JFK  LAX      359     2475    9
#     2: 2014     1   1        -3        13      AA    JFK  LAX      363     2475   11
#     3: 2014     1   1         2         9      AA    JFK  LAX      351     2475   19
#     4: 2014     1   1         2         1      AA    JFK  LAX      350     2475   13
#     5: 2014     1   1        -2       -18      AA    JFK  LAX      338     2475   21
#    ---                                                                              
# 81479: 2014    10  31        -4       -21      UA    JFK  SFO      337     2586   17
# 81480: 2014    10  31        -2       -37      UA    JFK  SFO      344     2586   18
# 81481: 2014    10  31         0       -33      UA    JFK  LAX      320     2475   17
# 81482: 2014    10  31        -6       -38      UA    JFK  SFO      343     2586    9
# 81483: 2014    10  31        -6       -38      UA    JFK  LAX      323     2475   11

## alternatively
# flights[J("JFK")] (or) 
# flights[list("JFK")]

{.bs-callout .bs-callout-info}

  • The key column has already been set to origin. So it is sufficient to provide the value, here “JFK”, directly. The .() syntax helps identify that the task requires looking up the value “JFK” in the key column of data.table (here column origin of flights data.table).

  • The row indices corresponding to the value “JFK” in origin are obtained first. And since there is no expression in j, all columns corresponding to those row indices are returned.

  • On single column key of character type, you can drop the .() notation and use the values directly when subsetting, like subset using row names on data.frames.

    flights["JFK"]              ## same as flights[.("JFK")]
    
  • We can subset any number of values as required

    flights[c("JFK", "LGA")]    ## same as flights[.(c("JFK", "LGA"))]
    

    This returns all columns corresponding to those rows where origin column matches either “JFK” or “LGA”.

– How can we get the column(s) a data.table is keyed by?

Using the function key().

key(flights)
# [1] "origin"

{.bs-callout .bs-callout-info}

  • It returns a character vector of all the key columns.

  • If no key is set, it returns NULL.

c) Keys and multiple columns

To refresh, keys are like supercharged row names. We can set key on multiple columns and they can be of multiple types.

– How can I set keys on both origin and dest columns?

setkey(flights, origin, dest)
head(flights)
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   2        -2       -25      EV    EWR  ALB       30      143    7
# 2: 2014     1   3        88        79      EV    EWR  ALB       29      143   23
# 3: 2014     1   4       220       211      EV    EWR  ALB       32      143   15
# 4: 2014     1   4        35        19      EV    EWR  ALB       32      143    7
# 5: 2014     1   5        47        42      EV    EWR  ALB       26      143    8
# 6: 2014     1   5        66        62      EV    EWR  ALB       31      143   23

## or alternatively
# setkeyv(flights, c("origin", "dest")) # provide a character vector of column names

key(flights)
# [1] "origin" "dest"

{.bs-callout .bs-callout-info}

  • It sorts the data.table first by the column origin and then by dest by reference.

– Subset all rows using key columns where first key column origin matches “JFK” and second key column dest matches “MIA”

flights[.("JFK", "MIA")]
#       year month day dep_delay arr_delay carrier origin dest air_time distance hour
#    1: 2014     1   1        -1       -17      AA    JFK  MIA      161     1089   15
#    2: 2014     1   1         7        -8      AA    JFK  MIA      166     1089    9
#    3: 2014     1   1         2        -1      AA    JFK  MIA      164     1089   12
#    4: 2014     1   1         6         3      AA    JFK  MIA      157     1089    5
#    5: 2014     1   1         6       -12      AA    JFK  MIA      154     1089   17
#   ---                                                                              
# 2746: 2014    10  31        -1       -22      AA    JFK  MIA      148     1089   16
# 2747: 2014    10  31        -3       -20      AA    JFK  MIA      146     1089    8
# 2748: 2014    10  31         2       -17      AA    JFK  MIA      150     1089    6
# 2749: 2014    10  31        -3       -12      AA    JFK  MIA      150     1089    5
# 2750: 2014    10  31        29         4      AA    JFK  MIA      146     1089   19

How does the subset work here? {.bs-callout .bs-callout-info #multiple-key-point}

  • It is important to understand how this works internally. “JFK” is first matched against the first key column origin. And within those matching rows, “MIA” is matched against the second key column dest to obtain row indices where both origin and dest match the given values.

  • Since no j is provided, we simply return all columns corresponding to those row indices.

#

– Subset all rows where just the first key column origin matches “JFK”

key(flights)
# [1] "origin" "dest"

flights[.("JFK")] ## or in this case simply flights["JFK"], for convenience
#        year month day dep_delay arr_delay carrier origin dest air_time distance hour
#     1: 2014     1   1        10         4      B6    JFK  ABQ      280     1826   20
#     2: 2014     1   2       134       161      B6    JFK  ABQ      252     1826   22
#     3: 2014     1   7         6         6      B6    JFK  ABQ      269     1826   20
#     4: 2014     1   8        15       -15      B6    JFK  ABQ      259     1826   20
#     5: 2014     1   9        45        32      B6    JFK  ABQ      267     1826   20
#    ---                                                                              
# 81479: 2014    10  31         0       -18      DL    JFK  TPA      142     1005    8
# 81480: 2014    10  31         1        -8      B6    JFK  TPA      149     1005   19
# 81481: 2014    10  31        -2       -22      B6    JFK  TPA      145     1005   14
# 81482: 2014    10  31        -8        -5      B6    JFK  TPA      149     1005    9
# 81483: 2014    10  31        -4       -18      B6    JFK  TPA      145     1005    8

{.bs-callout .bs-callout-info}

  • Since we did not provide any values for the second key column dest, it just matches “JFK” against the first key column origin and returns all the matched rows.

– Subset all rows where just the second key column dest matches “MIA”

flights[.(unique(origin), "MIA")]
#       year month day dep_delay arr_delay carrier origin dest air_time distance hour
#    1: 2014     1   1        -5       -17      AA    EWR  MIA      161     1085   16
#    2: 2014     1   1        -3       -10      AA    EWR  MIA      154     1085    6
#    3: 2014     1   1        -5        -8      AA    EWR  MIA      157     1085   11
#    4: 2014     1   1        43        42      UA    EWR  MIA      155     1085   15
#    5: 2014     1   1        60        49      UA    EWR  MIA      162     1085   21
#   ---                                                                              
# 9924: 2014    10  31       -11        -8      AA    LGA  MIA      157     1096   13
# 9925: 2014    10  31        -5       -11      AA    LGA  MIA      150     1096    9
# 9926: 2014    10  31        -2        10      AA    LGA  MIA      156     1096    6
# 9927: 2014    10  31        -2       -16      AA    LGA  MIA      156     1096   19
# 9928: 2014    10  31         1       -11      US    LGA  MIA      164     1096   15

What's happening here? {.bs-callout .bs-callout-info}

  • Read this again. The value provided for the second key column “MIA” has to find the matching values in dest key column on the matching rows provided by the first key column origin. We cannot skip the values of the key columns that come before it. Therefore we provide all unique values from key column origin.

  • “MIA” is automatically recycled to fit the length of unique(origin) which is 3.

2) Combining keys with j and by

All we have seen so far is the same concept – obtaining row indices in i, but just using a different method – using keys. It shouldn't be surprising that we can do exactly the same things in j and by as seen from the previous vignettes. We will highlight this with a few examples.

a) Select in j

– Return arr_delay column as a data.table corresponding to origin = "LGA" and dest = "TPA".

key(flights)
# [1] "origin" "dest"
flights[.("LGA", "TPA"), .(arr_delay)]
#       arr_delay
#    1:         1
#    2:        14
#    3:       -17
#    4:        -4
#    5:       -12
#   ---          
# 1848:        39
# 1849:       -24
# 1850:       -12
# 1851:        21
# 1852:       -11

{.bs-callout .bs-callout-info}

  • The row indices corresponding to origin == "LGA" and dest == "TPA" are obtained using key based subset.

  • Once we have the row indices, we look at j which requires only the arr_delay column. So we simply select the column arr_delay for those row indices in the exact same way as we have seen in Introduction to data.table vignette.

  • We could have returned the result by using with = FALSE as well.

    flights[.("LGA", "TPA"), "arr_delay", with = FALSE]
    

b) Chaining

– On the result obtained above, use chaining to order the column in decreasing order.

flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)]
#       arr_delay
#    1:       486
#    2:       380
#    3:       351
#    4:       318
#    5:       300
#   ---          
# 1848:       -40
# 1849:       -43
# 1850:       -46
# 1851:       -48
# 1852:       -49

c) Compute or do in j

– Find the maximum arrival delay corresponding to origin = "LGA" and dest = "TPA".

flights[.("LGA", "TPA"), max(arr_delay)]
# [1] 486

{.bs-callout .bs-callout-info}

  • We can verify that the result is identical to first value (486) from the previous example.

d) sub-assign by reference using := in j

We have seen this example already in the Reference semantics vignette. Let's take a look at all the hours available in the flights data.table:

# get all 'hours' in flights
flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24

We see that there are 25 unique values in total in the data. Both 0 and 24 hours seem to be present. Let's go ahead and replace 24 with 0, but this time using key.

setkey(flights, hour)
key(flights)
# [1] "hour"
flights[.(24), hour := 0L]
key(flights)
# NULL

{.bs-callout .bs-callout-info}

  • We first set key to hour. This reorders flights by the column hour and marks that column as the key column.

  • Now we can subset on hour by using the .() notation. We subset for the value 24 and obtain the corresponding row indices.

  • And on those row indices, we replace the key column with the value 0.

  • Since we have replaced values on the key column, the data.table flights isn't sorted by hour anymore. Therefore, the key has been automatically removed by setting to NULL.

# Now, there shouldn't be any 24 in the hour column.

flights[, sort(unique(hour))]
#  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

e) Aggregation using by

Let's set the key back to origin, dest first.

setkey(flights, origin, dest)
key(flights)
# [1] "origin" "dest"

– Get the maximum departure delay for each month corresponding to origin = "JFK". Order the result by month

ans <- flights["JFK", max(dep_delay), keyby = month]
head(ans)
#    month   V1
# 1:     1  881
# 2:     2 1014
# 3:     3  920
# 4:     4 1241
# 5:     5  853
# 6:     6  798
key(ans)
# [1] "month"

{.bs-callout .bs-callout-info}

  • We subset on the key column origin to obtain the row indices corresponding to “JFK”.

  • Once we obtain the row indices, we only need two columns - month to group by and dep_delay to obtain max() for each group. data.table's query optimisation therefore subsets just those two columns corresponding to the row indices obtained in i, for speed and memory efficiency.

  • And on that subset, we group by month and compute max(dep_delay).

  • We use keyby to automatically key that result by month. Now we understand what that means. In addition to ordering, it also sets month as the key column.

3) Additional arguments - mult and nomatch

a) The mult argument

We can choose, for each query, if “all” the matching rows should be returned, or just the “first” or “last” using the mult argument. The default value is “all” - what we've seen so far.

– Subset only the first matching row from all rows where origin matches “JFK” and dest matches “MIA”

flights[.("JFK", "MIA"), mult = "first"]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     1   1         6         3      AA    JFK  MIA      157     1089    5

– Subset only the last matching row of all the rows where origin matches “LGA”, “JFK”, “EWR” and dest matches “XNA”

flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last"]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     5  23       163       148      MQ    LGA  XNA      158     1147   18
# 2:   NA    NA  NA        NA        NA      NA    JFK  XNA       NA       NA   NA
# 3: 2014     2   3       231       268      EV    EWR  XNA      184     1131   12

{.bs-callout .bs-callout-info}

  • The query “JFK”, “XNA” doesn't match any rows in flights and therefore returns NA.

  • Once again, the query for second key column dest, “XNA”, is recycled to fit the length of the query for first key column origin, which is of length 3.

b) The nomatch argument

We can choose if queries that do not match should return NA or be skipped altogether using the nomatch argument.

– From the previous example, Subset all rows only if there's a match

flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = 0L]
#    year month day dep_delay arr_delay carrier origin dest air_time distance hour
# 1: 2014     5  23       163       148      MQ    LGA  XNA      158     1147   18
# 2: 2014     2   3       231       268      EV    EWR  XNA      184     1131   12

{.bs-callout .bs-callout-info}

  • Default value for nomatch is NA. Setting nomatch = 0L skips queries with no matches.

  • The query “JFK”, “XNA” doesn't match any rows in flights and therefore is skipped.

4) binary search vs vector scans

We have seen so far how we can set and use keys to subset. But what's the advantage? For example, instead of doing:

# key by origin,dest columns
flights[.("JFK", "MIA")]

we could have done:

flights[origin == "JFK" & dest == "MIA"]

One advantage very likely is shorter syntax. But even more than that, binary search based subsets are incredibly fast.

a) Performance of binary search approach

To illustrate, let's create a sample data.table with 20 million rows and three columns and key it by columns x and y.

set.seed(2L)
N = 2e7L
DT = data.table(x = sample(letters, N, TRUE),
                y = sample(1000L, N, TRUE),
              val = runif(N), key = c("x", "y"))
print(object.size(DT), units = "Mb")
# 381.5 Mb

key(DT)
# [1] "x" "y"

DT is ~380MB. It is not really huge, but this will do to illustrate the point.

From what we have seen in the Introduction to data.table vignette, we can subset those rows where columns x = "g" and y = 877 as follows:

## (1) Usual way of subsetting - vector scan approach
t1 <- system.time(ans1 <- DT[x == "g" & y == 877L])
t1
#    user  system elapsed 
#   0.140   0.012   0.155
head(ans1)
#    x   y       val
# 1: g 877 0.3946652
# 2: g 877 0.9424275
# 3: g 877 0.7068512
# 4: g 877 0.6959935
# 5: g 877 0.9673482
# 6: g 877 0.4842585
dim(ans1)
# [1] 761   3

Now let's try to subset by using keys.

## (2) Subsetting using keys
t2 <- system.time(ans2 <- DT[.("g", 877L)])
t2
#    user  system elapsed 
#   0.004   0.000   0.000
head(ans2)
#    x   y       val
# 1: g 877 0.3946652
# 2: g 877 0.9424275
# 3: g 877 0.7068512
# 4: g 877 0.6959935
# 5: g 877 0.9673482
# 6: g 877 0.4842585
dim(ans2)
# [1] 761   3

identical(ans1$val, ans2$val)
# [1] TRUE
  • The speedup is ~155x!

b) Why does keying a data.table result in blazing fast subsets?

To understand that, let's first look at what vector scan approach (method 1) does.

Vector scan approach: {.bs-callout .bs-callout-info}

  • The column x is searched for the value “g” row by row, on all 20 million of them. This results in a logical vector of size 20 million, with values TRUE, FALSE or NA corresponding to x's value.

  • Similarly, the column y is searched for 877 on all 20 million rows one by one, and stored in another logical vector.

  • Element wise & operations are performed on the intermediate logical vectors and all the rows where the expression evaluates to TRUE are returned.

This is what we call a vector scan approach. And this is quite inefficient, especially on larger tables and when one needs repeated subsetting, because it has to scan through all the rows each time.

#

Now let us look at binary search approach (method 2). Recall from Properties of key - setting keys reorders the data.table by key columns. Since the data is sorted, we don't have to scan through the entire length of the column! We can instead use binary search to search a value in O(log n) as opposed to O(n) in case of vector scan approach, where n is the number of rows in the data.table.

Binary search approach: {.bs-callout .bs-callout-info}

Here's a very simple illustration. Let's consider the (sorted) numbers shown below:

1, 5, 10, 19, 22, 23, 30

Suppose we'd like to find the matching position of the value 1 using binary search. This is how we would proceed, because we know that the data is sorted.

  • Start with the middle value = 19. Is 1 == 19? No. 1 < 19.

  • Since the value we're looking for is smaller than 19, it should be somewhere before 19. So we can discard the rest of the half that are >= 19.

  • Our set is now reduced to 1, 5, 10. Grab the middle value once again = 5. Is 1 == 5? No. 1 < 5.

  • Our set is reduced to 1. Is 1 == 1? Yes. The corresponding index is also 1. And that's the only match.

A vector scan approach on the other hand would have to scan through all the values (here, 7).

#

It can be seen that with every search we reduce the number of searches by half. This is why binary search based subsets are incredibly fast. Since rows of each column of data.tables have contiguous locations in memory, the operations are performed in a very cache efficient manner (also contributes to speed).

In addition, since we obtain the matching row indices directly without having to create those huge logical vectors (equal to the number of rows in a data.table), it is quite memory efficient as well.

Summary

In this vignette, we have learnt another method to subset rows in i by keying a data.table. Setting keys allows us to perform blazing fast subsets by using binary search. In particular, we have seen how to

{.bs-callout .bs-callout-info}

  • set key and subset using the key on a data.table.

  • subset using keys which fetches row indices in i, but much faster.

  • combine key based subsets with j and by. Note that the j and by operations are exactly the same as before.

#

Key based subsets are incredibly fast and are particularly useful when the task involves repeated subsetting. But it may not be always desirable to set key and physically reorder the data.table. In the next vignette, we will address this using a new feature – secondary indexes.


data.table/inst/doc/datatable-intro.R0000644000175100001440000001726113172212362017214 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ----echo = FALSE--------------------------------------------------------------------------------- options(width = 100L) ## ------------------------------------------------------------------------------------------------- flights <- fread("flights14.csv") flights dim(flights) ## ------------------------------------------------------------------------------------------------- DT = data.table(ID = c("b","b","b","a","a","c"), a = 1:6, b = 7:12, c = 13:18) DT class(DT$ID) ## ----eval = FALSE--------------------------------------------------------------------------------- # DT[i, j, by] # # ## R: i j by # ## SQL: where select | update group by ## ------------------------------------------------------------------------------------------------- ans <- flights[origin == "JFK" & month == 6L] head(ans) ## ------------------------------------------------------------------------------------------------- ans <- flights[1:2] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[order(origin, -dest)] head(ans) ## ------------------------------------------------------------------------------------------------- odt = data.table(col = sample(1e7)) (t1 <- system.time(ans1 <- odt[base::order(col)])) ## uses order from base R (t2 <- system.time(ans2 <- odt[order(col)])) ## uses data.table's forder (identical(ans1, ans2)) ## ----echo = FALSE--------------------------------------------------------------------------------- rm(odt); rm(ans1); rm(ans2); rm(t1); rm(t2) ## ------------------------------------------------------------------------------------------------- ans <- flights[, arr_delay] head(ans) ## 
------------------------------------------------------------------------------------------------- ans <- flights[, list(arr_delay)] head(ans) ## ------------------------------------------------------------------------------------------------- ans <- flights[, .(arr_delay, dep_delay)] head(ans) ## alternatively # ans <- flights[, list(arr_delay, dep_delay)] ## ------------------------------------------------------------------------------------------------- ans <- flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)] head(ans) ## ------------------------------------------------------------------------------------------------- ans <- flights[, sum((arr_delay + dep_delay) < 0)] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[origin == "JFK" & month == 6L, .(m_arr = mean(arr_delay), m_dep = mean(dep_delay))] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[origin == "JFK" & month == 6L, length(dest)] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[origin == "JFK" & month == 6L, .N] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[, c("arr_delay", "dep_delay"), with = FALSE] head(ans) ## ------------------------------------------------------------------------------------------------- DF = data.frame(x = c(1,1,1,2,2,3,3,3), y = 1:8) ## (1) normal way DF[DF$x > 1, ] # data.frame needs that ',' as well ## (2) using with DF[with(DF, x > 1), ] ## ----eval = FALSE--------------------------------------------------------------------------------- # ## not run # # # returns all columns except arr_delay and dep_delay # ans <- flights[, !c("arr_delay", "dep_delay"), with = FALSE] # # or # ans <- flights[, -c("arr_delay", "dep_delay"), with = FALSE] ## ----eval = 
FALSE--------------------------------------------------------------------------------- # ## not run # # # returns year,month and day # ans <- flights[, year:day, with = FALSE] # # returns day, month and year # ans <- flights[, day:year, with = FALSE] # # returns all columns except year, month and day # ans <- flights[, -(year:day), with = FALSE] # ans <- flights[, !(year:day), with = FALSE] ## ------------------------------------------------------------------------------------------------- ans <- flights[, .(.N), by = .(origin)] ans ## or equivalently using a character vector in 'by' # ans <- flights[, .(.N), by = "origin"] ## ------------------------------------------------------------------------------------------------- ans <- flights[, .N, by = origin] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .N, by = origin] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .N, by = .(origin,dest)] head(ans) ## or equivalently using a character vector in 'by' # ans <- flights[carrier == "AA", .N, by = c("origin", "dest")] ## ------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .(mean(arr_delay), mean(dep_delay)), by = .(origin, dest, month)] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .(mean(arr_delay), mean(dep_delay)), keyby = .(origin, dest, month)] ans ## ------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .N, by = .(origin, dest)] ## ------------------------------------------------------------------------------------------------- ans <- ans[order(origin, -dest)] head(ans) ## 
------------------------------------------------------------------------------------------------- ans <- flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)] head(ans, 10) ## ----eval = FALSE--------------------------------------------------------------------------------- # DT[ ... # ][ ... # ][ ... # ] ## ------------------------------------------------------------------------------------------------- ans <- flights[, .N, .(dep_delay>0, arr_delay>0)] ans ## ------------------------------------------------------------------------------------------------- DT DT[, print(.SD), by = ID] ## ------------------------------------------------------------------------------------------------- DT[, lapply(.SD, mean), by = ID] ## ------------------------------------------------------------------------------------------------- flights[carrier == "AA", ## Only on trips with carrier "AA" lapply(.SD, mean), ## compute the mean by = .(origin, dest, month), ## for every 'origin,dest,month' .SDcols = c("arr_delay", "dep_delay")] ## for just those specified in .SDcols ## ------------------------------------------------------------------------------------------------- ans <- flights[, head(.SD, 2), by = month] head(ans) ## ------------------------------------------------------------------------------------------------- DT[, .(val = c(a,b)), by = ID] ## ------------------------------------------------------------------------------------------------- DT[, .(val = list(c(a,b))), by = ID] ## ------------------------------------------------------------------------------------------------- ## (1) look at the difference between DT[, print(c(a,b)), by = ID] ## (2) and DT[, print(list(c(a,b))), by = ID] ## ----eval = FALSE--------------------------------------------------------------------------------- # DT[i, j, by] data.table/inst/doc/datatable-secondary-indices-and-auto-indexing.Rmd0000644000175100001440000003113113172210047025304 0ustar hornikusers--- title: "Secondary 
indices and auto indexing" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Secondary indices and auto indexing} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ``` This vignette assumes that the reader is familiar with data.table's `[i, j, by]` syntax, and how to perform fast key based subsets. If you're not familiar with these concepts, please read the *"Introduction to data.table"*, *"Reference semantics"* and *"Keys and fast binary search based subset"* vignettes first. *** ## Data {#data} We will use the same `flights` data as in the *"Introduction to data.table"* vignette. ```{r echo = FALSE} options(width = 100L) ``` ```{r} flights <- fread("flights14.csv") head(flights) dim(flights) ``` ## Introduction In this vignette, we will * discuss *secondary indices* and provide rationale as to why we need them by citing cases where setting keys is not necessarily ideal, * perform fast subsetting, once again, but using the new `on` argument, which computes secondary indices internally for the task (temporarily), and reuses one if it already exists, * and finally look at *auto indexing* which goes a step further and creates secondary indices automatically, but does so on native R syntax for subsetting. ## 1. Secondary indices ### a) What are secondary indices? Secondary indices are similar to `keys` in *data.table*, except for two major differences: * It *doesn't* physically reorder the entire data.table in RAM. Instead, it only computes the order for the set of columns provided and stores that *order vector* in an additional attribute called `index`. * There can be more than one secondary index for a data.table (as we will see below).
### b) Set and get secondary indices #### -- How can we set the column `origin` as a secondary index in the *data.table* `flights`? ```{r} setindex(flights, origin) head(flights) ## alternatively we can provide character vectors to the function 'setindexv()' # setindexv(flights, "origin") # useful to program with # 'index' attribute added names(attributes(flights)) ``` * `setindex()` and `setindexv()` allow adding a secondary index to the data.table. * Note that `flights` is **not** physically reordered in increasing order of `origin`, as would have been the case with `setkey()`. * Also note that the attribute `index` has been added to `flights`. * `setindex(flights, NULL)` would remove all secondary indices. #### -- How can we get all the secondary indices set so far in `flights`? ```{r} indices(flights) setindex(flights, origin, dest) indices(flights) ``` * The function `indices()` returns all current secondary indices in the data.table. If none exists, `NULL` is returned. * Note that by creating another index on the columns `origin, dest`, we do not lose the first index created on the column `origin`, i.e., we can have multiple secondary indices. ### c) Why do we need secondary indices? #### -- Reordering a data.table can be expensive and not always ideal Consider the case where you would like to perform a fast key based subset on `origin` column for the value "JFK". We'd do this as: ```{r, eval = FALSE} ## not run setkey(flights, origin) flights["JFK"] # or flights[.("JFK")] ``` #### `setkey()` requires: {.bs-callout .bs-callout-info} a) computing the order vector for the column(s) provided, here, `origin`, and b) reordering the entire data.table, by reference, based on the order vector computed. # Computing the order isn't the time consuming part, since data.table uses true radix sorting on integer, character and numeric vectors. However reordering the data.table could be time consuming (depending on the number of rows and columns).
Unless our task involves repeated subsetting on the same column, fast key based subsetting could effectively be nullified by the time to reorder, depending on our data.table dimensions. #### -- There can be only one `key` at the most Now if we would like to repeat the same operation but on `dest` column instead, for the value "LAX", then we have to `setkey()`, *again*. ```{r, eval = FALSE} ## not run setkey(flights, dest) flights["LAX"] ``` And this reorders `flights` by `dest`, *again*. What we would really like is to be able to perform the fast subsetting by eliminating the reordering step. And this is precisely what *secondary indices* allow for! #### -- Secondary indices can be reused Since there can be multiple secondary indices, and creating an index is as simple as storing the order vector as an attribute, this allows us to even eliminate the time to recompute the order vector if an index already exists. #### -- The new `on` argument allows for cleaner syntax and automatic creation and reuse of secondary indices As we will see in the next section, the `on` argument provides several advantages: #### `on` argument {.bs-callout .bs-callout-info} * enables subsetting by computing secondary indices on the fly. This eliminates having to do `setindex()` every time. * allows easy reuse of existing indices by just checking the attributes. * allows for a cleaner syntax by having the columns on which the subset is performed as part of the syntax. This makes the code easier to follow when looking at it at a later point. Note that the `on` argument can also be used on keyed subsets. In fact, we encourage providing the `on` argument even when subsetting using keys, for better readability. # ## 2.
Fast subsetting using `on` argument and secondary indices ### a) Fast subsets in `i` #### -- Subset all rows where the origin airport matches *"JFK"* using `on` ```{r} flights["JFK", on = "origin"] ## alternatively # flights[.("JFK"), on = "origin"] (or) # flights[list("JFK"), on = "origin"] ``` * This statement performs a fast binary search based subset as well, by computing the index on the fly. However, note that it doesn't save the index as an attribute automatically. This may change in the future. * If we had already created a secondary index, using `setindex()`, then `on` would reuse it instead of (re)computing it. We can see that by using `verbose = TRUE`: ```{r} setindex(flights, origin) flights["JFK", on = "origin", verbose = TRUE][1:5] ``` #### -- How can I subset based on `origin` *and* `dest` columns? For example, if we want to subset `"JFK", "LAX"` combination, then: ```{r} flights[.("JFK", "LAX"), on = c("origin", "dest")][1:5] ``` * `on` argument accepts a character vector of column names corresponding to the order provided to `i-argument`. * Since the time to compute the secondary index is quite small, we don't have to use `setindex()`, unless, once again, the task involves repeated subsetting on the same column. ### b) Select in `j` All the operations we will discuss below are no different to the ones we already saw in the *Keys and fast binary search based subset* vignette. Except we'll be using the `on` argument instead of setting keys. #### -- Return `arr_delay` column alone as a data.table corresponding to `origin = "LGA"` and `dest = "TPA"` ```{r} flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")] ``` ### c) Chaining #### -- On the result obtained above, use chaining to order the column in decreasing order. ```{r} flights[.("LGA", "TPA"), .(arr_delay), on = c("origin", "dest")][order(-arr_delay)] ``` ### d) Compute or *do* in `j` #### -- Find the maximum arrival delay corresponding to `origin = "LGA"` and `dest = "TPA"`.
```{r} flights[.("LGA", "TPA"), max(arr_delay), on = c("origin", "dest")] ``` ### e) *sub-assign* by reference using `:=` in `j` We have seen this example already in the *Reference semantics* and *Keys and fast binary search based subset* vignettes. Let's take a look at all the `hours` available in the `flights` *data.table*: ```{r} # get all 'hours' in flights flights[, sort(unique(hour))] ``` We see that there are a total of `25` unique values in the data. Both *0* and *24* hours seem to be present. Let's go ahead and replace *24* with *0*, but this time using `on` instead of setting keys. ```{r} flights[.(24L), hour := 0L, on = "hour"] ``` Now, let's check if `24` is replaced with `0` in the `hour` column. ```{r} flights[, sort(unique(hour))] ``` * This is particularly a huge advantage of secondary indices. Previously, just to update a few rows of `hour`, we had to `setkey()` on it, which inevitably reorders the entire data.table. With `on`, the order is preserved, and the operation is much faster! Looking at the code, the task we wanted to perform is also quite clear. ### f) Aggregation using `by` #### -- Get the maximum departure delay for each `month` corresponding to `origin = "JFK"`. Order the result by `month` ```{r} ans <- flights["JFK", max(dep_delay), keyby = month, on = "origin"] head(ans) ``` * We would have had to set the `key` back to `origin, dest` again, if we did not use `on` which internally builds secondary indices on the fly. ### g) The *mult* argument The other arguments including `mult` work exactly the same way as we saw in the *Keys and fast binary search based subset* vignette. The default value for `mult` is "all". We can instead choose to return only the "first" or "last" matching row.
#### -- Subset only the first matching row where `dest` matches *"BOS"* and *"DAY"* ```{r} flights[c("BOS", "DAY"), on = "dest", mult = "first"] ``` #### -- Subset only the last matching row where `origin` matches *"LGA", "JFK", "EWR"* and `dest` matches *"XNA"* ```{r} flights[.(c("LGA", "JFK", "EWR"), "XNA"), on = c("origin", "dest"), mult = "last"] ``` ### h) The *nomatch* argument We can choose if queries that do not match should return `NA` or be skipped altogether using the `nomatch` argument. #### -- From the previous example, subset all rows only if there's a match ```{r} flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", on = c("origin", "dest"), nomatch = 0L] ``` * There are no flights connecting "JFK" and "XNA". Therefore, that row is skipped in the result. ## 3. Auto indexing First we looked at how to fast subset using binary search using *keys*. Then we figured out that we could improve performance even further and have cleaner syntax by using secondary indices. What could be better than that? The answer is to optimise *native R syntax* to use secondary indices internally so that we can have the same performance without having to use newer syntax. That is what *auto indexing* does. At the moment, it is only implemented for binary operators `==` and `%in%`. And it only works with a single column at the moment as well. An index is automatically created *and* saved as an attribute. That is, unlike the `on` argument which computes the index on the fly each time, a secondary index is created here. Let's start by creating a data.table big enough to highlight the advantage. ```{r} set.seed(1L) dt = data.table(x = sample(1e5L, 1e7L, TRUE), y = runif(100L)) print(object.size(dt), units = "Mb") ``` When we use `==` or `%in%` on a single column for the first time, a secondary index is created automatically, and it is used to perform the subset.
```{r} ## have a look at all the attribute names names(attributes(dt)) ## run the first time (t1 <- system.time(ans <- dt[x == 989L])) head(ans) ## secondary index is created names(attributes(dt)) indices(dt) ``` The time to subset the first time is the time to create the index + the time to subset. Since creating a secondary index involves only creating the order vector, this combined operation is faster than vector scans in many cases. But the real advantage comes in successive subsets. They are extremely fast. ```{r} ## successive subsets (t2 <- system.time(dt[x == 989L])) system.time(dt[x %in% 1989:2012]) ``` * Running the first time took `r sprintf("%.3f", t1["elapsed"])` seconds whereas the second time took `r sprintf("%.3f", t2["elapsed"])` seconds. * Auto indexing can be disabled by setting the global argument `options(datatable.auto.index = FALSE)`. * Disabling auto indexing still allows the use of indices created explicitly with `setindex` or `setindexv`. You can disable indices fully by setting the global argument `options(datatable.use.index = FALSE)`. # In the future, we plan to extend auto indexing to expressions involving more than one column. Also we are working on extending binary search to work with more binary operators like `<`, `<=`, `>` and `>=`. Once done, it would be straightforward to extend it to these operators as well. We will extend fast *subsets* using keys and secondary indices to *joins* in the next vignette, *"Joins and rolling joins"*.
*** data.table/inst/doc/datatable-keys-fast-subset.R0000644000175100001440000001400413172212366021266 0ustar hornikusers## ---- echo = FALSE, message = FALSE-------------------------------------- require(data.table) knitr::opts_chunk$set( comment = "#", error = FALSE, tidy = FALSE, cache = FALSE, collapse = TRUE) ## ----echo = FALSE--------------------------------------------------------------------------------- options(width = 100L) ## ------------------------------------------------------------------------------------------------- flights <- fread("flights14.csv") head(flights) dim(flights) ## ------------------------------------------------------------------------------------------------- set.seed(1L) DF = data.frame(ID1 = sample(letters[1:2], 10, TRUE), ID2 = sample(1:3, 10, TRUE), val = sample(10), stringsAsFactors = FALSE, row.names = sample(LETTERS[1:10])) DF rownames(DF) ## ------------------------------------------------------------------------------------------------- DF["C", ] ## ----eval = FALSE--------------------------------------------------------------------------------- # rownames(DF) = sample(LETTERS[1:5], 10, TRUE) # # Warning: non-unique values when setting 'row.names': 'C', 'D' # # Error in `row.names<-.data.frame`(`*tmp*`, value = value): duplicate 'row.names' are not allowed ## ------------------------------------------------------------------------------------------------- DT = as.data.table(DF) DT rownames(DT) ## ------------------------------------------------------------------------------------------------- setkey(flights, origin) head(flights) ## alternatively we can provide character vectors to the function 'setkeyv()' # setkeyv(flights, "origin") # useful to program with ## ------------------------------------------------------------------------------------------------- flights[.("JFK")] ## alternatively # flights[J("JFK")] (or) # flights[list("JFK")] ## ----eval = 
FALSE--------------------------------------------------------------------------------- # flights["JFK"] ## same as flights[.("JFK")] ## ----eval = FALSE--------------------------------------------------------------------------------- # flights[c("JFK", "LGA")] ## same as flights[.(c("JFK", "LGA"))] ## ------------------------------------------------------------------------------------------------- key(flights) ## ------------------------------------------------------------------------------------------------- setkey(flights, origin, dest) head(flights) ## or alternatively # setkeyv(flights, c("origin", "dest")) # provide a character vector of column names key(flights) ## ------------------------------------------------------------------------------------------------- flights[.("JFK", "MIA")] ## ------------------------------------------------------------------------------------------------- key(flights) flights[.("JFK")] ## or in this case simply flights["JFK"], for convenience ## ------------------------------------------------------------------------------------------------- flights[.(unique(origin), "MIA")] ## ------------------------------------------------------------------------------------------------- key(flights) flights[.("LGA", "TPA"), .(arr_delay)] ## ----eval = FALSE--------------------------------------------------------------------------------- # flights[.("LGA", "TPA"), "arr_delay", with = FALSE] ## ------------------------------------------------------------------------------------------------- flights[.("LGA", "TPA"), .(arr_delay)][order(-arr_delay)] ## ------------------------------------------------------------------------------------------------- flights[.("LGA", "TPA"), max(arr_delay)] ## ------------------------------------------------------------------------------------------------- # get all 'hours' in flights flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- 
setkey(flights, hour) key(flights) flights[.(24), hour := 0L] key(flights) ## ------------------------------------------------------------------------------------------------- flights[, sort(unique(hour))] ## ------------------------------------------------------------------------------------------------- setkey(flights, origin, dest) key(flights) ## ------------------------------------------------------------------------------------------------- ans <- flights["JFK", max(dep_delay), keyby = month] head(ans) key(ans) ## ------------------------------------------------------------------------------------------------- flights[.("JFK", "MIA"), mult = "first"] ## ------------------------------------------------------------------------------------------------- flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last"] ## ------------------------------------------------------------------------------------------------- flights[.(c("LGA", "JFK", "EWR"), "XNA"), mult = "last", nomatch = 0L] ## ----eval = FALSE--------------------------------------------------------------------------------- # # key by origin,dest columns # flights[.("JFK", "MIA")] ## ----eval = FALSE--------------------------------------------------------------------------------- # flights[origin == "JFK" & dest == "MIA"] ## ------------------------------------------------------------------------------------------------- set.seed(2L) N = 2e7L DT = data.table(x = sample(letters, N, TRUE), y = sample(1000L, N, TRUE), val = runif(N), key = c("x", "y")) print(object.size(DT), units = "Mb") key(DT) ## ------------------------------------------------------------------------------------------------- ## (1) Usual way of subsetting - vector scan approach t1 <- system.time(ans1 <- DT[x == "g" & y == 877L]) t1 head(ans1) dim(ans1) ## ------------------------------------------------------------------------------------------------- ## (2) Subsetting using keys t2 <- system.time(ans2 <- DT[.("g", 877L)]) t2 head(ans2) 
dim(ans2) identical(ans1$val, ans2$val) ## ----eval = FALSE--------------------------------------------------------------------------------- # 1, 5, 10, 19, 22, 23, 30 data.table/tests/0000755000175100001440000000000013172210047013426 5ustar hornikusersdata.table/tests/autoprint.R0000644000175100001440000000525113172210047015601 0ustar hornikusersrequire(data.table) # Tests the suppression of := output # Since this tests autoprinting at the console, it needs to use the .Rout.save mechanism in R CMD check DT = data.table(a=1:2) # Should print at console? DT # yes DT[1] # yes DT[2,a:=3L] # no DT # yes DT[FALSE,a:=3L] # no DT[a==4L,a:=5L] # no DT[a %in% 4:8, a:=5L] # no DT # yes print(DT[2,a:=4L]) # no print(DT) # yes if (TRUE) DT[2,a:=5L] # no. used to print before v1.9.5 if (TRUE) if (TRUE) DT[2,a:=6L] # no. used to print before v1.9.5 (function(){DT[2,a:=5L];NULL})() # print NULL DT # no (from v1.9.5+). := suppresses next auto print (can't distinguish just "DT" symbol alone at the prompt) DT # yes. 2nd time needed, or solutions below (function(){DT[2,a:=5L];NULL})() # print NULL DT[] # yes. guaranteed print (function(){DT[2,a:=5L];NULL})() # print NULL print(DT) # no. only DT[] is guaranteed print from v1.9.6 and R 3.2.0 (function(){DT[2,a:=5L][];NULL})() # print NULL DT # yes. i) function needs to add [] after last one, so that "DT" alone is guaranteed anyway (function(){DT[2,a:=5L];DT[];NULL})() # print NULL DT # yes. ii) or as a separate DT[] after the last := inside the function DT2 = data.table(b=3:4) # no (function(){DT[2,a:=6L];DT2[1,b:=7L];NULL})() DT # yes. last := was on DT2 not DT {DT[2,a:=6L];invisible()} # no print(DT) # no (function(){print(DT[2,a:=7L]);print(DT);invisible()})() # yes*2 {print(DT[2,a:=8L]);print(DT);invisible()} # yes*1 Not within function so as at prompt DT[1][,a:=9L] # no (was too tricky to detect that DT[1] is a new object). 
Simple rule is that := always doesn't print DT[2,a:=10L][1] # yes DT[1,a:=10L][1,a:=10L] # no DT[,a:=as.integer(a)] # no DT[1,a:=as.integer(a)] # no DT[1,a:=10L][] # yes. ...[] == oops, forgot print(...) # Test that error in := doesn't suppress next valid print, bug #2376 tryCatch(DT[,foo:=ColumnNameTypo], error=function(e) e$message) # error: not found. DT # yes DT # yes data.table/tests/autoprint.Rout.save0000644000175100001440000000744513172210047017275 0ustar hornikusers R version 3.1.1 (2014-07-10) -- "Sock it to Me" Copyright (C) 2014 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > require(data.table) Loading required package: data.table > # Tests the suppression of := output > # Since this tests autoprinting at the console, it needs to use the .Rout.save mechanism in R CMD check > DT = data.table(a=1:2) # Should print at console? > DT # yes a 1: 1 2: 2 > DT[1] # yes a 1: 1 > DT[2,a:=3L] # no > DT # yes a 1: 1 2: 3 > DT[FALSE,a:=3L] # no > DT[a==4L,a:=5L] # no > DT[a %in% 4:8, a:=5L] # no > DT # yes a 1: 1 2: 3 > print(DT[2,a:=4L]) # no > print(DT) # yes a 1: 1 2: 4 > if (TRUE) DT[2,a:=5L] # no. used to print before v1.9.5 > if (TRUE) if (TRUE) DT[2,a:=6L] # no. used to print before v1.9.5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT # no (from v1.9.5+). := suppresses next auto print (can't distinguish just "DT" symbol alone at the prompt) > DT # yes. 
2nd time needed, or solutions below a 1: 1 2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > DT[] # yes. guaranteed print a 1: 1 2: 5 > (function(){DT[2,a:=5L];NULL})() # print NULL NULL > print(DT) # no. only DT[] is guaranteed print from v1.9.6 and R 3.2.0 > (function(){DT[2,a:=5L][];NULL})() # print NULL NULL > DT # yes. i) function needs to add [] after last one, so that "DT" alone is guaranteed anyway a 1: 1 2: 5 > (function(){DT[2,a:=5L];DT[];NULL})() # print NULL NULL > DT # yes. ii) or as a separate DT[] after the last := inside the function a 1: 1 2: 5 > DT2 = data.table(b=3:4) # no > (function(){DT[2,a:=6L];DT2[1,b:=7L];NULL})() NULL > DT # yes. last := was on DT2 not DT a 1: 1 2: 6 > {DT[2,a:=6L];invisible()} # no > print(DT) # no > (function(){print(DT[2,a:=7L]);print(DT);invisible()})() # yes*2 a 1: 1 2: 7 a 1: 1 2: 7 > {print(DT[2,a:=8L]);print(DT);invisible()} # yes*1 Not within function so as at prompt a 1: 1 2: 8 > DT[1][,a:=9L] # no (was too tricky to detect that DT[1] is a new object). Simple rule is that := always doesn't print > DT[2,a:=10L][1] # yes a 1: 1 > DT[1,a:=10L][1,a:=10L] # no > DT[,a:=as.integer(a)] # no > DT[1,a:=as.integer(a)] # no > DT[1,a:=10L][] # yes. ...[] == oops, forgot print(...) a 1: 10 2: 10 > > # Test that error in := doesn't suppress next valid print, bug #2376 > tryCatch(DT[,foo:=ColumnNameTypo], error=function(e) e$message) # error: not found. 
[1] "object 'ColumnNameTypo' not found" > DT # yes a 1: 10 2: 10 > DT # yes a 1: 10 2: 10 > > > proc.time() user system elapsed 3.14 0.10 3.22 data.table/tests/knitr.R0000644000175100001440000000037113172210047014701 0ustar hornikusersif (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { require(knitr) knit("knitr.Rmd", quiet=TRUE) cat(readLines("knitr.md"), sep="\n") } else { cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") } data.table/tests/knitr.Rout.mock0000644000175100001440000000106313172210047016360 0ustar hornikusersLoading required package: knitr Loading required package: data.table ```r require(data.table) # print? DT = data.table(x=1:3, y=4:6) # no DT # yes ``` ``` ## x y ## 1: 1 4 ## 2: 2 5 ## 3: 3 6 ``` ```r DT[, z := 7:9] # no print(DT[, z := 10:12]) # yes ``` ``` ## x y z ## 1: 1 4 10 ## 2: 2 5 11 ## 3: 3 6 12 ``` ```r if (1 < 2) DT[, a := 1L] # no DT # yes ``` ``` ## x y z a ## 1: 1 4 10 1 ## 2: 2 5 11 1 ## 3: 3 6 12 1 ``` Some text. data.table/tests/knitr.Rout.save0000644000175100001440000000276313172210047016375 0ustar hornikusers R version 3.1.1 (2014-07-10) -- "Sock it to Me" Copyright (C) 2014 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. 
> if (suppressPackageStartupMessages(requireNamespace("knitr", quietly = TRUE))) { + require(knitr) + knit("knitr.Rmd", quiet=TRUE) + cat(readLines("knitr.md"), sep="\n") + } else { + cat(readLines("knitr.Rout.mock", warn = FALSE), sep="\n") + } Loading required package: knitr Loading required package: data.table ```r require(data.table) # print? DT = data.table(x=1:3, y=4:6) # no DT # yes ``` ``` ## x y ## 1: 1 4 ## 2: 2 5 ## 3: 3 6 ``` ```r DT[, z := 7:9] # no print(DT[, z := 10:12]) # yes ``` ``` ## x y z ## 1: 1 4 10 ## 2: 2 5 11 ## 3: 3 6 12 ``` ```r if (1 < 2) DT[, a := 1L] # no DT # yes ``` ``` ## x y z a ## 1: 1 4 10 1 ## 2: 2 5 11 1 ## 3: 3 6 12 1 ``` Some text. > > > proc.time() user system elapsed 3.116 0.128 3.257 data.table/tests/knitr.Rmd0000644000175100001440000000055113172210047015222 0ustar hornikusers```{r test_id, message=FALSE, results="show", echo=TRUE, warning=FALSE} require(data.table) # print? DT = data.table(x=1:3, y=4:6) # no DT # yes DT[, z := 7:9] # no print(DT[, z := 10:12]) # yes if (1 < 2) DT[, a := 1L] # no DT # yes ``` Some text. 
data.table/tests/testthat.R0000644000175100001440000000020013172210047015401 0ustar hornikusersif(requireNamespace("testthat", quietly = TRUE)){ library(testthat) library(data.table) test_check("data.table") } data.table/tests/testthat/0000755000175100001440000000000013174561362015301 5ustar hornikusersdata.table/tests/testthat/test-data.frame-like.R0000644000175100001440000001403013172210047021310 0ustar hornikuserscontext("data.frame like functions (merge, subset, transform)") ############################################################################### ## Merge test_that("`x` columns are valid (bug #1299)", { d1 <- data.table(x=c(1,3,8), y1=rnorm(3), key="x") d2 <- data.table(x=c(3,8,10), y2=rnorm(3), key="x") ans1 <- merge(d1, d2, by="x") ans2 <- cbind(d1[2:3], y2=d2[1:2]$y2) setkey(ans2, x) expect_equal(ans1, ans2, info="Original test #230") }) test_that("`xkey` column names are valid in merge (bug#1299", { d1 <- data.table(xkey=c(1,3,8), y1=rnorm(3), key="xkey") d2 <- data.table(xkey=c(3,8,10), y2=rnorm(3), key="xkey") ans2 <- cbind(d1[2:3], y2=d2[1:2]$y2) setkey(ans2, xkey) expect_equal(merge(d1, d2, by="xkey"), ans2, info="Original test #238") }) test_that("one column merges work (bug #1241)", { dt <- data.table(a=rep(1:2,each=3), b=1:6, key="a") y <- data.table(a=c(0,1), bb=c(10,11), key="a") expect_equal(merge(y, dt), data.table(a=1L, bb=11, b=1:3, key="a"), info="Original test #231") expect_equal(merge(y, dt, all=TRUE), data.table(a=rep(c(0L,1L,2L),c(1,3,3)), bb=rep(c(10,11,NA_real_),c(1,3,3)), b=c(NA_integer_,1:6), key="a"), info="Original test #232") ## y with only a key column y <- data.table(a=c(0,1), key="a") expect_equal(merge(y,dt), data.table(a=1L, b=1:3, key="a"), info="Original test #233") expect_equal(merge(y, dt, all=TRUE), data.table(a=rep(c(0L,1L,2L),c(1,3,3)), b=c(NA_integer_,1:6),key="a"), info="Original test #234") }) test_that("merging data.tables is almost like merging data.frames", { d1 <- data.table(a=sample(letters, 10), 
b=sample(1:100, 10), key='a') d2 <- data.table(a=d1$a, b=sample(1:50, 10), c=rnorm(10), key='a') dtm <- merge(d1, d2, by='a', suffixes=c(".xx", ".yy")) dtm.df <- as.data.frame(dtm) dfm <- merge(as.data.frame(d1), as.data.frame(d2), by='a', suffixes=c('.xx', '.yy')) expect_equal(unname(dtm.df), unname(dfm), info="Testing contents/data after merge") expect_equal(colnames(dtm), colnames(dfm), info="Original test #255 (testing suffixes parameter)") }) test_that("`suffixes` behavior can be toggled to pre 1.5.4 behavior", { dt1 <- data.table(a=letters[1:5], b=1:5, key="a") dt2 <- data.table(a=letters[3:8], b=1:6, key="a") # options(datatable.pre.suffixes=FALSE) # Option removed in 1.7.10 expect_equal(colnames(merge(dt1, dt2)), c("a", "b.x", "b.y")) #options(datatable.pre.suffixes=TRUE) #expect_equal(colnames(merge(dt1, dt2)), c("a", "b", "b.1"), # info="Pre 1.5.4 behavior not working") #options(datatable.pre.suffixes=FALSE) }) test_that("merge and auto-increment columns in y[x]", { ## merging tables that have common column names that end in *.1 gets ## tricky, because the y[x] mojo does some magic to increment the *.1 ## in the x (I think) and keep *.1 in the y x <- data.table(a=letters[1:10], b=1:10, b.1=1:10 * 10, key="a") y <- data.table(a=letters[1:10], b=letters[11:20], b.1=rnorm(10), key="a") M <- merge(x, y) m <- merge(as.data.frame(x), as.data.frame(y), by="a") expect_is(M, 'data.table') expect_is(m, 'data.frame') expect_true(all(names(M) %in% union(names(M), names(m)))) for (name in names(m)) { expect_equal(M[[name]], m[[name]]) } ## Original example that smoked out the bug M <- data.table(a=letters[1:10], b=1:10) m <- as.data.frame(M) ms <- lapply(1:3, function(x) data.table(a=letters[1:10], b=1:10 * 10^x)) for (i in 1:3) { M <- merge(M, ms[[i]], by='a', suffixes=c("", sprintf(".%d", i))) } for (i in 1:3) { m <- merge(m, as.data.frame(ms[[i]]), by='a', suffixes=c("", sprintf(".%d", i))) } expect_is(M, 'data.table') expect_is(m, 'data.frame') 
expect_true(all(names(M) %in% union(names(M), names(m)))) for (name in names(m)) { expect_equal(M[[name]], m[[name]]) } }) ############################################################################### ## subset test_that("simple subset maintains keys", { dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE), b=sample(c('a', 'b', 'c'), 20, replace=TRUE), c=sample(20), key='a') sub <- subset(dt, a == 'b') expect_equal(key(dt), key(sub)) }) test_that("subset using 'select' maintains key appropriately", { dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE), b=sample(c('a', 'b', 'c'), 20, replace=TRUE), c=sample(20), key=c('a', 'b')) sub.1 <- subset(dt, a == 'a', select=c('c', 'b', 'a')) expect_equal(key(sub.1), key(dt), info="reordering columns") sub.2 <- subset(dt, a == 'a', select=c('a', 'c')) expect_equal(key(sub.2), 'a', info="selected columns are prefix of key") sub.3 <- subset(dt, a == 'a', select=c('b', 'c')) expect_true(is.null(key(sub.3)), info="selected columns do not from a key prefix") sub.4 <- subset(dt, a == 'cc') expect_equal(nrow(sub.4), 0) expect_true(is.null(key(sub.4))) }) ############################################################################### ## transform test_that("transform maintains keys", { dt <- data.table(a=sample(c('a', 'b', 'c'), 20, replace=TRUE), b=sample(c('a', 'b', 'c'), 20, replace=TRUE), c=sample(20), key=c('a', 'b')) t1 <- transform(dt, d=c+4) expect_equal(key(t1), key(dt)) expect_equal(t1$d, dt$c + 4, info="transform was successful") t2 <- transform(dt, d=c+4, a=sample(c('x', 'y', 'z'), 20, replace=TRUE)) expect_true(is.null(key(t2)), info="transforming a key column nukes the key") ## This is probably not necessary, but let's just check that transforming ## a key column doesn't twist around the rows in the result. 
for (col in c('b', 'c')) { msg <- sprintf("mutating-key-transform maintains other columns [%s]", col) expect_equal(t2[[col]], dt[[col]], info=msg) } }) data.table/tests/testthat/test-S4.R0000644000175100001440000000353613172210047016663 0ustar hornikuserscontext("S4 Compatability") ## S4 class definitions to test setClass("Data.Table", contains="data.table") setClass("S4Composition", representation(data="data.table")) test_that("data.table can be a parent class", { ids <- sample(letters[1:3], 10, replace=TRUE) scores<- rnorm(10) dt <- data.table(id=ids, score=scores) dt.s4 <- new("Data.Table", data.table(id=ids, score=scores)) expect_true(isS4(dt.s4)) expect_true(inherits(dt.s4, 'data.table')) ## pull out data from S4 as.list, and compare to list from dt dt.s4.list <- dt.s4@.Data names(dt.s4.list) <- names(dt.s4) expect_identical(dt.s4.list, as.list(dt), info="Underlying data not identical") }) test_that("simple S4 conversion-isms work", { df = data.frame(a=sample(letters, 10), b=1:10) dt = as.data.table(df) expect_equal(as(df, 'data.table'), dt) expect_identical(as(dt, 'data.frame'), df) }) test_that("data.table can be used in an S4 slot", { ## A class with a data.table slot dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) expect_equal(dt.comp@data, dt) }) test_that("S4 methods dispatch properly on data.table slots", { ## Make toy accessor functions and compare results against normal data.table ## access dt <- data.table(a=sample(letters[1:3], 10, replace=TRUE), score=rnorm(10)) dt.comp <- new("S4Composition", data=dt) setGeneric("dtGet", function(x, what) standardGeneric("dtGet")) setMethod("dtGet", c(x="S4Composition", what="missing"), function(x, what) { x@data }) setMethod("dtGet", c(x="S4Composition", what="ANY"), function(x, what) { x@data[[what]] }) expect_equal(dtGet(dt.comp), dt, label='actually') expect_identical(dtGet(dt.comp, 1), dt[[1]]) expect_identical(dtGet(dt.comp, 'b'), dt$b) }) 
data.table/tests/main.R0000644000175100001440000000066113172210047014500 0ustar hornikusersrequire(data.table) test.data.table() # runs the main test suite of 5,000+ tests in /inst/tests/tests.Rraw # Turn off verbose repeat to save time (particularly Travis, but also CRAN) : # test.data.table(verbose=TRUE) # Calling it again in the past revealed some memory bugs but also verbose mode checks the verbose messages run ok # TO DO: check we test each verbose message at least once, instead of a full repeat of all tests data.table/src/0000755000175100001440000000000013172212367013062 5ustar hornikusersdata.table/src/Makevars0000644000175100001440000000040413172212367014554 0ustar hornikusers PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) all: $(SHLIB) mv $(SHLIB) datatable$(SHLIB_EXT) if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi data.table/src/ijoin.c0000644000175100001440000006630513172212367014350 0ustar hornikusers#include "data.table.h" #include #include // TODO: implement 'lookup' for 'gaps' and 'overlaps' arguments SEXP lookup(SEXP ux, SEXP xlen, SEXP indices, SEXP gaps, SEXP overlaps, SEXP multArg, SEXP typeArg, SEXP verbose) { SEXP vv, tt, lookup, type_lookup; R_len_t i,j,k,*idx,*len1,*len2,xrows=INTEGER(xlen)[0],uxrows=LENGTH(VECTOR_ELT(ux, 0)),uxcols=LENGTH(ux); int *from = (int *)INTEGER(VECTOR_ELT(indices, 0)); int *to = (int *)INTEGER(VECTOR_ELT(indices, 1)); clock_t pass1, pass2, pass3, start; enum {ALL, FIRST, LAST} mult = ALL; enum {ANY, WITHIN, START, END, EQUAL} type = ANY; if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST; else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST; else error("Internal error: invalid value for 'mult'; this should have been caught before. 
Please report to datatable-help"); if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "any")) type = ANY; else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "within")) type = WITHIN; else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "start")) type = START; else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "end")) type = END; else if (!strcmp(CHAR(STRING_ELT(typeArg, 0)), "equal")) type = EQUAL; else error("Internal error: invalid value for 'type'; this should have been caught before. Please report to datatable-help"); // For reference: uxcols-1 = type_count, uxcols-2 = count, uxcols-3 = type_lookup, uxcols-4 = lookup // first pass: calculate lengths first start = clock(); len1 = (int *)INTEGER(VECTOR_ELT(ux, uxcols-2)); len2 = (int *)INTEGER(VECTOR_ELT(ux, uxcols-1)); switch (mult) { case FIRST: for (i=0; i 0 && len2[from[i]-1]) ? len2[from[i]-1] : 1; break; case EQUAL: for (i=0; i0) ? from[i] : 1; if (k == to[i]) { wlen = len1[k-1]; } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); while (j INTEGER(tmp2)[m] ) { break; } else ++j; } } totlen += wlen; if (len == totlen) ++totlen; } break; case ANY: for (i=0; i 0) ? from[i] : 1; k = from[i]; if (k<=to[i]) totlen += len1[k-1]; for (j=k+1; j<=to[i]; j++) totlen += len2[j-1]; if (len == totlen) ++totlen; } break; case WITHIN: for (i=0; i 0) { if (k == to[i]) { totlen += len1[k-1]; } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(lookup, to[i]-1); while (j INTEGER(tmp2)[m] ) { ++m; } else ++j; } } } if (len == totlen) ++totlen; } break; } } else totlen = rows; end1 = clock() - start; if (LOGICAL(verbose)[0]) Rprintf("First pass on calculating lengths in overlaps ... 
done in %8.3f seconds\n", 1.0*(end1)/CLOCKS_PER_SEC); // ans[0] is the the position of 'query' and ans[1] is that of 'subject' // allocate f1__ and f2__ and assign 'nomatch' to f2__ ans = PROTECT(allocVector(VECSXP, 2)); f1__ = allocVector(INTSXP, totlen); SET_VECTOR_ELT(ans, 0, f1__); f2__ = allocVector(INTSXP, totlen); SET_VECTOR_ELT(ans, 1, f2__); thislen=0; start = clock(); // switching mult=ALL,FIRST,LAST separately to // - enhance performance for special cases, and // - easy to fix any bugs in the future switch (mult) { case ALL: switch (type) { case START : case END : for (i=0; i 0) { k = from[i]; tmp2 = VECTOR_ELT(type_lookup, k-1); for (j=0; j 0 && to[i] > 0) { k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); for (j=0; j INTEGER(tmp2)[m] ) { ++m; } else ++j; } } } if (len == thislen) { INTEGER(f1__)[thislen] = i+1; INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case ANY : for (i=0; i0) ? from[i] : 1; k = from[i]; if (k<=to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); for (m=0; m 0) { if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); for (j=0; j INTEGER(tmp2)[m] ) { ++m; } else ++j; } } } if (len == thislen) { INTEGER(f1__)[thislen] = i+1; INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; } break; case FIRST: switch (type) { case START: case END: for (i=0; i0) ? from[i] : 1; if (k <= to[i]) { // len1[k-1] is equal to len2[k-1] and will always be >0, so no length check necessary. 
tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; ++thislen; } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case EQUAL : for (i=0; i 0 && to[i] > 0) { k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; ++thislen; } else if (k < to[i]) { j=0; m=0; tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); while (j INTEGER(tmp2)[m] ) { ++m; } else ++j; } } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case ANY: for (i=0; i0) ? from[i] : 1; k = from[i]; for (j=k; j<=to[i]; j++) { if (len2[j-1]) { tmp2 = VECTOR_ELT(type_lookup, j-1); INTEGER(f2__)[thislen] = INTEGER(tmp2)[0]; ++thislen; break; } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case WITHIN: for (i=0; i 0) { if (k == to[i] && len1[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[0]; ++thislen; } else if (k < to[i]) { j=0; m=0; tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(lookup, to[i]-1); while (j INTEGER(tmp2)[m] ) { ++m;; } else ++j; } } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; } break; case LAST: switch (type) { case START: case END: for (i=0; i0) ? from[i] : 1; if (k <= to[i]) { // len1[k-1] is equal to len2[k-1] and will always be >0, so no length check necessary. 
tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[len1[k-1]-1]; ++thislen; } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case EQUAL : for (i=0; i 0 && to[i] > 0) { k = from[i]; if (k == to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[len1[k-1]-1]; ++thislen; } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(type_lookup, to[i]-1); j=len1[k-1]-1; m=len2[k-1]-1; while (j>=0 && m>=0) { if ( INTEGER(tmp1)[j] == INTEGER(tmp2)[m] ) { INTEGER(f2__)[thislen] = INTEGER(tmp1)[j]; ++thislen; --j; --m; break; } else if ( INTEGER(tmp1)[j] < INTEGER(tmp2)[m] ) { --m; } else --j; } } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; // OLD logic for 'any,last' which had to check for maximum for each 'i'. Better logic below. // for 'first' we need to just get the minimum of first non-zero-length element, but not the same case for 'last'. // We've to loop over from[i]:to[i] and get maximum of all tmp2 values (each is of length 1 already conveniently set uo) in that range // case ANY: // for (i=0; i0) ? from[i] : 1; // k = from[i]; // for (j=k; j<=to[i]; j++) { // if (len2[j-1]) { // tmp2 = VECTOR_ELT(type_lookup, j-1); // INTEGER(f2__)[thislen] = (INTEGER(f2__)[thislen] < INTEGER(tmp2)[len2[j-1]-1]) ? INTEGER(tmp2)[len2[j-1]-1] : INTEGER(f2__)[thislen]; // } // } // if (INTEGER(f2__)[thislen] == 0) // INTEGER(f2__)[thislen] = nomatch; // ++thislen; // } // break; case ANY: for (i=0; i0) ? 
from[i] : 1; k = from[i]; if (k <= to[i]) { if (k==to[i] && len1[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[len1[k-1]-1]; ++thislen; } else { for (j=to[i]; j>k; j--) { if (len2[j-1]) { tmp2 = VECTOR_ELT(type_lookup, j-1); INTEGER(f2__)[thislen] = INTEGER(tmp2)[0]; // tmp2 will be length 1 ++thislen; break; } } if (len == thislen && len1[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[len1[k-1]-1]; ++thislen; } } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; case WITHIN: for (i=0; i 0) { if (k == to[i] && len1[k-1]) { tmp1 = VECTOR_ELT(lookup, k-1); INTEGER(f2__)[thislen] = INTEGER(tmp1)[len1[k-1]-1]; ++thislen; } else if (k < to[i]) { tmp1 = VECTOR_ELT(lookup, k-1); tmp2 = VECTOR_ELT(lookup, to[i]-1); j=len1[k-1]-1; m=len1[to[i]-1]-1; while (j>=0 && m>=0) { if ( INTEGER(tmp1)[j] == INTEGER(tmp2)[m] ) { INTEGER(f2__)[thislen] = INTEGER(tmp1)[j]; ++thislen; --j; --m; break; } else if ( INTEGER(tmp1)[j] < INTEGER(tmp2)[m] ) { --m; } else --j; } } } if (len == thislen) { INTEGER(f2__)[thislen] = nomatch; ++thislen; } } break; } break; } end2 = clock() - start; if (LOGICAL(verbose)[0]) Rprintf("Final step, fetching indices in overlaps ... done in %8.3f seconds\n", 1.0*(end2)/CLOCKS_PER_SEC); UNPROTECT(1); return(ans); } data.table/src/fastmean.c0000644000175100001440000001043313172212367015025 0ustar hornikusers#include "data.table.h" #include #include /* Copied from src/main/summary.c with the following changes : i) Rather than mean.default doing x <- x[!is.na(x)] which incurs several vector allocations and scans, we just skip NA in the for loop in C. ii) Overhead of repeated calls to S3 dispatch of mean to mean.default is avoided by calling C directly. See wiki point 3 for the large difference this makes. Ordinarily we prefer not to duplicate base R stats functions in case we introduce a bug or create additonal maintenance burden. 
But, mean() is so slow, and so much slower than sum(), and so commonly used and benchmarked, that it warrants this fast mean. We can't call .Internal(mean(x)) because that's disallowed by QC.R, but anyway .Internal(mean(x)) doesn't respect na.rm=TRUE. We are careful to retain the double scan that summary.c does that adjusts for accumalated rounding errors in floating point. We explicitly test that fastmean returns the same result as base::mean in test.data.table(). For that we use exact equality under floating point, not all.equal. A stronger test. These tests run under R CMD check and run on all CRAN platforms daily to catch if we become out of line to base R (say if base R changed its mean). */ SEXP fastmean(SEXP args) { long double s = 0., t = 0.; R_len_t i, l = 0, n = 0; SEXP x, ans, tmp; Rboolean narm=FALSE; x=CADR(args); if (length(args)>2) { tmp = CADDR(args); if (!isLogical(tmp) || LENGTH(tmp)!=1 || LOGICAL(tmp)[0]==NA_LOGICAL) error("narm should be TRUE or FALSE"); narm=LOGICAL(tmp)[0]; } PROTECT(ans = allocNAVector(REALSXP, 1)); if (!isInteger(x) && !isReal(x) && !isLogical(x)) { warning("argument is not numeric or logical: returning NA"); UNPROTECT(1); return(ans); } l = LENGTH(x); if (narm) { switch(TYPEOF(x)) { case LGLSXP: case INTSXP: for (i = 0; i0) REAL(ans)[0] = (double) (s/n); else REAL(ans)[0] = R_NaN; // consistent with base: mean(NA,na.rm=TRUE)==NaN==mean(numeric(),na.rm=TRUE) break; case REALSXP: for (i = 0; i // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); // generate from 1 to n (a simple fun for melt, vecseq is convenient from R due to SEXP inputs) SEXP seq_int(int n, int start) { SEXP ans = R_NilValue; int i; if (n <= 0) return(ans); PROTECT(ans = allocVector(INTSXP, n)); for (i=0; i= 0"); for (i=0; i length(vec)) error("concat: 'idx' must take values between 0 and length(vec); 0 <= idx <= length(vec)"); } PROTECT(v = allocVector(STRSXP, nidx > 5 ? 
5 : nidx)); for (i=0; i 5) SET_STRING_ELT(v, 4, mkChar("...")); PROTECT(t = s = allocList(3)); SET_TYPEOF(t, LANGSXP); SETCAR(t, install("paste")); t = CDR(t); SETCAR(t, v); t = CDR(t); SETCAR(t, mkString(", ")); SET_TAG(t, install("collapse")); UNPROTECT(2); // v, (t,s) return(eval(s, R_GlobalEnv)); } // deal with measure.vars of type VECSXP SEXP measurelist(SEXP measure, SEXP dtnames) { int i, n=length(measure), protecti=0; SEXP ans, tmp; ans = PROTECT(allocVector(VECSXP, n)); protecti++; for (i=0; i ncol) error("One or more values in 'id.vars' is invalid."); else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; u = 0; for (i=0; i ncol) error("One or more values in 'measure.vars' is invalid."); else if (!LOGICAL(booltmp)[i]) targetcols++; else continue; } unqtmp = PROTECT(allocVector(INTSXP, targetcols)); protecti++; u = 0; for (i=0; i ncol) error("One or more values in 'id.vars' is invalid."); } idcols = PROTECT(tmp); protecti++; switch(TYPEOF(measure)) { case STRSXP : tmp2 = PROTECT(chmatch(measure, dtnames, 0, FALSE)); protecti++; break; case REALSXP : tmp2 = PROTECT(coerceVector(measure, INTSXP)); protecti++; break; case INTSXP : tmp2 = measure; break; case VECSXP : tmp2 = PROTECT(measurelist(measure, dtnames)); protecti++; break; default : error("Unknown 'measure.vars' type %s, must be character or integer vector", type2char(TYPEOF(measure))); } tmp = tmp2; if (isNewList(measure)) { tmp = PROTECT(unlist_(tmp2)); protecti++; } for (i=0; i ncol) error("One or more values in 'measure.vars' is invalid."); } if (isNewList(measure)) valuecols = tmp2; else { valuecols = PROTECT(allocVector(VECSXP, 1)); protecti++; SET_VECTOR_ELT(valuecols, 0, tmp2); } } ans = PROTECT(allocVector(VECSXP, 2)); protecti++; SET_VECTOR_ELT(ans, 0, idcols); SET_VECTOR_ELT(ans, 1, valuecols); UNPROTECT(protecti); return(ans); } struct processData { SEXP idcols, valuecols, naidx; int lids, lvalues, lmax, lmin, protecti, 
totlen, nrow; int *isfactor, *leach, *isidentical; SEXPTYPE *maxtype; Rboolean narm; }; static void preprocess(SEXP DT, SEXP id, SEXP measure, SEXP varnames, SEXP valnames, Rboolean narm, Rboolean verbose, struct processData *data) { SEXP vars,tmp,thiscol; SEXPTYPE type; int i,j; data->lmax = 0; data->lmin = 0; data->protecti = 0, data->totlen = 0, data->nrow = length(VECTOR_ELT(DT, 0)); vars = checkVars(DT, id, measure, verbose); data->idcols = PROTECT(VECTOR_ELT(vars, 0)); data->protecti++; data->valuecols = PROTECT(VECTOR_ELT(vars, 1)); data->protecti++; data->lids = length(data->idcols); data->lvalues = length(data->valuecols); data->narm = narm; if (length(valnames) != data->lvalues) { UNPROTECT(data->protecti); if (isNewList(measure)) error("When 'measure.vars' is a list, 'value.name' must be a character vector of length =1 or =length(measure.vars)."); else error("When 'measure.vars' is either not specified or a character/integer vector, 'value.name' must be a character vector of length =1."); } if (length(varnames) != 1) error("'variable.name' must be a character/integer vector of length=1."); data->leach = (int *)R_alloc(data->lvalues, sizeof(int)); data->isidentical = (int *)R_alloc(data->lvalues, sizeof(int)); data->isfactor = (int *)R_alloc(data->lvalues, sizeof(int)); data->maxtype = (SEXPTYPE *)R_alloc(data->lvalues, sizeof(SEXPTYPE)); for (i=0; ilvalues; i++) { tmp = VECTOR_ELT(data->valuecols, i); data->leach[i] = length(tmp); data->isidentical[i] = 1; // TODO - why 1 and not Rboolean TRUE? data->isfactor[i] = 0; // seems to hold 2 below, so not an Rboolean FALSE here. TODO - better name for variable? data->maxtype[i] = 0; // R_alloc doesn't initialize so careful to here, relied on below data->lmax = (data->lmax > data->leach[i]) ? data->lmax : data->leach[i]; data->lmin = (data->lmin < data->leach[i]) ? 
data->lmin : data->leach[i]; for (j=0; jleach[i]; j++) { thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); if (isFactor(thiscol)) { data->isfactor[i] = (isOrdered(thiscol)) ? 2 : 1; data->maxtype[i] = STRSXP; } else { type = TYPEOF(thiscol); if (type > data->maxtype[i]) data->maxtype[i] = type; } } for (j=0; jleach[i]; j++) { thiscol = VECTOR_ELT(DT, INTEGER(tmp)[j]-1); if ( (!isFactor(thiscol) && data->maxtype[i] != TYPEOF(thiscol)) || (isFactor(thiscol) && data->maxtype[i] != STRSXP) ) { data->isidentical[i] = 0; break; } } } if (data->narm) { data->naidx = PROTECT(allocVector(VECSXP, data->lmax)); data->protecti++; } } SEXP getvaluecols(SEXP DT, SEXP dtnames, Rboolean valfactor, Rboolean verbose, struct processData *data) { int i, j, k, protecti=0, counter=0, thislen=0; SEXP tmp, seqcols, thiscol, thisvaluecols, target, ansvals, thisidx=R_NilValue, flevels, clevels; Rboolean coerced=FALSE, thisfac=FALSE, copyattr = FALSE, thisvalfactor; size_t size; for (i=0; ilvalues; i++) { thisvaluecols = VECTOR_ELT(data->valuecols, i); if (!data->isidentical[i]) warning("'measure.vars' [%s] are not all of the same type. By order of hierarchy, the molten data value column will be of type '%s'. All measure variables not of type '%s' will be coerced to. Check DETAILS in ?melt.data.table for more on coercion.\n", CHAR(STRING_ELT(concat(dtnames, thisvaluecols), 0)), type2char(data->maxtype[i]), type2char(data->maxtype[i])); if (data->maxtype[i] == VECSXP && data->narm) { if (verbose) Rprintf("The molten data value type is a list at item %d. 
'na.rm=TRUE' is ignored.\n", i+1); data->narm = FALSE; } } if (data->narm) { seqcols = PROTECT(seq_int(data->lvalues, 1)); protecti++; for (i=0; ilmax; i++) { tmp = PROTECT(allocVector(VECSXP, data->lvalues)); for (j=0; jlvalues; j++) { if (i < data->leach[j]) { thisvaluecols = VECTOR_ELT(data->valuecols, j); SET_VECTOR_ELT(tmp, j, VECTOR_ELT(DT, INTEGER(thisvaluecols)[i]-1)); } else { SET_VECTOR_ELT(tmp, j, allocNAVector(data->maxtype[j], data->nrow)); } } tmp = PROTECT(dt_na(tmp, seqcols)); SET_VECTOR_ELT(data->naidx, i, which(tmp, FALSE)); UNPROTECT(2); // tmp data->totlen += length(VECTOR_ELT(data->naidx, i)); } } else data->totlen = data->nrow * data->lmax; flevels = PROTECT(allocVector(VECSXP, data->lmax)); protecti++; Rboolean *isordered = (Rboolean *)R_alloc(data->lmax, sizeof(Rboolean)); ansvals = PROTECT(allocVector(VECSXP, data->lvalues)); protecti++; for (i=0; ilvalues; i++) { thisvalfactor = (data->maxtype[i] == VECSXP) ? FALSE : valfactor; target = allocVector(data->maxtype[i], data->totlen); SET_VECTOR_ELT(ansvals, i, target); thisvaluecols = VECTOR_ELT(data->valuecols, i); counter = 0; copyattr = FALSE; for (j=0; jlmax; j++) { thiscol = (j < data->leach[i]) ? 
VECTOR_ELT(DT, INTEGER(thisvaluecols)[j]-1) : allocNAVector(data->maxtype[i], data->nrow); if (!copyattr && data->isidentical[i] && !data->isfactor[i]) { copyMostAttrib(thiscol, target); copyattr = TRUE; } if (TYPEOF(thiscol) != TYPEOF(target) && (data->maxtype[i] == VECSXP || !isFactor(thiscol))) { thiscol = PROTECT(coerceVector(thiscol, TYPEOF(target))); coerced = TRUE; } if (data->narm) { thisidx = VECTOR_ELT(data->naidx, j); thislen = length(thisidx); } size = SIZEOF(thiscol); switch (TYPEOF(target)) { case VECSXP : if (data->narm) { for (k=0; knrow; k++) SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); } break; case STRSXP : if (data->isfactor[i]) { if (isFactor(thiscol)) { SET_VECTOR_ELT(flevels, j, getAttrib(thiscol, R_LevelsSymbol)); thiscol = PROTECT(asCharacterFactor(thiscol)); thisfac = TRUE; isordered[j] = isOrdered(thiscol); } else SET_VECTOR_ELT(flevels, j, thiscol); } if (data->narm) { for (k=0; knrow; k++) SET_STRING_ELT(target, j*data->nrow + k, STRING_ELT(thiscol, k)); } break; case REALSXP : if (data->narm) { for (k=0; knrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; case INTSXP : if (data->narm) { for (k=0; knrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; case LGLSXP : if (data->narm) { for (k=0; knrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; default : error("Unknown column type '%s' for column '%s'.", type2char(TYPEOF(thiscol)), CHAR(STRING_ELT(dtnames, INTEGER(thisvaluecols)[i]-1))); } if (data->narm) counter += thislen; if (coerced) { UNPROTECT(1); coerced = FALSE; } if (thisfac) { UNPROTECT(1); thisfac = FALSE; } } if (thisvalfactor && data->isfactor[i] && TYPEOF(target) != VECSXP) { clevels = combineFactorLevels(flevels, &(data->isfactor[i]), isordered); SEXP factorLangSxp = PROTECT(lang3(install(data->isfactor[i] == 1 ? 
"factor" : "ordered"), target, clevels)); SET_VECTOR_ELT(ansvals, i, eval(factorLangSxp, R_GlobalEnv)); UNPROTECT(2); // clevels, factorLangSxp } } UNPROTECT(protecti); return(ansvals); } SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, struct processData *data) { int i,j,k,cnt=0,nrows=0, nlevels=0, protecti=0, thislen, zerolen=0; SEXP ansvars, thisvaluecols, levels, target, matchvals, thisnames; ansvars = PROTECT(allocVector(VECSXP, 1)); protecti++; target = allocVector(INTSXP, data->totlen); SET_VECTOR_ELT(ansvars, 0, target); if (data->lvalues == 1) { thisvaluecols = VECTOR_ELT(data->valuecols, 0); // tmp fix for #1055 thisnames = PROTECT(allocVector(STRSXP, length(thisvaluecols))); for (i=0; inarm) { for (j=0; jlmax; j++) { thislen = length(VECTOR_ELT(data->naidx, j)); for (k=0; klmax - zerolen; } else { for (j=0; jlmax; j++) { for (k=0; knrow; k++) INTEGER(target)[data->nrow*j + k] = INTEGER(matchvals)[j]; } nlevels = data->lmax; } UNPROTECT(2); // matchvals, thisnames } else { if (data->narm) { for (j=0; jlmax; j++) { thislen = length(VECTOR_ELT(data->naidx, j)); for (k=0; klmax; j++) { for (k=0; knrow; k++) INTEGER(target)[data->nrow*j + k] = j+1; } nlevels = data->lmax; } } setAttrib(target, R_ClassSymbol, mkString("factor")); cnt = 0; if (data->lvalues == 1) { levels = PROTECT(allocVector(STRSXP, nlevels)); thisvaluecols = VECTOR_ELT(data->valuecols, 0); // levels will be column names for (i=0; ilmax; i++) { if (data->narm) { if (length(VECTOR_ELT(data->naidx, i)) == 0) continue; } SET_STRING_ELT(levels, cnt++, STRING_ELT(dtnames, INTEGER(thisvaluecols)[i]-1)); } } else levels = PROTECT(coerceVector(seq_int(nlevels, 1), STRSXP)); // generate levels = 1:nlevels // base::unique is fast on vectors, and the levels on variable columns are usually small SEXP uniqueLangSxp = PROTECT(lang2(install("unique"), levels)); setAttrib(target, R_LevelsSymbol, eval(uniqueLangSxp, R_GlobalEnv)); UNPROTECT(2); // levels, uniqueLangSxp if 
(!varfactor) SET_VECTOR_ELT(ansvars, 0, asCharacterFactor(target)); UNPROTECT(protecti); return(ansvars); } SEXP getidcols(SEXP DT, SEXP dtnames, Rboolean verbose, struct processData *data) { int i,j,k, counter=0, thislen; SEXP ansids, thiscol, target, thisidx; size_t size; ansids = PROTECT(allocVector(VECSXP, data->lids)); for (i=0; ilids; i++) { counter = 0; thiscol = VECTOR_ELT(DT, INTEGER(data->idcols)[i]-1); size = SIZEOF(thiscol); target = allocVector(TYPEOF(thiscol), data->totlen); SET_VECTOR_ELT(ansids, i, target); copyMostAttrib(thiscol, target); // all but names,dim and dimnames. And if so, we want a copy here, not keepattr's SET_ATTRIB. switch(TYPEOF(thiscol)) { case REALSXP : if (data->narm) { for (j=0; jlmax; j++) { thisidx = VECTOR_ELT(data->naidx, j); thislen = length(thisidx); for (k=0; klmax; j++) memcpy((char *)DATAPTR(target)+j*data->nrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; case INTSXP : if (data->narm) { for (j=0; jlmax; j++) { thisidx = VECTOR_ELT(data->naidx, j); thislen = length(thisidx); for (k=0; klmax; j++) memcpy((char *)DATAPTR(target)+j*data->nrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; case LGLSXP : if (data->narm) { for (j=0; jlmax; j++) { thisidx = VECTOR_ELT(data->naidx, j); thislen = length(thisidx); for (k=0; klmax; j++) memcpy((char *)DATAPTR(target)+j*data->nrow*size, (char *)DATAPTR(thiscol), data->nrow*size); } break; case STRSXP : if (data->narm) { for (j=0; jlmax; j++) { thisidx = VECTOR_ELT(data->naidx, j); thislen = length(thisidx); for (k=0; k0, WHY? 
// From assign.c's memcrecycle - only one SET_STRING_ELT per RHS item is needed to set generations (overhead) for (k=0; knrow; k++) SET_STRING_ELT(target, k, STRING_ELT(thiscol, k)); for (j=1; jlmax; j++) memcpy((char *)DATAPTR(target)+j*data->nrow*size, (char *)DATAPTR(target), data->nrow*size); } break; case VECSXP : for (j=0; jlmax; j++) { for (k=0; knrow; k++) { SET_VECTOR_ELT(target, j*data->nrow + k, VECTOR_ELT(thiscol, k)); } } break; default : error("Unknown column type '%s' for column '%s' in 'data'", type2char(TYPEOF(thiscol)), CHAR(STRING_ELT(dtnames, INTEGER(data->idcols)[i]-1))); } } UNPROTECT(1); return (ansids); } SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP varnames, SEXP valnames, SEXP narmArg, SEXP verboseArg) { int i, ncol, protecti=0; SEXP dtnames, ansvals, ansvars, ansids, ansnames, ans; Rboolean narm=FALSE, verbose=FALSE; struct processData data; if (!isNewList(DT)) error("Input is not of type VECSXP, expected a data.table, data.frame or list"); if (!isLogical(valfactor)) error("Argument 'value.factor' should be logical TRUE/FALSE"); if (!isLogical(varfactor)) error("Argument 'variable.factor' should be logical TRUE/FALSE"); if (!isLogical(narmArg)) error("Argument 'na.rm' should be logical TRUE/FALSE."); if (!isString(varnames)) error("Argument 'variable.name' must be a character vector"); if (!isString(valnames)) error("Argument 'value.name' must be a character vector"); if (!isLogical(verboseArg)) error("Argument 'verbose' should be logical TRUE/FALSE"); ncol = LENGTH(DT); if (!ncol) { if (verbose) Rprintf("ncol(data) is 0. Nothing to melt. Returning original data.table."); return(DT); } dtnames = getAttrib(DT, R_NamesSymbol); if (isNull(dtnames)) error("names(data) is NULL. 
Please report to data.table-help"); if (LOGICAL(narmArg)[0] == TRUE) narm = TRUE; if (LOGICAL(verboseArg)[0] == TRUE) verbose = TRUE; preprocess(DT, id, measure, varnames, valnames, narm, verbose, &data); protecti = data.protecti; // edge case no measure.vars if (!data.lmax) { ans = shallowwrapper(DT, data.idcols); ans = PROTECT(duplicate(ans)); protecti++; } else { ansvals = PROTECT(getvaluecols(DT, dtnames, LOGICAL(valfactor)[0], verbose, &data)); protecti++; ansvars = PROTECT(getvarcols(DT, dtnames, LOGICAL(varfactor)[0], verbose, &data)); protecti++; ansids = PROTECT(getidcols(DT, dtnames, verbose, &data)); protecti++; // populate 'ans' ans = allocVector(VECSXP, data.lids+1+data.lvalues); // 1 is for variable column for (i=0; i ull if (n<2) return; for (int i=1; i=0 && xtmp> fromBit & mask]++; tmp++; } int last = (*(unsigned long long *)--tmp - minULL) >> fromBit & mask; if (counts[last] == n) { // Single value for these bits here. All counted in one bucket which must be the bucket for the last item. counts[last] = 0; // clear ready for reuse. All other counts must be zero already so save time by not setting to 0. if (fromBit > 0) // move on to next bits (if any remain) to resolve dradix_r(in, working, n, fromBit<8 ? 0 : fromBit-8, toBit-8, counts+256); return; } R_xlen_t cumSum=0; for (R_xlen_t i=0; cumSum> fromBit & mask; working[ counts[thisx]++ ] = *tmp; tmp++; } memcpy(in, working, n*sizeof(double)); if (fromBit==0) { // nothing left to do other than reset the counts to 0, ready for next recursion // the final bucket must contain n and it might be close to the start. After that must be all 0 so no need to reset. // Also this way, we don't need to know how big thisCounts is and therefore no possibility of getting that wrong. // wasteful thisCounts[i]=0 even when already 0 is better than a branch. We are highly recursive at this point // so avoiding memset() is known to be worth it. 
for (int i=0; counts[i]0 if the element a goes after the element b // doesn't master if stable or not R_xlen_t x = qsort_data[*(int *)a]; R_xlen_t y = qsort_data[*(int *)b]; // return x-y; would like this, but this is long and the cast to int return may not preserve sign // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count // is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts. return (xy); // largest first in a safe branchless way casting long to int } SEXP fsort(SEXP x, SEXP verboseArg) { if (!isLogical(verboseArg) || LENGTH(verboseArg)!=1 || LOGICAL(verboseArg)[0]==NA_LOGICAL) error("verbose must be TRUE or FALSE"); Rboolean verbose = LOGICAL(verboseArg)[0]; if (!isNumeric(x)) error("x must be a vector of type 'double' currently"); // TODO: not only detect if already sorted, but if it is, just return x to save the duplicate SEXP ansVec = PROTECT(allocVector(REALSXP, xlength(x))); double *ans = REAL(ansVec); // allocate early in case fails if not enough RAM // TODO: document this is much cheaper than a copy followed by in-place. int nth = getDTthreads(); int nBatch=nth*2; // at least nth; more to reduce last-man-home; but not too large to keep counts small in cache if (verbose) Rprintf("nth=%d, nBatch=%d\n",nth,nBatch); R_xlen_t batchSize = (xlength(x)-1)/nBatch + 1; if (batchSize < 1024) batchSize = 1024; // simple attempt to work reasonably for short vector. 
1024*8 = 2 4kb pages nBatch = (xlength(x)-1)/batchSize + 1; R_xlen_t lastBatchSize = xlength(x) - (nBatch-1)*batchSize; // could be that lastBatchSize == batchSize when i) xlength(x) is multiple of nBatch // and ii) for small vectors with just one batch double mins[nBatch], maxs[nBatch]; #pragma omp parallel for schedule(dynamic) num_threads(nth) for (int batch=0; batchmyMax) myMax=*d; d++; } mins[batch] = myMin; maxs[batch] = myMax; } double min=mins[0], max=maxs[0]; for (int i=1; imax) max=maxs[i]; } if (verbose) Rprintf("Range = [%g,%g]\n", min, max); if (min < 0.0) error("Cannot yet handle negatives."); // TODO: -0ULL should allow negatives // avoid twiddle function call as expensive in recent tests (0.34 vs 2.7) // possibly twiddle once to *ans, then untwiddle at the end in a fast parallel sweep u.d = max; unsigned long long maxULL = u.ull; u.d = min; minULL = u.ull; // set static global for use by dradix_r int maxBit = floor(log(maxULL-minULL) / log(2)); // 0 is the least significant bit int MSBNbits = maxBit > 15 ? 16 : maxBit+1; // how many bits make up the MSB int shift = maxBit + 1 - MSBNbits; // the right shift to leave the MSB bits remaining int MSBsize = 1< 65,536) if (verbose) Rprintf("maxBit=%d; MSBNbits=%d; shift=%d; MSBsize=%d\n", maxBit, MSBNbits, shift, MSBsize); R_xlen_t *counts = calloc(nBatch*(size_t)MSBsize, sizeof(R_xlen_t)); if (counts==NULL) error("Unable to allocate working memory"); // provided MSBsize>=9, each batch is a multiple of at least one 4k page, so no page overlap // TODO: change all calloc, malloc and free to Calloc and Free to be robust to error() and catch ooms. 
if (verbose) Rprintf("counts is %dMB (%d pages per nBatch=%d, batchSize=%lld, lastBatchSize=%lld)\n", nBatch*MSBsize*sizeof(R_xlen_t)/(1024*1024), nBatch*MSBsize*sizeof(R_xlen_t)/(4*1024*nBatch), nBatch, batchSize, lastBatchSize); #pragma omp parallel for num_threads(nth) for (int batch=0; batch> shift]++; tmp++; } } // cumulate columnwise; parallel histogram; small so no need to parallelize R_xlen_t rollSum=0; for (int msb=0; msb> shift]++ ] = *source; // This assignment to ans is not random access as it may seem, but cache efficient by // design since target pages are written to contiguously. MSBsize * 4k < cache. // TODO: therefore 16 bit MSB seems too big for this step. Time this step and reduce 16 a lot. // 20MB cache / nth / 4k => MSBsize=160 source++; } } // Done with batches now. Will not use batch dimension again. // TODO: add a timing point up to here if (shift > 0) { // otherwise, no more bits left to resolve ties and we're done int toBit = shift-1; int fromBit = toBit>7 ? toBit-7 : 0; // sort bins by size, largest first to minimise last-man-home R_xlen_t *msbCounts = counts + (nBatch-1)*(size_t)MSBsize; // msbCounts currently contains the ending position of each MSB (the starting location of the next) even across empty if (msbCounts[MSBsize-1] != xlength(x)) error("Internal error: counts[nBatch-1][MSBsize-1] != length(x)"); R_xlen_t *msbFrom = malloc(MSBsize*sizeof(R_xlen_t)); int *order = malloc(MSBsize*sizeof(int)); R_xlen_t cumSum = 0; for (int i=0; i0 && msbCounts[order[MSBsize-1]] < 2) MSBsize--; if (verbose) { Rprintf("%d by excluding 0 and 1 counts\n", MSBsize); } #pragma omp parallel num_threads(getDTthreads()) { R_xlen_t *counts = calloc((toBit/8 + 1)*256, sizeof(R_xlen_t)); // each thread has its own (small) stack of counts // don't use VLAs here: perhaps too big for stack yes but more that VLAs apparently fail with schedule(dynamic) double *working=NULL; // the working memory (for the largest groups) is allocated the first time the thread is 
assigned to // an iteration. #pragma omp for schedule(dynamic,1) // All we assume here is that a thread can never be assigned to an earlier iteration; i.e. threads 0:(nth-1) // get iterations 0:(nth-1) possibly out of order, then first-come-first-served in order after that. // If a thread deals with an msb lower than the first one it dealt with, then its *working will be too small. for (int msb=0; msb 65,536) that the largest MSB should be // relatively small anyway (n/65,536 if uniformly distributed). // For msb>=nth, that thread's *working will already be big // enough because the smallest *working (for thread nth-1) is big enough for all iterations following. // Progressively, less and less of the working will be needed by the thread (just the first thisN will be // used) and the unused pages will simply not be cached. // TODO: Calloc isn't thread-safe. But this deep malloc should be ok here as no possible error() points // before free. Just need to add the check and exit thread safely somehow. if (thisN <= INSERT_THRESH) { dinsert(ans+from, thisN); } else { dradix_r(ans+from, working, thisN, fromBit, toBit, counts); } } free(counts); free(working); } free(msbFrom); free(order); } free(counts); // TODO: parallel sweep to check sorted using <= on original input. Feasible that twiddling messed up. // After a few years of heavy use remove this check for speed, and move into unit tests. // It's a perfectly contiguous and cache efficient parallel scan so should be relatively negligible. UNPROTECT(1); return(ansVec); } data.table/src/fwriteLookups.h0000644000175100001440000031157513172212367016124 0ustar hornikusers// // Generated by fwrite.c:genLookups() // // 3 vectors: sigparts, expsig and exppow // Includes precision higher than double; leave this compiler on this machine // to parse the literals at reduced precision. 
// 2^(-1023:1024) is held more accurately than double provides by storing its // exponent separately (expsig and exppow) // We don't want to depend on 'long double' (>64bit) availability to generate // these at runtime; libraries and hardware vary. // These small lookup tables are used for speed. // double sigparts[53] = { 0.0, 5.0000000000000000000000000000000000000000e-01, 2.5000000000000000000000000000000000000000e-01, 1.2500000000000000000000000000000000000000e-01, 6.2500000000000000000000000000000000000000e-02, 3.1250000000000000000000000000000000000000e-02, 1.5625000000000000000000000000000000000000e-02, 7.8125000000000000000000000000000000000000e-03, 3.9062500000000000000000000000000000000000e-03, 1.9531250000000000000000000000000000000000e-03, 9.7656250000000000000000000000000000000000e-04, 4.8828125000000000000000000000000000000000e-04, 2.4414062500000000000000000000000000000000e-04, 1.2207031250000000000000000000000000000000e-04, 6.1035156250000000000000000000000000000000e-05, 3.0517578125000000000000000000000000000000e-05, 1.5258789062500000000000000000000000000000e-05, 7.6293945312500000000000000000000000000000e-06, 3.8146972656250000000000000000000000000000e-06, 1.9073486328125000000000000000000000000000e-06, 9.5367431640625000000000000000000000000000e-07, 4.7683715820312500000000000000000000000000e-07, 2.3841857910156250000000000000000000000000e-07, 1.1920928955078125000000000000000000000000e-07, 5.9604644775390625000000000000000000000000e-08, 2.9802322387695312500000000000000000000000e-08, 1.4901161193847656250000000000000000000000e-08, 7.4505805969238281250000000000000000000000e-09, 3.7252902984619140625000000000000000000000e-09, 1.8626451492309570312500000000000000000000e-09, 9.3132257461547851562500000000000000000000e-10, 4.6566128730773925781250000000000000000000e-10, 2.3283064365386962890625000000000000000000e-10, 1.1641532182693481445312500000000000000000e-10, 5.8207660913467407226562500000000000000000e-11, 
2.9103830456733703613281250000000000000000e-11, 1.4551915228366851806640625000000000000000e-11, 7.2759576141834259033203125000000000000000e-12, 3.6379788070917129516601562500000000000000e-12, 1.8189894035458564758300781250000000000000e-12, 9.0949470177292823791503906250000000000000e-13, 4.5474735088646411895751953125000000000000e-13, 2.2737367544323205947875976562500000000000e-13, 1.1368683772161602973937988281250000000000e-13, 5.6843418860808014869689941406250000000000e-14, 2.8421709430404007434844970703125000000000e-14, 1.4210854715202003717422485351562500000000e-14, 7.1054273576010018587112426757812500000000e-15, 3.5527136788005009293556213378906250000000e-15, 1.7763568394002504646778106689453125000000e-15, 8.8817841970012523233890533447265625000000e-16, 4.4408920985006261616945266723632812500000e-16, 2.2204460492503130808472633361816406250000e-16 }; double expsig[2048] = { 1.1125369292536006915451163586662020321096, 2.2250738585072013830902327173324040642192, 4.4501477170144027661804654346648081284384, 8.9002954340288055323609308693296162568769, 1.7800590868057611064721861738659232513754, 3.5601181736115222129443723477318465027507, 7.1202363472230444258887446954636930055015, 1.4240472694446088851777489390927386011003, 2.8480945388892177703554978781854772022006, 5.6961890777784355407109957563709544044012, 1.1392378155556871081421991512741908808802, 2.2784756311113742162843983025483817617605, 4.5569512622227484325687966050967635235210, 9.1139025244454968651375932101935270470419, 1.8227805048890993730275186420387054094084, 3.6455610097781987460550372840774108188168, 7.2911220195563974921100745681548216376335, 1.4582244039112794984220149136309643275267, 2.9164488078225589968440298272619286550534, 5.8328976156451179936880596545238573101068, 1.1665795231290235987376119309047714620214, 2.3331590462580471974752238618095429240427, 4.6663180925160943949504477236190858480855, 9.3326361850321887899008954472381716961709, 1.8665272370064377579801790894476343392342, 
3.7330544740128755159603581788952686784684, 7.4661089480257510319207163577905373569367, 1.4932217896051502063841432715581074713873, 2.9864435792103004127682865431162149427747, 5.9728871584206008255365730862324298855494, 1.1945774316841201651073146172464859771099, 2.3891548633682403302146292344929719542198, 4.7783097267364806604292584689859439084395, 9.5566194534729613208585169379718878168790, 1.9113238906945922641717033875943775633758, 3.8226477813891845283434067751887551267516, 7.6452955627783690566868135503775102535032, 1.5290591125556738113373627100755020507006, 3.0581182251113476226747254201510041014013, 6.1162364502226952453494508403020082028026, 1.2232472900445390490698901680604016405605, 2.4464945800890780981397803361208032811210, 4.8929891601781561962795606722416065622421, 9.7859783203563123925591213444832131244841, 1.9571956640712624785118242688966426248968, 3.9143913281425249570236485377932852497936, 7.8287826562850499140472970755865704995873, 1.5657565312570099828094594151173140999175, 3.1315130625140199656189188302346281998349, 6.2630261250280399312378376604692563996698, 1.2526052250056079862475675320938512799340, 2.5052104500112159724951350641877025598679, 5.0104209000224319449902701283754051197359, 1.0020841800044863889980540256750810239472, 2.0041683600089727779961080513501620478943, 4.0083367200179455559922161027003240957887, 8.0166734400358911119844322054006481915774, 1.6033346880071782223968864410801296383155, 3.2066693760143564447937728821602592766310, 6.4133387520287128895875457643205185532619, 1.2826677504057425779175091528641037106524, 2.5653355008114851558350183057282074213048, 5.1306710016229703116700366114564148426095, 1.0261342003245940623340073222912829685219, 2.0522684006491881246680146445825659370438, 4.1045368012983762493360292891651318740876, 8.2090736025967524986720585783302637481752, 1.6418147205193504997344117156660527496350, 3.2836294410387009994688234313321054992701, 6.5672588820774019989376468626642109985402, 
1.3134517764154803997875293725328421997080, 2.6269035528309607995750587450656843994161, 5.2538071056619215991501174901313687988322, 1.0507614211323843198300234980262737597664, 2.1015228422647686396600469960525475195329, 4.2030456845295372793200939921050950390657, 8.4060913690590745586401879842101900781314, 1.6812182738118149117280375968420380156263, 3.3624365476236298234560751936840760312526, 6.7248730952472596469121503873681520625052, 1.3449746190494519293824300774736304125010, 2.6899492380989038587648601549472608250021, 5.3798984761978077175297203098945216500041, 1.0759796952395615435059440619789043300008, 2.1519593904791230870118881239578086600017, 4.3039187809582461740237762479156173200033, 8.6078375619164923480475524958312346400066, 1.7215675123832984696095104991662469280013, 3.4431350247665969392190209983324938560026, 6.8862700495331938784380419966649877120053, 1.3772540099066387756876083993329975424011, 2.7545080198132775513752167986659950848021, 5.5090160396265551027504335973319901696042, 1.1018032079253110205500867194663980339208, 2.2036064158506220411001734389327960678417, 4.4072128317012440822003468778655921356834, 8.8144256634024881644006937557311842713668, 1.7628851326804976328801387511462368542734, 3.5257702653609952657602775022924737085467, 7.0515405307219905315205550045849474170934, 1.4103081061443981063041110009169894834187, 2.8206162122887962126082220018339789668374, 5.6412324245775924252164440036679579336747, 1.1282464849155184850432888007335915867349, 2.2564929698310369700865776014671831734699, 4.5129859396620739401731552029343663469398, 9.0259718793241478803463104058687326938796, 1.8051943758648295760692620811737465387759, 3.6103887517296591521385241623474930775518, 7.2207775034593183042770483246949861551037, 1.4441555006918636608554096649389972310207, 2.8883110013837273217108193298779944620415, 5.7766220027674546434216386597559889240829, 1.1553244005534909286843277319511977848166, 2.3106488011069818573686554639023955696332, 
4.6212976022139637147373109278047911392663, 9.2425952044279274294746218556095822785327, 1.8485190408855854858949243711219164557065, 3.6970380817711709717898487422438329114131, 7.3940761635423419435796974844876658228261, 1.4788152327084683887159394968975331645652, 2.9576304654169367774318789937950663291305, 5.9152609308338735548637579875901326582609, 1.1830521861667747109727515975180265316522, 2.3661043723335494219455031950360530633044, 4.7322087446670988438910063900721061266087, 9.4644174893341976877820127801442122532175, 1.8928834978668395375564025560288424506435, 3.7857669957336790751128051120576849012870, 7.5715339914673581502256102241153698025740, 1.5143067982934716300451220448230739605148, 3.0286135965869432600902440896461479210296, 6.0572271931738865201804881792922958420592, 1.2114454386347773040360976358584591684118, 2.4228908772695546080721952717169183368237, 4.8457817545391092161443905434338366736473, 9.6915635090782184322887810868676733472947, 1.9383127018156436864577562173735346694589, 3.8766254036312873729155124347470693389179, 7.7532508072625747458310248694941386778357, 1.5506501614525149491662049738988277355671, 3.1013003229050298983324099477976554711343, 6.2026006458100597966648198955953109422686, 1.2405201291620119593329639791190621884537, 2.4810402583240239186659279582381243769074, 4.9620805166480478373318559164762487538149, 9.9241610332960956746637118329524975076297, 1.9848322066592191349327423665904995015259, 3.9696644133184382698654847331809990030519, 7.9393288266368765397309694663619980061038, 1.5878657653273753079461938932723996012208, 3.1757315306547506158923877865447992024415, 6.3514630613095012317847755730895984048830, 1.2702926122619002463569551146179196809766, 2.5405852245238004927139102292358393619532, 5.0811704490476009854278204584716787239064, 1.0162340898095201970855640916943357447813, 2.0324681796190403941711281833886714895626, 4.0649363592380807883422563667773429791251, 8.1298727184761615766845127335546859582503, 
1.6259745436952323153369025467109371916501, 3.2519490873904646306738050934218743833001, 6.5038981747809292613476101868437487666002, 1.3007796349561858522695220373687497533200, 2.6015592699123717045390440747374995066401, 5.2031185398247434090780881494749990132802, 1.0406237079649486818156176298949998026560, 2.0812474159298973636312352597899996053121, 4.1624948318597947272624705195799992106241, 8.3249896637195894545249410391599984212483, 1.6649979327439178909049882078319996842497, 3.3299958654878357818099764156639993684993, 6.6599917309756715636199528313279987369986, 1.3319983461951343127239905662655997473997, 2.6639966923902686254479811325311994947995, 5.3279933847805372508959622650623989895989, 1.0655986769561074501791924530124797979198, 2.1311973539122149003583849060249595958396, 4.2623947078244298007167698120499191916791, 8.5247894156488596014335396240998383833583, 1.7049578831297719202867079248199676766717, 3.4099157662595438405734158496399353533433, 6.8198315325190876811468316992798707066866, 1.3639663065038175362293663398559741413373, 2.7279326130076350724587326797119482826746, 5.4558652260152701449174653594238965653493, 1.0911730452030540289834930718847793130699, 2.1823460904061080579669861437695586261397, 4.3646921808122161159339722875391172522794, 8.7293843616244322318679445750782345045589, 1.7458768723248864463735889150156469009118, 3.4917537446497728927471778300312938018235, 6.9835074892995457854943556600625876036471, 1.3967014978599091570988711320125175207294, 2.7934029957198183141977422640250350414588, 5.5868059914396366283954845280500700829177, 1.1173611982879273256790969056100140165835, 2.2347223965758546513581938112200280331671, 4.4694447931517093027163876224400560663341, 8.9388895863034186054327752448801121326683, 1.7877779172606837210865550489760224265337, 3.5755558345213674421731100979520448530673, 7.1511116690427348843462201959040897061346, 1.4302223338085469768692440391808179412269, 2.8604446676170939537384880783616358824538, 
5.7208893352341879074769761567232717649077, 1.1441778670468375814953952313446543529815, 2.2883557340936751629907904626893087059631, 4.5767114681873503259815809253786174119262, 9.1534229363747006519631618507572348238523, 1.8306845872749401303926323701514469647705, 3.6613691745498802607852647403028939295409, 7.3227383490997605215705294806057878590818, 1.4645476698199521043141058961211575718164, 2.9290953396399042086282117922423151436327, 5.8581906792798084172564235844846302872655, 1.1716381358559616834512847168969260574531, 2.3432762717119233669025694337938521149062, 4.6865525434238467338051388675877042298124, 9.3731050868476934676102777351754084596248, 1.8746210173695386935220555470350816919250, 3.7492420347390773870441110940701633838499, 7.4984840694781547740882221881403267676998, 1.4996968138956309548176444376280653535400, 2.9993936277912619096352888752561307070799, 5.9987872555825238192705777505122614141598, 1.1997574511165047638541155501024522828320, 2.3995149022330095277082311002049045656639, 4.7990298044660190554164622004098091313279, 9.5980596089320381108329244008196182626558, 1.9196119217864076221665848801639236525312, 3.8392238435728152443331697603278473050623, 7.6784476871456304886663395206556946101246, 1.5356895374291260977332679041311389220249, 3.0713790748582521954665358082622778440498, 6.1427581497165043909330716165245556880997, 1.2285516299433008781866143233049111376199, 2.4571032598866017563732286466098222752399, 4.9142065197732035127464572932196445504797, 9.8284130395464070254929145864392891009595, 1.9656826079092814050985829172878578201919, 3.9313652158185628101971658345757156403838, 7.8627304316371256203943316691514312807676, 1.5725460863274251240788663338302862561535, 3.1450921726548502481577326676605725123070, 6.2901843453097004963154653353211450246141, 1.2580368690619400992630930670642290049228, 2.5160737381238801985261861341284580098456, 5.0321474762477603970523722682569160196913, 1.0064294952495520794104744536513832039383, 
2.0128589904991041588209489073027664078765, 4.0257179809982083176418978146055328157530, 8.0514359619964166352837956292110656315060, 1.6102871923992833270567591258422131263012, 3.2205743847985666541135182516844262526024, 6.4411487695971333082270365033688525052048, 1.2882297539194266616454073006737705010410, 2.5764595078388533232908146013475410020819, 5.1529190156777066465816292026950820041639, 1.0305838031355413293163258405390164008328, 2.0611676062710826586326516810780328016655, 4.1223352125421653172653033621560656033311, 8.2446704250843306345306067243121312066622, 1.6489340850168661269061213448624262413324, 3.2978681700337322538122426897248524826649, 6.5957363400674645076244853794497049653297, 1.3191472680134929015248970758899409930659, 2.6382945360269858030497941517798819861319, 5.2765890720539716060995883035597639722638, 1.0553178144107943212199176607119527944528, 2.1106356288215886424398353214239055889055, 4.2212712576431772848796706428478111778110, 8.4425425152863545697593412856956223556221, 1.6885085030572709139518682571391244711244, 3.3770170061145418279037365142782489422488, 6.7540340122290836558074730285564978844976, 1.3508068024458167311614946057112995768995, 2.7016136048916334623229892114225991537991, 5.4032272097832669246459784228451983075981, 1.0806454419566533849291956845690396615196, 2.1612908839133067698583913691380793230392, 4.3225817678266135397167827382761586460785, 8.6451635356532270794335654765523172921570, 1.7290327071306454158867130953104634584314, 3.4580654142612908317734261906209269168628, 6.9161308285225816635468523812418538337256, 1.3832261657045163327093704762483707667451, 2.7664523314090326654187409524967415334902, 5.5329046628180653308374819049934830669805, 1.1065809325636130661674963809986966133961, 2.2131618651272261323349927619973932267922, 4.4263237302544522646699855239947864535844, 8.8526474605089045293399710479895729071687, 1.7705294921017809058679942095979145814337, 3.5410589842035618117359884191958291628675, 
7.0821179684071236234719768383916583257350, 1.4164235936814247246943953676783316651470, 2.8328471873628494493887907353566633302940, 5.6656943747256988987775814707133266605880, 1.1331388749451397797555162941426653321176, 2.2662777498902795595110325882853306642352, 4.5325554997805591190220651765706613284704, 9.0651109995611182380441303531413226569408, 1.8130221999122236476088260706282645313882, 3.6260443998244472952176521412565290627763, 7.2520887996488945904353042825130581255526, 1.4504177599297789180870608565026116251105, 2.9008355198595578361741217130052232502211, 5.8016710397191156723482434260104465004421, 1.1603342079438231344696486852020893000884, 2.3206684158876462689392973704041786001768, 4.6413368317752925378785947408083572003537, 9.2826736635505850757571894816167144007074, 1.8565347327101170151514378963233428801415, 3.7130694654202340303028757926466857602830, 7.4261389308404680606057515852933715205659, 1.4852277861680936121211503170586743041132, 2.9704555723361872242423006341173486082264, 5.9409111446723744484846012682346972164527, 1.1881822289344748896969202536469394432905, 2.3763644578689497793938405072938788865811, 4.7527289157378995587876810145877577731622, 9.5054578314757991175753620291755155463244, 1.9010915662951598235150724058351031092649, 3.8021831325903196470301448116702062185297, 7.6043662651806392940602896233404124370595, 1.5208732530361278588120579246680824874119, 3.0417465060722557176241158493361649748238, 6.0834930121445114352482316986723299496476, 1.2166986024289022870496463397344659899295, 2.4333972048578045740992926794689319798590, 4.8667944097156091481985853589378639597181, 9.7335888194312182963971707178757279194361, 1.9467177638862436592794341435751455838872, 3.8934355277724873185588682871502911677745, 7.7868710555449746371177365743005823355489, 1.5573742111089949274235473148601164671098, 3.1147484222179898548470946297202329342196, 6.2294968444359797096941892594404658684391, 1.2458993688871959419388378518880931736878, 
2.4917987377743918838776757037761863473757, 4.9835974755487837677553514075523726947513, 9.9671949510975675355107028151047453895026, 1.9934389902195135071021405630209490779005, 3.9868779804390270142042811260418981558010, 7.9737559608780540284085622520837963116021, 1.5947511921756108056817124504167592623204, 3.1895023843512216113634249008335185246408, 6.3790047687024432227268498016670370492817, 1.2758009537404886445453699603334074098563, 2.5516019074809772890907399206668148197127, 5.1032038149619545781814798413336296394253, 1.0206407629923909156362959682667259278851, 2.0412815259847818312725919365334518557701, 4.0825630519695636625451838730669037115403, 8.1651261039391273250903677461338074230805, 1.6330252207878254650180735492267614846161, 3.2660504415756509300361470984535229692322, 6.5321008831513018600722941969070459384644, 1.3064201766302603720144588393814091876929, 2.6128403532605207440289176787628183753858, 5.2256807065210414880578353575256367507715, 1.0451361413042082976115670715051273501543, 2.0902722826084165952231341430102547003086, 4.1805445652168331904462682860205094006172, 8.3610891304336663808925365720410188012345, 1.6722178260867332761785073144082037602469, 3.3444356521734665523570146288164075204938, 6.6888713043469331047140292576328150409876, 1.3377742608693866209428058515265630081975, 2.6755485217387732418856117030531260163950, 5.3510970434775464837712234061062520327901, 1.0702194086955092967542446812212504065580, 2.1404388173910185935084893624425008131160, 4.2808776347820371870169787248850016262320, 8.5617552695640743740339574497700032524641, 1.7123510539128148748067914899540006504928, 3.4247021078256297496135829799080013009856, 6.8494042156512594992271659598160026019713, 1.3698808431302518998454331919632005203943, 2.7397616862605037996908663839264010407885, 5.4795233725210075993817327678528020815770, 1.0959046745042015198763465535705604163154, 2.1918093490084030397526931071411208326308, 4.3836186980168060795053862142822416652616, 
8.7672373960336121590107724285644833305232, 1.7534474792067224318021544857128966661046, 3.5068949584134448636043089714257933322093, 7.0137899168268897272086179428515866644186, 1.4027579833653779454417235885703173328837, 2.8055159667307558908834471771406346657674, 5.6110319334615117817668943542812693315349, 1.1222063866923023563533788708562538663070, 2.2444127733846047127067577417125077326139, 4.4888255467692094254135154834250154652279, 8.9776510935384188508270309668500309304558, 1.7955302187076837701654061933700061860912, 3.5910604374153675403308123867400123721823, 7.1821208748307350806616247734800247443646, 1.4364241749661470161323249546960049488729, 2.8728483499322940322646499093920098977459, 5.7456966998645880645292998187840197954917, 1.1491393399729176129058599637568039590983, 2.2982786799458352258117199275136079181967, 4.5965573598916704516234398550272158363934, 9.1931147197833409032468797100544316727867, 1.8386229439566681806493759420108863345573, 3.6772458879133363612987518840217726691147, 7.3544917758266727225975037680435453382294, 1.4708983551653345445195007536087090676459, 2.9417967103306690890390015072174181352918, 5.8835934206613381780780030144348362705835, 1.1767186841322676356156006028869672541167, 2.3534373682645352712312012057739345082334, 4.7068747365290705424624024115478690164668, 9.4137494730581410849248048230957380329336, 1.8827498946116282169849609646191476065867, 3.7654997892232564339699219292382952131734, 7.5309995784465128679398438584765904263469, 1.5061999156893025735879687716953180852694, 3.0123998313786051471759375433906361705388, 6.0247996627572102943518750867812723410775, 1.2049599325514420588703750173562544682155, 2.4099198651028841177407500347125089364310, 4.8198397302057682354815000694250178728620, 9.6396794604115364709630001388500357457240, 1.9279358920823072941926000277700071491448, 3.8558717841646145883852000555400142982896, 7.7117435683292291767704001110800285965792, 1.5423487136658458353540800222160057193158, 
3.0846974273316916707081600444320114386317, 6.1693948546633833414163200888640228772634, 1.2338789709326766682832640177728045754527, 2.4677579418653533365665280355456091509053, 4.9355158837307066731330560710912183018107, 9.8710317674614133462661121421824366036214, 1.9742063534922826692532224284364873207243, 3.9484127069845653385064448568729746414486, 7.8968254139691306770128897137459492828971, 1.5793650827938261354025779427491898565794, 3.1587301655876522708051558854983797131588, 6.3174603311753045416103117709967594263177, 1.2634920662350609083220623541993518852635, 2.5269841324701218166441247083987037705271, 5.0539682649402436332882494167974075410542, 1.0107936529880487266576498833594815082108, 2.0215873059760974533152997667189630164217, 4.0431746119521949066305995334379260328433, 8.0863492239043898132611990668758520656866, 1.6172698447808779626522398133751704131373, 3.2345396895617559253044796267503408262747, 6.4690793791235118506089592535006816525493, 1.2938158758247023701217918507001363305099, 2.5876317516494047402435837014002726610197, 5.1752635032988094804871674028005453220395, 1.0350527006597618960974334805601090644079, 2.0701054013195237921948669611202181288158, 4.1402108026390475843897339222404362576316, 8.2804216052780951687794678444808725152631, 1.6560843210556190337558935688961745030526, 3.3121686421112380675117871377923490061053, 6.6243372842224761350235742755846980122105, 1.3248674568444952270047148551169396024421, 2.6497349136889904540094297102338792048842, 5.2994698273779809080188594204677584097684, 1.0598939654755961816037718840935516819537, 2.1197879309511923632075437681871033639074, 4.2395758619023847264150875363742067278147, 8.4791517238047694528301750727484134556294, 1.6958303447609538905660350145496826911259, 3.3916606895219077811320700290993653822518, 6.7833213790438155622641400581987307645036, 1.3566642758087631124528280116397461529007, 2.7133285516175262249056560232794923058014, 5.4266571032350524498113120465589846116028, 
1.0853314206470104899622624093117969223206, 2.1706628412940209799245248186235938446411, 4.3413256825880419598490496372471876892823, 8.6826513651760839196980992744943753785645, 1.7365302730352167839396198548988750757129, 3.4730605460704335678792397097977501514258, 6.9461210921408671357584794195955003028516, 1.3892242184281734271516958839191000605703, 2.7784484368563468543033917678382001211407, 5.5568968737126937086067835356764002422813, 1.1113793747425387417213567071352800484563, 2.2227587494850774834427134142705600969125, 4.4455174989701549668854268285411201938250, 8.8910349979403099337708536570822403876501, 1.7782069995880619867541707314164480775300, 3.5564139991761239735083414628328961550600, 7.1128279983522479470166829256657923101201, 1.4225655996704495894033365851331584620240, 2.8451311993408991788066731702663169240480, 5.6902623986817983576133463405326338480961, 1.1380524797363596715226692681065267696192, 2.2761049594727193430453385362130535392384, 4.5522099189454386860906770724261070784769, 9.1044198378908773721813541448522141569537, 1.8208839675781754744362708289704428313907, 3.6417679351563509488725416579408856627815, 7.2835358703127018977450833158817713255630, 1.4567071740625403795490166631763542651126, 2.9134143481250807590980333263527085302252, 5.8268286962501615181960666527054170604504, 1.1653657392500323036392133305410834120901, 2.3307314785000646072784266610821668241801, 4.6614629570001292145568533221643336483603, 9.3229259140002584291137066443286672967206, 1.8645851828000516858227413288657334593441, 3.7291703656001033716454826577314669186882, 7.4583407312002067432909653154629338373765, 1.4916681462400413486581930630925867674753, 2.9833362924800826973163861261851735349506, 5.9666725849601653946327722523703470699012, 1.1933345169920330789265544504740694139802, 2.3866690339840661578531089009481388279605, 4.7733380679681323157062178018962776559209, 9.5466761359362646314124356037925553118419, 1.9093352271872529262824871207585110623684, 
3.8186704543745058525649742415170221247368, 7.6373409087490117051299484830340442494735, 1.5274681817498023410259896966068088498947, 3.0549363634996046820519793932136176997894, 6.1098727269992093641039587864272353995788, 1.2219745453998418728207917572854470799158, 2.4439490907996837456415835145708941598315, 4.8878981815993674912831670291417883196630, 9.7757963631987349825663340582835766393261, 1.9551592726397469965132668116567153278652, 3.9103185452794939930265336233134306557304, 7.8206370905589879860530672466268613114609, 1.5641274181117975972106134493253722622922, 3.1282548362235951944212268986507445245843, 6.2565096724471903888424537973014890491687, 1.2513019344894380777684907594602978098337, 2.5026038689788761555369815189205956196675, 5.0052077379577523110739630378411912393350, 1.0010415475915504622147926075682382478670, 2.0020830951831009244295852151364764957340, 4.0041661903662018488591704302729529914680, 8.0083323807324036977183408605459059829359, 1.6016664761464807395436681721091811965872, 3.2033329522929614790873363442183623931744, 6.4066659045859229581746726884367247863487, 1.2813331809171845916349345376873449572697, 2.5626663618343691832698690753746899145395, 5.1253327236687383665397381507493798290790, 1.0250665447337476733079476301498759658158, 2.0501330894674953466158952602997519316316, 4.1002661789349906932317905205995038632632, 8.2005323578699813864635810411990077265264, 1.6401064715739962772927162082398015453053, 3.2802129431479925545854324164796030906106, 6.5604258862959851091708648329592061812211, 1.3120851772591970218341729665918412362442, 2.6241703545183940436683459331836824724884, 5.2483407090367880873366918663673649449769, 1.0496681418073576174673383732734729889954, 2.0993362836147152349346767465469459779908, 4.1986725672294304698693534930938919559815, 8.3973451344588609397387069861877839119630, 1.6794690268917721879477413972375567823926, 3.3589380537835443758954827944751135647852, 6.7178761075670887517909655889502271295704, 
1.3435752215134177503581931177900454259141, 2.6871504430268355007163862355800908518282, 5.3743008860536710014327724711601817036563, 1.0748601772107342002865544942320363407313, 2.1497203544214684005731089884640726814625, 4.2994407088429368011462179769281453629251, 8.5988814176858736022924359538562907258501, 1.7197762835371747204584871907712581451700, 3.4395525670743494409169743815425162903401, 6.8791051341486988818339487630850325806801, 1.3758210268297397763667897526170065161360, 2.7516420536594795527335795052340130322720, 5.5032841073189591054671590104680260645441, 1.1006568214637918210934318020936052129088, 2.2013136429275836421868636041872104258176, 4.4026272858551672843737272083744208516353, 8.8052545717103345687474544167488417032705, 1.7610509143420669137494908833497683406541, 3.5221018286841338274989817666995366813082, 7.0442036573682676549979635333990733626164, 1.4088407314736535309995927066798146725233, 2.8176814629473070619991854133596293450466, 5.6353629258946141239983708267192586900931, 1.1270725851789228247996741653438517380186, 2.2541451703578456495993483306877034760373, 4.5082903407156912991986966613754069520745, 9.0165806814313825983973933227508139041490, 1.8033161362862765196794786645501627808298, 3.6066322725725530393589573291003255616596, 7.2132645451451060787179146582006511233192, 1.4426529090290212157435829316401302246638, 2.8853058180580424314871658632802604493277, 5.7706116361160848629743317265605208986554, 1.1541223272232169725948663453121041797311, 2.3082446544464339451897326906242083594622, 4.6164893088928678903794653812484167189243, 9.2329786177857357807589307624968334378486, 1.8465957235571471561517861524993666875697, 3.6931914471142943123035723049987333751394, 7.3863828942285886246071446099974667502789, 1.4772765788457177249214289219994933500558, 2.9545531576914354498428578439989867001116, 5.9091063153828708996857156879979734002231, 1.1818212630765741799371431375995946800446, 2.3636425261531483598742862751991893600892, 
4.7272850523062967197485725503983787201785, 9.4545701046125934394971451007967574403570, 1.8909140209225186878994290201593514880714, 3.7818280418450373757988580403187029761428, 7.5636560836900747515977160806374059522856, 1.5127312167380149503195432161274811904571, 3.0254624334760299006390864322549623809142, 6.0509248669520598012781728645099247618285, 1.2101849733904119602556345729019849523657, 2.4203699467808239205112691458039699047314, 4.8407398935616478410225382916079398094628, 9.6814797871232956820450765832158796189255, 1.9362959574246591364090153166431759237851, 3.8725919148493182728180306332863518475702, 7.7451838296986365456360612665727036951404, 1.5490367659397273091272122533145407390281, 3.0980735318794546182544245066290814780562, 6.1961470637589092365088490132581629561124, 1.2392294127517818473017698026516325912225, 2.4784588255035636946035396053032651824449, 4.9569176510071273892070792106065303648899, 9.9138353020142547784141584212130607297798, 1.9827670604028509556828316842426121459560, 3.9655341208057019113656633684852242919119, 7.9310682416114038227313267369704485838238, 1.5862136483222807645462653473940897167648, 3.1724272966445615290925306947881794335295, 6.3448545932891230581850613895763588670590, 1.2689709186578246116370122779152717734118, 2.5379418373156492232740245558305435468236, 5.0758836746312984465480491116610870936472, 1.0151767349262596893096098223322174187294, 2.0303534698525193786192196446644348374589, 4.0607069397050387572384392893288696749178, 8.1214138794100775144768785786577393498356, 1.6242827758820155028953757157315478699671, 3.2485655517640310057907514314630957399342, 6.4971311035280620115815028629261914798685, 1.2994262207056124023163005725852382959737, 2.5988524414112248046326011451704765919474, 5.1977048828224496092652022903409531838948, 1.0395409765644899218530404580681906367790, 2.0790819531289798437060809161363812735579, 4.1581639062579596874121618322727625471158, 8.3163278125159193748243236645455250942316, 
1.6632655625031838749648647329091050188463, 3.3265311250063677499297294658182100376927, 6.6530622500127354998594589316364200753853, 1.3306124500025470999718917863272840150771, 2.6612249000050941999437835726545680301541, 5.3224498000101883998875671453091360603082, 1.0644899600020376799775134290618272120616, 2.1289799200040753599550268581236544241233, 4.2579598400081507199100537162473088482466, 8.5159196800163014398201074324946176964932, 1.7031839360032602879640214864989235392986, 3.4063678720065205759280429729978470785973, 6.8127357440130411518560859459956941571946, 1.3625471488026082303712171891991388314389, 2.7250942976052164607424343783982776628778, 5.4501885952104329214848687567965553257556, 1.0900377190420865842969737513593110651511, 2.1800754380841731685939475027186221303023, 4.3601508761683463371878950054372442606045, 8.7203017523366926743757900108744885212090, 1.7440603504673385348751580021748977042418, 3.4881207009346770697503160043497954084836, 6.9762414018693541395006320086995908169672, 1.3952482803738708279001264017399181633934, 2.7904965607477416558002528034798363267869, 5.5809931214954833116005056069596726535738, 1.1161986242990966623201011213919345307148, 2.2323972485981933246402022427838690614295, 4.4647944971963866492804044855677381228590, 8.9295889943927732985608089711354762457180, 1.7859177988785546597121617942270952491436, 3.5718355977571093194243235884541904982872, 7.1436711955142186388486471769083809965744, 1.4287342391028437277697294353816761993149, 2.8574684782056874555394588707633523986298, 5.7149369564113749110789177415267047972595, 1.1429873912822749822157835483053409594519, 2.2859747825645499644315670966106819189038, 4.5719495651290999288631341932213638378076, 9.1438991302581998577262683864427276756153, 1.8287798260516399715452536772885455351231, 3.6575596521032799430905073545770910702461, 7.3151193042065598861810147091541821404922, 1.4630238608413119772362029418308364280984, 2.9260477216826239544724058836616728561969, 
5.8520954433652479089448117673233457123938, 1.1704190886730495817889623534646691424788, 2.3408381773460991635779247069293382849575, 4.6816763546921983271558494138586765699150, 9.3633527093843966543116988277173531398300, 1.8726705418768793308623397655434706279660, 3.7453410837537586617246795310869412559320, 7.4906821675075173234493590621738825118640, 1.4981364335015034646898718124347765023728, 2.9962728670030069293797436248695530047456, 5.9925457340060138587594872497391060094912, 1.1985091468012027717518974499478212018982, 2.3970182936024055435037948998956424037965, 4.7940365872048110870075897997912848075930, 9.5880731744096221740151795995825696151860, 1.9176146348819244348030359199165139230372, 3.8352292697638488696060718398330278460744, 7.6704585395276977392121436796660556921488, 1.5340917079055395478424287359332111384298, 3.0681834158110790956848574718664222768595, 6.1363668316221581913697149437328445537190, 1.2272733663244316382739429887465689107438, 2.4545467326488632765478859774931378214876, 4.9090934652977265530957719549862756429752, 9.8181869305954531061915439099725512859504, 1.9636373861190906212383087819945102571901, 3.9272747722381812424766175639890205143802, 7.8545495444763624849532351279780410287603, 1.5709099088952724969906470255956082057521, 3.1418198177905449939812940511912164115041, 6.2836396355810899879625881023824328230083, 1.2567279271162179975925176204764865646017, 2.5134558542324359951850352409529731292033, 5.0269117084648719903700704819059462584066, 1.0053823416929743980740140963811892516813, 2.0107646833859487961480281927623785033626, 4.0215293667718975922960563855247570067253, 8.0430587335437951845921127710495140134506, 1.6086117467087590369184225542099028026901, 3.2172234934175180738368451084198056053802, 6.4344469868350361476736902168396112107605, 1.2868893973670072295347380433679222421521, 2.5737787947340144590694760867358444843042, 5.1475575894680289181389521734716889686084, 1.0295115178936057836277904346943377937217, 
2.0590230357872115672555808693886755874434, 4.1180460715744231345111617387773511748867, 8.2360921431488462690223234775547023497734, 1.6472184286297692538044646955109404699547, 3.2944368572595385076089293910218809399094, 6.5888737145190770152178587820437618798187, 1.3177747429038154030435717564087523759637, 2.6355494858076308060871435128175047519275, 5.2710989716152616121742870256350095038550, 1.0542197943230523224348574051270019007710, 2.1084395886461046448697148102540038015420, 4.2168791772922092897394296205080076030840, 8.4337583545844185794788592410160152061680, 1.6867516709168837158957718482032030412336, 3.3735033418337674317915436964064060824672, 6.7470066836675348635830873928128121649344, 1.3494013367335069727166174785625624329869, 2.6988026734670139454332349571251248659738, 5.3976053469340278908664699142502497319475, 1.0795210693868055781732939828500499463895, 2.1590421387736111563465879657000998927790, 4.3180842775472223126931759314001997855580, 8.6361685550944446253863518628003995711160, 1.7272337110188889250772703725600799142232, 3.4544674220377778501545407451201598284464, 6.9089348440755557003090814902403196568928, 1.3817869688151111400618162980480639313786, 2.7635739376302222801236325960961278627571, 5.5271478752604445602472651921922557255142, 1.1054295750520889120494530384384511451028, 2.2108591501041778240989060768769022902057, 4.4217183002083556481978121537538045804114, 8.8434366004167112963956243075076091608228, 1.7686873200833422592791248615015218321646, 3.5373746401666845185582497230030436643291, 7.0747492803333690371164994460060873286582, 1.4149498560666738074232998892012174657316, 2.8298997121333476148465997784024349314633, 5.6597994242666952296931995568048698629266, 1.1319598848533390459386399113609739725853, 2.2639197697066780918772798227219479451706, 4.5278395394133561837545596454438958903413, 9.0556790788267123675091192908877917806825, 1.8111358157653424735018238581775583561365, 3.6222716315306849470036477163551167122730, 
7.2445432630613698940072954327102334245460, 1.4489086526122739788014590865420466849092, 2.8978173052245479576029181730840933698184, 5.7956346104490959152058363461681867396368, 1.1591269220898191830411672692336373479274, 2.3182538441796383660823345384672746958547, 4.6365076883592767321646690769345493917095, 9.2730153767185534643293381538690987834189, 1.8546030753437106928658676307738197566838, 3.7092061506874213857317352615476395133676, 7.4184123013748427714634705230952790267351, 1.4836824602749685542926941046190558053470, 2.9673649205499371085853882092381116106941, 5.9347298410998742171707764184762232213881, 1.1869459682199748434341552836952446442776, 2.3738919364399496868683105673904892885552, 4.7477838728798993737366211347809785771105, 9.4955677457597987474732422695619571542210, 1.8991135491519597494946484539123914308442, 3.7982270983039194989892969078247828616884, 7.5964541966078389979785938156495657233768, 1.5192908393215677995957187631299131446754, 3.0385816786431355991914375262598262893507, 6.0771633572862711983828750525196525787014, 1.2154326714572542396765750105039305157403, 2.4308653429145084793531500210078610314806, 4.8617306858290169587063000420157220629611, 9.7234613716580339174126000840314441259223, 1.9446922743316067834825200168062888251845, 3.8893845486632135669650400336125776503689, 7.7787690973264271339300800672251553007378, 1.5557538194652854267860160134450310601476, 3.1115076389305708535720320268900621202951, 6.2230152778611417071440640537801242405903, 1.2446030555722283414288128107560248481181, 2.4892061111444566828576256215120496962361, 4.9784122222889133657152512430240993924722, 9.9568244445778267314305024860481987849444, 1.9913648889155653462861004972096397569889, 3.9827297778311306925722009944192795139778, 7.9654595556622613851444019888385590279555, 1.5930919111324522770288803977677118055911, 3.1861838222649045540577607955354236111822, 6.3723676445298091081155215910708472223644, 1.2744735289059618216231043182141694444729, 
2.5489470578119236432462086364283388889458, 5.0978941156238472864924172728566777778915, 1.0195788231247694572984834545713355555783, 2.0391576462495389145969669091426711111566, 4.0783152924990778291939338182853422223132, 8.1566305849981556583878676365706844446265, 1.6313261169996311316775735273141368889253, 3.2626522339992622633551470546282737778506, 6.5253044679985245267102941092565475557012, 1.3050608935997049053420588218513095111402, 2.6101217871994098106841176437026190222805, 5.2202435743988196213682352874052380445609, 1.0440487148797639242736470574810476089122, 2.0880974297595278485472941149620952178244, 4.1761948595190556970945882299241904356487, 8.3523897190381113941891764598483808712975, 1.6704779438076222788378352919696761742595, 3.3409558876152445576756705839393523485190, 6.6819117752304891153513411678787046970380, 1.3363823550460978230702682335757409394076, 2.6727647100921956461405364671514818788152, 5.3455294201843912922810729343029637576304, 1.0691058840368782584562145868605927515261, 2.1382117680737565169124291737211855030522, 4.2764235361475130338248583474423710061043, 8.5528470722950260676497166948847420122086, 1.7105694144590052135299433389769484024417, 3.4211388289180104270598866779538968048835, 6.8422776578360208541197733559077936097669, 1.3684555315672041708239546711815587219534, 2.7369110631344083416479093423631174439068, 5.4738221262688166832958186847262348878135, 1.0947644252537633366591637369452469775627, 2.1895288505075266733183274738904939551254, 4.3790577010150533466366549477809879102508, 8.7581154020301066932733098955619758205016, 1.7516230804060213386546619791123951641003, 3.5032461608120426773093239582247903282007, 7.0064923216240853546186479164495806564013, 1.4012984643248170709237295832899161312803, 2.8025969286496341418474591665798322625605, 5.6051938572992682836949183331596645251210, 1.1210387714598536567389836666319329050242, 2.2420775429197073134779673332638658100484, 4.4841550858394146269559346665277316200968, 
8.9683101716788292539118693330554632401937, 1.7936620343357658507823738666110926480387, 3.5873240686715317015647477332221852960775, 7.1746481373430634031294954664443705921549, 1.4349296274686126806258990932888741184310, 2.8698592549372253612517981865777482368620, 5.7397185098744507225035963731554964737240, 1.1479437019748901445007192746310992947448, 2.2958874039497802890014385492621985894896, 4.5917748078995605780028770985243971789792, 9.1835496157991211560057541970487943579583, 1.8367099231598242312011508394097588715917, 3.6734198463196484624023016788195177431833, 7.3468396926392969248046033576390354863667, 1.4693679385278593849609206715278070972733, 2.9387358770557187699218413430556141945467, 5.8774717541114375398436826861112283890933, 1.1754943508222875079687365372222456778187, 2.3509887016445750159374730744444913556373, 4.7019774032891500318749461488889827112747, 9.4039548065783000637498922977779654225493, 1.8807909613156600127499784595555930845099, 3.7615819226313200254999569191111861690197, 7.5231638452626400509999138382223723380395, 1.5046327690525280101999827676444744676079, 3.0092655381050560203999655352889489352158, 6.0185310762101120407999310705778978704316, 1.2037062152420224081599862141155795740863, 2.4074124304840448163199724282311591481726, 4.8148248609680896326399448564623182963453, 9.6296497219361792652798897129246365926905, 1.9259299443872358530559779425849273185381, 3.8518598887744717061119558851698546370762, 7.7037197775489434122239117703397092741524, 1.5407439555097886824447823540679418548305, 3.0814879110195773648895647081358837096610, 6.1629758220391547297791294162717674193219, 1.2325951644078309459558258832543534838644, 2.4651903288156618919116517665087069677288, 4.9303806576313237838233035330174139354575, 9.8607613152626475676466070660348278709151, 1.9721522630525295135293214132069655741830, 3.9443045261050590270586428264139311483660, 7.8886090522101180541172856528278622967321, 1.5777218104420236108234571305655724593464, 
3.1554436208840472216469142611311449186928, 6.3108872417680944432938285222622898373857, 1.2621774483536188886587657044524579674771, 2.5243548967072377773175314089049159349543, 5.0487097934144755546350628178098318699085, 1.0097419586828951109270125635619663739817, 2.0194839173657902218540251271239327479634, 4.0389678347315804437080502542478654959268, 8.0779356694631608874161005084957309918536, 1.6155871338926321774832201016991461983707, 3.2311742677852643549664402033982923967415, 6.4623485355705287099328804067965847934829, 1.2924697071141057419865760813593169586966, 2.5849394142282114839731521627186339173932, 5.1698788284564229679463043254372678347863, 1.0339757656912845935892608650874535669573, 2.0679515313825691871785217301749071339145, 4.1359030627651383743570434603498142678291, 8.2718061255302767487140869206996285356581, 1.6543612251060553497428173841399257071316, 3.3087224502121106994856347682798514142632, 6.6174449004242213989712695365597028285265, 1.3234889800848442797942539073119405657053, 2.6469779601696885595885078146238811314106, 5.2939559203393771191770156292477622628212, 1.0587911840678754238354031258495524525642, 2.1175823681357508476708062516991049051285, 4.2351647362715016953416125033982098102570, 8.4703294725430033906832250067964196205139, 1.6940658945086006781366450013592839241028, 3.3881317890172013562732900027185678482056, 6.7762635780344027125465800054371356964111, 1.3552527156068805425093160010874271392822, 2.7105054312137610850186320021748542785645, 5.4210108624275221700372640043497085571289, 1.0842021724855044340074528008699417114258, 2.1684043449710088680149056017398834228516, 4.3368086899420177360298112034797668457031, 8.6736173798840354720596224069595336914062, 1.7347234759768070944119244813919067382812, 3.4694469519536141888238489627838134765625, 6.9388939039072283776476979255676269531250, 1.3877787807814456755295395851135253906250, 2.7755575615628913510590791702270507812500, 5.5511151231257827021181583404541015625000, 
1.1102230246251565404236316680908203125000, 2.2204460492503130808472633361816406250000, 4.4408920985006261616945266723632812500000, 8.8817841970012523233890533447265625000000, 1.7763568394002504646778106689453125000000, 3.5527136788005009293556213378906250000000, 7.1054273576010018587112426757812500000000, 1.4210854715202003717422485351562500000000, 2.8421709430404007434844970703125000000000, 5.6843418860808014869689941406250000000000, 1.1368683772161602973937988281250000000000, 2.2737367544323205947875976562500000000000, 4.5474735088646411895751953125000000000000, 9.0949470177292823791503906250000000000000, 1.8189894035458564758300781250000000000000, 3.6379788070917129516601562500000000000000, 7.2759576141834259033203125000000000000000, 1.4551915228366851806640625000000000000000, 2.9103830456733703613281250000000000000000, 5.8207660913467407226562500000000000000000, 1.1641532182693481445312500000000000000000, 2.3283064365386962890625000000000000000000, 4.6566128730773925781250000000000000000000, 9.3132257461547851562500000000000000000000, 1.8626451492309570312500000000000000000000, 3.7252902984619140625000000000000000000000, 7.4505805969238281250000000000000000000000, 1.4901161193847656250000000000000000000000, 2.9802322387695312500000000000000000000000, 5.9604644775390625000000000000000000000000, 1.1920928955078125000000000000000000000000, 2.3841857910156250000000000000000000000000, 4.7683715820312500000000000000000000000000, 9.5367431640625000000000000000000000000000, 1.9073486328125000000000000000000000000000, 3.8146972656250000000000000000000000000000, 7.6293945312500000000000000000000000000000, 1.5258789062500000000000000000000000000000, 3.0517578125000000000000000000000000000000, 6.1035156250000000000000000000000000000000, 1.2207031250000000000000000000000000000000, 2.4414062500000000000000000000000000000000, 4.8828125000000000000000000000000000000000, 9.7656250000000000000000000000000000000000, 1.9531250000000000000000000000000000000000, 
3.9062500000000000000000000000000000000000, 7.8125000000000000000000000000000000000000, 1.5625000000000000000000000000000000000000, 3.1250000000000000000000000000000000000000, 6.2500000000000000000000000000000000000000, 1.2500000000000000000000000000000000000000, 2.5000000000000000000000000000000000000000, 5.0000000000000000000000000000000000000000, 1.0000000000000000000000000000000000000000, 2.0000000000000000000000000000000000000000, 4.0000000000000000000000000000000000000000, 8.0000000000000000000000000000000000000000, 1.6000000000000000000000000000000000000000, 3.2000000000000000000000000000000000000000, 6.4000000000000000000000000000000000000000, 1.2800000000000000000000000000000000000000, 2.5600000000000000000000000000000000000000, 5.1200000000000000000000000000000000000000, 1.0240000000000000000000000000000000000000, 2.0480000000000000000000000000000000000000, 4.0960000000000000000000000000000000000000, 8.1920000000000000000000000000000000000000, 1.6384000000000000000000000000000000000000, 3.2768000000000000000000000000000000000000, 6.5536000000000000000000000000000000000000, 1.3107200000000000000000000000000000000000, 2.6214400000000000000000000000000000000000, 5.2428800000000000000000000000000000000000, 1.0485760000000000000000000000000000000000, 2.0971520000000000000000000000000000000000, 4.1943040000000000000000000000000000000000, 8.3886080000000000000000000000000000000000, 1.6777216000000000000000000000000000000000, 3.3554432000000000000000000000000000000000, 6.7108864000000000000000000000000000000000, 1.3421772800000000000000000000000000000000, 2.6843545600000000000000000000000000000000, 5.3687091200000000000000000000000000000000, 1.0737418240000000000000000000000000000000, 2.1474836480000000000000000000000000000000, 4.2949672960000000000000000000000000000000, 8.5899345920000000000000000000000000000000, 1.7179869184000000000000000000000000000000, 3.4359738368000000000000000000000000000000, 6.8719476736000000000000000000000000000000, 
1.3743895347200000000000000000000000000000, 2.7487790694400000000000000000000000000000, 5.4975581388800000000000000000000000000000, 1.0995116277760000000000000000000000000000, 2.1990232555520000000000000000000000000000, 4.3980465111040000000000000000000000000000, 8.7960930222080000000000000000000000000000, 1.7592186044416000000000000000000000000000, 3.5184372088832000000000000000000000000000, 7.0368744177664000000000000000000000000000, 1.4073748835532800000000000000000000000000, 2.8147497671065600000000000000000000000000, 5.6294995342131200000000000000000000000000, 1.1258999068426240000000000000000000000000, 2.2517998136852480000000000000000000000000, 4.5035996273704960000000000000000000000000, 9.0071992547409920000000000000000000000000, 1.8014398509481984000000000000000000000000, 3.6028797018963968000000000000000000000000, 7.2057594037927936000000000000000000000000, 1.4411518807585587200000000000000000000000, 2.8823037615171174400000000000000000000000, 5.7646075230342348800000000000000000000000, 1.1529215046068469760000000000000000000000, 2.3058430092136939520000000000000000000000, 4.6116860184273879040000000000000000000000, 9.2233720368547758080000000000000000000000, 1.8446744073709551616000000000000000000000, 3.6893488147419103232000000000000000000000, 7.3786976294838206464000000000000000000000, 1.4757395258967641292800000000000000000000, 2.9514790517935282585600000000000000000000, 5.9029581035870565171200000000000000000000, 1.1805916207174113034240000000000000000000, 2.3611832414348226068480000000000000000000, 4.7223664828696452136960000000000000000000, 9.4447329657392904273920000000000000000000, 1.8889465931478580854784000000000000000000, 3.7778931862957161709568000000000000000000, 7.5557863725914323419136000000000000000000, 1.5111572745182864683827200000000000000000, 3.0223145490365729367654400000000000000000, 6.0446290980731458735308800000000000000000, 1.2089258196146291747061760000000000000000, 2.4178516392292583494123520000000000000000, 
4.8357032784585166988247040000000000000000, 9.6714065569170333976494080000000000000000, 1.9342813113834066795298816000000000000000, 3.8685626227668133590597632000000000000000, 7.7371252455336267181195264000000000000000, 1.5474250491067253436239052800000000000000, 3.0948500982134506872478105600000000000000, 6.1897001964269013744956211200000000000000, 1.2379400392853802748991242240000000000000, 2.4758800785707605497982484480000000000000, 4.9517601571415210995964968960000000000000, 9.9035203142830421991929937920000000000000, 1.9807040628566084398385987584000000000000, 3.9614081257132168796771975168000000000000, 7.9228162514264337593543950336000000000000, 1.5845632502852867518708790067200000000000, 3.1691265005705735037417580134400000000000, 6.3382530011411470074835160268800000000000, 1.2676506002282294014967032053760000000000, 2.5353012004564588029934064107520000000000, 5.0706024009129176059868128215040000000000, 1.0141204801825835211973625643008000000000, 2.0282409603651670423947251286016000000000, 4.0564819207303340847894502572032000000000, 8.1129638414606681695789005144064000000000, 1.6225927682921336339157801028812800000000, 3.2451855365842672678315602057625600000000, 6.4903710731685345356631204115251200000000, 1.2980742146337069071326240823050240000000, 2.5961484292674138142652481646100480000000, 5.1922968585348276285304963292200960000000, 1.0384593717069655257060992658440192000000, 2.0769187434139310514121985316880384000000, 4.1538374868278621028243970633760768000000, 8.3076749736557242056487941267521536000000, 1.6615349947311448411297588253504307200000, 3.3230699894622896822595176507008614400000, 6.6461399789245793645190353014017228800000, 1.3292279957849158729038070602803445760000, 2.6584559915698317458076141205606891520000, 5.3169119831396634916152282411213783040000, 1.0633823966279326983230456482242756608000, 2.1267647932558653966460912964485513216000, 4.2535295865117307932921825928971026432000, 8.5070591730234615865843651857942052864000, 
1.7014118346046923173168730371588410572800, 3.4028236692093846346337460743176821145600, 6.8056473384187692692674921486353642291200, 1.3611294676837538538534984297270728458240, 2.7222589353675077077069968594541456916480, 5.4445178707350154154139937189082913832960, 1.0889035741470030830827987437816582766592, 2.1778071482940061661655974875633165533184, 4.3556142965880123323311949751266331066368, 8.7112285931760246646623899502532662132736, 1.7422457186352049329324779900506532426547, 3.4844914372704098658649559801013064853094, 6.9689828745408197317299119602026129706189, 1.3937965749081639463459823920405225941238, 2.7875931498163278926919647840810451882476, 5.5751862996326557853839295681620903764951, 1.1150372599265311570767859136324180752990, 2.2300745198530623141535718272648361505980, 4.4601490397061246283071436545296723011961, 8.9202980794122492566142873090593446023922, 1.7840596158824498513228574618118689204784, 3.5681192317648997026457149236237378409569, 7.1362384635297994052914298472474756819137, 1.4272476927059598810582859694494951363827, 2.8544953854119197621165719388989902727655, 5.7089907708238395242331438777979805455310, 1.1417981541647679048466287755595961091062, 2.2835963083295358096932575511191922182124, 4.5671926166590716193865151022383844364248, 9.1343852333181432387730302044767688728496, 1.8268770466636286477546060408953537745699, 3.6537540933272572955092120817907075491398, 7.3075081866545145910184241635814150982797, 1.4615016373309029182036848327162830196559, 2.9230032746618058364073696654325660393119, 5.8460065493236116728147393308651320786237, 1.1692013098647223345629478661730264157247, 2.3384026197294446691258957323460528314495, 4.6768052394588893382517914646921056628990, 9.3536104789177786765035829293842113257980, 1.8707220957835557353007165858768422651596, 3.7414441915671114706014331717536845303192, 7.4828883831342229412028663435073690606384, 1.4965776766268445882405732687014738121277, 2.9931553532536891764811465374029476242553, 
5.9863107065073783529622930748058952485107, 1.1972621413014756705924586149611790497021, 2.3945242826029513411849172299223580994043, 4.7890485652059026823698344598447161988086, 9.5780971304118053647396689196894323976171, 1.9156194260823610729479337839378864795234, 3.8312388521647221458958675678757729590468, 7.6624777043294442917917351357515459180937, 1.5324955408658888583583470271503091836187, 3.0649910817317777167166940543006183672375, 6.1299821634635554334333881086012367344750, 1.2259964326927110866866776217202473468950, 2.4519928653854221733733552434404946937900, 4.9039857307708443467467104868809893875800, 9.8079714615416886934934209737619787751599, 1.9615942923083377386986841947523957550320, 3.9231885846166754773973683895047915100640, 7.8463771692333509547947367790095830201279, 1.5692754338466701909589473558019166040256, 3.1385508676933403819178947116038332080512, 6.2771017353866807638357894232076664161024, 1.2554203470773361527671578846415332832205, 2.5108406941546723055343157692830665664409, 5.0216813883093446110686315385661331328819, 1.0043362776618689222137263077132266265764, 2.0086725553237378444274526154264532531528, 4.0173451106474756888549052308529065063055, 8.0346902212949513777098104617058130126110, 1.6069380442589902755419620923411626025222, 3.2138760885179805510839241846823252050444, 6.4277521770359611021678483693646504100888, 1.2855504354071922204335696738729300820178, 2.5711008708143844408671393477458601640355, 5.1422017416287688817342786954917203280710, 1.0284403483257537763468557390983440656142, 2.0568806966515075526937114781966881312284, 4.1137613933030151053874229563933762624568, 8.2275227866060302107748459127867525249137, 1.6455045573212060421549691825573505049827, 3.2910091146424120843099383651147010099655, 6.5820182292848241686198767302294020199309, 1.3164036458569648337239753460458804039862, 2.6328072917139296674479506920917608079724, 5.2656145834278593348959013841835216159448, 1.0531229166855718669791802768367043231890, 
2.1062458333711437339583605536734086463779, 4.2124916667422874679167211073468172927558, 8.4249833334845749358334422146936345855116, 1.6849966666969149871666884429387269171023, 3.3699933333938299743333768858774538342046, 6.7399866667876599486667537717549076684093, 1.3479973333575319897333507543509815336819, 2.6959946667150639794667015087019630673637, 5.3919893334301279589334030174039261347274, 1.0783978666860255917866806034807852269455, 2.1567957333720511835733612069615704538910, 4.3135914667441023671467224139231409077819, 8.6271829334882047342934448278462818155639, 1.7254365866976409468586889655692563631128, 3.4508731733952818937173779311385127262256, 6.9017463467905637874347558622770254524511, 1.3803492693581127574869511724554050904902, 2.7606985387162255149739023449108101809804, 5.5213970774324510299478046898216203619609, 1.1042794154864902059895609379643240723922, 2.2085588309729804119791218759286481447844, 4.4171176619459608239582437518572962895687, 8.8342353238919216479164875037145925791374, 1.7668470647783843295832975007429185158275, 3.5336941295567686591665950014858370316550, 7.0673882591135373183331900029716740633099, 1.4134776518227074636666380005943348126620, 2.8269553036454149273332760011886696253240, 5.6539106072908298546665520023773392506479, 1.1307821214581659709333104004754678501296, 2.2615642429163319418666208009509357002592, 4.5231284858326638837332416019018714005184, 9.0462569716653277674664832038037428010367, 1.8092513943330655534932966407607485602073, 3.6185027886661311069865932815214971204147, 7.2370055773322622139731865630429942408294, 1.4474011154664524427946373126085988481659, 2.8948022309329048855892746252171976963317, 5.7896044618658097711785492504343953926635, 1.1579208923731619542357098500868790785327, 2.3158417847463239084714197001737581570654, 4.6316835694926478169428394003475163141308, 9.2633671389852956338856788006950326282616, 1.8526734277970591267771357601390065256523, 3.7053468555941182535542715202780130513046, 
7.4106937111882365071085430405560261026093, 1.4821387422376473014217086081112052205219, 2.9642774844752946028434172162224104410437, 5.9285549689505892056868344324448208820874, 1.1857109937901178411373668864889641764175, 2.3714219875802356822747337729779283528350, 4.7428439751604713645494675459558567056699, 9.4856879503209427290989350919117134113399, 1.8971375900641885458197870183823426822680, 3.7942751801283770916395740367646853645360, 7.5885503602567541832791480735293707290719, 1.5177100720513508366558296147058741458144, 3.0354201441027016733116592294117482916288, 6.0708402882054033466233184588234965832575, 1.2141680576410806693246636917646993166515, 2.4283361152821613386493273835293986333030, 4.8566722305643226772986547670587972666060, 9.7133444611286453545973095341175945332120, 1.9426688922257290709194619068235189066424, 3.8853377844514581418389238136470378132848, 7.7706755689029162836778476272940756265696, 1.5541351137805832567355695254588151253139, 3.1082702275611665134711390509176302506279, 6.2165404551223330269422781018352605012557, 1.2433080910244666053884556203670521002511, 2.4866161820489332107769112407341042005023, 4.9732323640978664215538224814682084010046, 9.9464647281957328431076449629364168020091, 1.9892929456391465686215289925872833604018, 3.9785858912782931372430579851745667208036, 7.9571717825565862744861159703491334416073, 1.5914343565113172548972231940698266883215, 3.1828687130226345097944463881396533766429, 6.3657374260452690195888927762793067532858, 1.2731474852090538039177785552558613506572, 2.5462949704181076078355571105117227013143, 5.0925899408362152156711142210234454026287, 1.0185179881672430431342228442046890805257, 2.0370359763344860862684456884093781610515, 4.0740719526689721725368913768187563221029, 8.1481439053379443450737827536375126442059, 1.6296287810675888690147565507275025288412, 3.2592575621351777380295131014550050576823, 6.5185151242703554760590262029100101153647, 1.3037030248540710952118052405820020230729, 
2.6074060497081421904236104811640040461459, 5.2148120994162843808472209623280080922918, 1.0429624198832568761694441924656016184584, 2.0859248397665137523388883849312032369167, 4.1718496795330275046777767698624064738334, 8.3436993590660550093555535397248129476668, 1.6687398718132110018711107079449625895334, 3.3374797436264220037422214158899251790667, 6.6749594872528440074844428317798503581335, 1.3349918974505688014968885663559700716267, 2.6699837949011376029937771327119401432534, 5.3399675898022752059875542654238802865068, 1.0679935179604550411975108530847760573014, 2.1359870359209100823950217061695521146027, 4.2719740718418201647900434123391042292054, 8.5439481436836403295800868246782084584108, 1.7087896287367280659160173649356416916822, 3.4175792574734561318320347298712833833643, 6.8351585149469122636640694597425667667287, 1.3670317029893824527328138919485133533457, 2.7340634059787649054656277838970267066915, 5.4681268119575298109312555677940534133829, 1.0936253623915059621862511135588106826766, 2.1872507247830119243725022271176213653532, 4.3745014495660238487450044542352427307063, 8.7490028991320476974900089084704854614127, 1.7498005798264095394980017816940970922825, 3.4996011596528190789960035633881941845651, 6.9992023193056381579920071267763883691301, 1.3998404638611276315984014253552776738260, 2.7996809277222552631968028507105553476521, 5.5993618554445105263936057014211106953041, 1.1198723710889021052787211402842221390608, 2.2397447421778042105574422805684442781216, 4.4794894843556084211148845611368885562433, 8.9589789687112168422297691222737771124866, 1.7917957937422433684459538244547554224973, 3.5835915874844867368919076489095108449946, 7.1671831749689734737838152978190216899893, 1.4334366349937946947567630595638043379979, 2.8668732699875893895135261191276086759957, 5.7337465399751787790270522382552173519914, 1.1467493079950357558054104476510434703983, 2.2934986159900715116108208953020869407966, 4.5869972319801430232216417906041738815931, 
9.1739944639602860464432835812083477631863, 1.8347988927920572092886567162416695526373, 3.6695977855841144185773134324833391052745, 7.3391955711682288371546268649666782105490, 1.4678391142336457674309253729933356421098, 2.9356782284672915348618507459866712842196, 5.8713564569345830697237014919733425684392, 1.1742712913869166139447402983946685136878, 2.3485425827738332278894805967893370273757, 4.6970851655476664557789611935786740547514, 9.3941703310953329115579223871573481095027, 1.8788340662190665823115844774314696219005, 3.7576681324381331646231689548629392438011, 7.5153362648762663292463379097258784876022, 1.5030672529752532658492675819451756975204, 3.0061345059505065316985351638903513950409, 6.0122690119010130633970703277807027900817, 1.2024538023802026126794140655561405580163, 2.4049076047604052253588281311122811160327, 4.8098152095208104507176562622245622320654, 9.6196304190416209014353125244491244641308, 1.9239260838083241802870625048898248928262, 3.8478521676166483605741250097796497856523, 7.6957043352332967211482500195592995713046, 1.5391408670466593442296500039118599142609, 3.0782817340933186884593000078237198285219, 6.1565634681866373769186000156474396570437, 1.2313126936373274753837200031294879314087, 2.4626253872746549507674400062589758628175, 4.9252507745493099015348800125179517256350, 9.8505015490986198030697600250359034512699, 1.9701003098197239606139520050071806902540, 3.9402006196394479212279040100143613805080, 7.8804012392788958424558080200287227610159, 1.5760802478557791684911616040057445522032, 3.1521604957115583369823232080114891044064, 6.3043209914231166739646464160229782088128, 1.2608641982846233347929292832045956417626, 2.5217283965692466695858585664091912835251, 5.0434567931384933391717171328183825670502, 1.0086913586276986678343434265636765134100, 2.0173827172553973356686868531273530268201, 4.0347654345107946713373737062547060536402, 8.0695308690215893426747474125094121072803, 1.6139061738043178685349494825018824214561, 
3.2278123476086357370698989650037648429121, 6.4556246952172714741397979300075296858243, 1.2911249390434542948279595860015059371649, 2.5822498780869085896559191720030118743297, 5.1644997561738171793118383440060237486594, 1.0328999512347634358623676688012047497319, 2.0657999024695268717247353376024094994638, 4.1315998049390537434494706752048189989275, 8.2631996098781074868989413504096379978551, 1.6526399219756214973797882700819275995710, 3.3052798439512429947595765401638551991420, 6.6105596879024859895191530803277103982840, 1.3221119375804971979038306160655420796568, 2.6442238751609943958076612321310841593136, 5.2884477503219887916153224642621683186272, 1.0576895500643977583230644928524336637254, 2.1153791001287955166461289857048673274509, 4.2307582002575910332922579714097346549018, 8.4615164005151820665845159428194693098036, 1.6923032801030364133169031885638938619607, 3.3846065602060728266338063771277877239214, 6.7692131204121456532676127542555754478429, 1.3538426240824291306535225508511150895686, 2.7076852481648582613070451017022301791371, 5.4153704963297165226140902034044603582743, 1.0830740992659433045228180406808920716549, 2.1661481985318866090456360813617841433097, 4.3322963970637732180912721627235682866194, 8.6645927941275464361825443254471365732389, 1.7329185588255092872365088650894273146478, 3.4658371176510185744730177301788546292955, 6.9316742353020371489460354603577092585911, 1.3863348470604074297892070920715418517182, 2.7726696941208148595784141841430837034364, 5.5453393882416297191568283682861674068729, 1.1090678776483259438313656736572334813746, 2.2181357552966518876627313473144669627491, 4.4362715105933037753254626946289339254983, 8.8725430211866075506509253892578678509966, 1.7745086042373215101301850778515735701993, 3.5490172084746430202603701557031471403986, 7.0980344169492860405207403114062942807973, 1.4196068833898572081041480622812588561595, 2.8392137667797144162082961245625177123189, 5.6784275335594288324165922491250354246378, 
1.1356855067118857664833184498250070849276, 2.2713710134237715329666368996500141698551, 4.5427420268475430659332737993000283397103, 9.0854840536950861318665475986000566794205, 1.8170968107390172263733095197200113358841, 3.6341936214780344527466190394400226717682, 7.2683872429560689054932380788800453435364, 1.4536774485912137810986476157760090687073, 2.9073548971824275621972952315520181374146, 5.8147097943648551243945904631040362748291, 1.1629419588729710248789180926208072549658, 2.3258839177459420497578361852416145099317, 4.6517678354918840995156723704832290198633, 9.3035356709837681990313447409664580397266, 1.8607071341967536398062689481932916079453, 3.7214142683935072796125378963865832158906, 7.4428285367870145592250757927731664317813, 1.4885657073574029118450151585546332863563, 2.9771314147148058236900303171092665727125, 5.9542628294296116473800606342185331454250, 1.1908525658859223294760121268437066290850, 2.3817051317718446589520242536874132581700, 4.7634102635436893179040485073748265163400, 9.5268205270873786358080970147496530326800, 1.9053641054174757271616194029499306065360, 3.8107282108349514543232388058998612130720, 7.6214564216699029086464776117997224261440, 1.5242912843339805817292955223599444852288, 3.0485825686679611634585910447198889704576, 6.0971651373359223269171820894397779409152, 1.2194330274671844653834364178879555881830, 2.4388660549343689307668728357759111763661, 4.8777321098687378615337456715518223527322, 9.7554642197374757230674913431036447054644, 1.9510928439474951446134982686207289410929, 3.9021856878949902892269965372414578821857, 7.8043713757899805784539930744829157643715, 1.5608742751579961156907986148965831528743, 3.1217485503159922313815972297931663057486, 6.2434971006319844627631944595863326114972, 1.2486994201263968925526388919172665222994, 2.4973988402527937851052777838345330445989, 4.9947976805055875702105555676690660891978, 9.9895953610111751404211111353381321783955, 1.9979190722022350280842222270676264356791, 
3.9958381444044700561684444541352528713582, 7.9916762888089401123368889082705057427164, 1.5983352577617880224673777816541011485433, 3.1966705155235760449347555633082022970866, 6.3933410310471520898695111266164045941731, 1.2786682062094304179739022253232809188346, 2.5573364124188608359478044506465618376693, 5.1146728248377216718956089012931236753385, 1.0229345649675443343791217802586247350677, 2.0458691299350886687582435605172494701354, 4.0917382598701773375164871210344989402708, 8.1834765197403546750329742420689978805416, 1.6366953039480709350065948484137995761083, 3.2733906078961418700131896968275991522166, 6.5467812157922837400263793936551983044333, 1.3093562431584567480052758787310396608867, 2.6187124863169134960105517574620793217733, 5.2374249726338269920211035149241586435466, 1.0474849945267653984042207029848317287093, 2.0949699890535307968084414059696634574187, 4.1899399781070615936168828119393269148373, 8.3798799562141231872337656238786538296746, 1.6759759912428246374467531247757307659349, 3.3519519824856492748935062495514615318698, 6.7039039649712985497870124991029230637397, 1.3407807929942597099574024998205846127479, 2.6815615859885194199148049996411692254959, 5.3631231719770388398296099992823384509917, 1.0726246343954077679659219998564676901983, 2.1452492687908155359318439997129353803967, 4.2904985375816310718636879994258707607934, 8.5809970751632621437273759988517415215868, 1.7161994150326524287454751997703483043174, 3.4323988300653048574909503995406966086347, 6.8647976601306097149819007990813932172694, 1.3729595320261219429963801598162786434539, 2.7459190640522438859927603196325572869078, 5.4918381281044877719855206392651145738155, 1.0983676256208975543971041278530229147631, 2.1967352512417951087942082557060458295262, 4.3934705024835902175884165114120916590524, 8.7869410049671804351768330228241833181049, 1.7573882009934360870353666045648366636210, 3.5147764019868721740707332091296733272420, 7.0295528039737443481414664182593466544839, 
1.4059105607947488696282932836518693308968, 2.8118211215894977392565865673037386617936, 5.6236422431789954785131731346074773235871, 1.1247284486357990957026346269214954647174, 2.2494568972715981914052692538429909294348, 4.4989137945431963828105385076859818588697, 8.9978275890863927656210770153719637177394, 1.7995655178172785531242154030743927435479, 3.5991310356345571062484308061487854870958, 7.1982620712691142124968616122975709741915, 1.4396524142538228424993723224595141948383, 2.8793048285076456849987446449190283896766, 5.7586096570152913699974892898380567793532, 1.1517219314030582739994978579676113558706, 2.3034438628061165479989957159352227117413, 4.6068877256122330959979914318704454234826, 9.2137754512244661919959828637408908469651, 1.8427550902448932383991965727481781693930, 3.6855101804897864767983931454963563387861, 7.3710203609795729535967862909927126775721, 1.4742040721959145907193572581985425355144, 2.9484081443918291814387145163970850710288, 5.8968162887836583628774290327941701420577, 1.1793632577567316725754858065588340284115, 2.3587265155134633451509716131176680568231, 4.7174530310269266903019432262353361136462, 9.4349060620538533806038864524706722272923, 1.8869812124107706761207772904941344454585, 3.7739624248215413522415545809882688909169, 7.5479248496430827044831091619765377818338, 1.5095849699286165408966218323953075563668, 3.0191699398572330817932436647906151127335, 6.0383398797144661635864873295812302254671, 1.2076679759428932327172974659162460450934, 2.4153359518857864654345949318324920901868, 4.8306719037715729308691898636649841803737, 9.6613438075431458617383797273299683607473, 1.9322687615086291723476759454659936721495, 3.8645375230172583446953518909319873442989, 7.7290750460345166893907037818639746885979, 1.5458150092069033378781407563727949377196, 3.0916300184138066757562815127455898754391, 6.1832600368276133515125630254911797508783, 1.2366520073655226703025126050982359501757, 2.4733040147310453406050252101964719003513, 
4.9466080294620906812100504203929438007026, 9.8932160589241813624201008407858876014053, 1.9786432117848362724840201681571775202811, 3.9572864235696725449680403363143550405621, 7.9145728471393450899360806726287100811242, 1.5829145694278690179872161345257420162248, 3.1658291388557380359744322690514840324497, 6.3316582777114760719488645381029680648994, 1.2663316555422952143897729076205936129799, 2.5326633110845904287795458152411872259597, 5.0653266221691808575590916304823744519195, 1.0130653244338361715118183260964748903839, 2.0261306488676723430236366521929497807678, 4.0522612977353446860472733043858995615356, 8.1045225954706893720945466087717991230712, 1.6209045190941378744189093217543598246142, 3.2418090381882757488378186435087196492285, 6.4836180763765514976756372870174392984569, 1.2967236152753102995351274574034878596914, 2.5934472305506205990702549148069757193828, 5.1868944611012411981405098296139514387656, 1.0373788922202482396281019659227902877531, 2.0747577844404964792562039318455805755062, 4.1495155688809929585124078636911611510124, 8.2990311377619859170248157273823223020249, 1.6598062275523971834049631454764644604050, 3.3196124551047943668099262909529289208100, 6.6392249102095887336198525819058578416199, 1.3278449820419177467239705163811715683240, 2.6556899640838354934479410327623431366480, 5.3113799281676709868958820655246862732959, 1.0622759856335341973791764131049372546592, 2.1245519712670683947583528262098745093184, 4.2491039425341367895167056524197490186367, 8.4982078850682735790334113048394980372735, 1.6996415770136547158066822609678996074547, 3.3992831540273094316133645219357992149094, 6.7985663080546188632267290438715984298188, 1.3597132616109237726453458087743196859638, 2.7194265232218475452906916175486393719275, 5.4388530464436950905813832350972787438550, 1.0877706092887390181162766470194557487710, 2.1755412185774780362325532940389114975420, 4.3510824371549560724651065880778229950840, 8.7021648743099121449302131761556459901681, 
1.7404329748619824289860426352311291980336, 3.4808659497239648579720852704622583960672, 6.9617318994479297159441705409245167921344, 1.3923463798895859431888341081849033584269, 2.7846927597791718863776682163698067168538, 5.5693855195583437727553364327396134337076, 1.1138771039116687545510672865479226867415, 2.2277542078233375091021345730958453734830, 4.4555084156466750182042691461916907469660, 8.9110168312933500364085382923833814939321, 1.7822033662586700072817076584766762987864, 3.5644067325173400145634153169533525975728, 7.1288134650346800291268306339067051951457, 1.4257626930069360058253661267813410390291, 2.8515253860138720116507322535626820780583, 5.7030507720277440233014645071253641561165, 1.1406101544055488046602929014250728312233, 2.2812203088110976093205858028501456624466, 4.5624406176221952186411716057002913248932, 9.1248812352443904372823432114005826497865, 1.8249762470488780874564686422801165299573, 3.6499524940977561749129372845602330599146, 7.2999049881955123498258745691204661198292, 1.4599809976391024699651749138240932239658, 2.9199619952782049399303498276481864479317, 5.8399239905564098798606996552963728958633, 1.1679847981112819759721399310592745791727, 2.3359695962225639519442798621185491583453, 4.6719391924451279038885597242370983166907, 9.3438783848902558077771194484741966333813, 1.8687756769780511615554238896948393266763, 3.7375513539561023231108477793896786533525, 7.4751027079122046462216955587793573067051, 1.4950205415824409292443391117558714613410, 2.9900410831648818584886782235117429226820, 5.9800821663297637169773564470234858453641, 1.1960164332659527433954712894046971690728, 2.3920328665319054867909425788093943381456, 4.7840657330638109735818851576187886762912, 9.5681314661276219471637703152375773525825, 1.9136262932255243894327540630475154705165, 3.8272525864510487788655081260950309410330, 7.6545051729020975577310162521900618820660, 1.5309010345804195115462032504380123764132, 3.0618020691608390230924065008760247528264, 
6.1236041383216780461848130017520495056528, 1.2247208276643356092369626003504099011306, 2.4494416553286712184739252007008198022611, 4.8988833106573424369478504014016396045222, 9.7977666213146848738957008028032792090445, 1.9595533242629369747791401605606558418089, 3.9191066485258739495582803211213116836178, 7.8382132970517478991165606422426233672356, 1.5676426594103495798233121284485246734471, 3.1352853188206991596466242568970493468942, 6.2705706376413983192932485137940986937885, 1.2541141275282796638586497027588197387577, 2.5082282550565593277172994055176394775154, 5.0164565101131186554345988110352789550308, 1.0032913020226237310869197622070557910062, 2.0065826040452474621738395244141115820123, 4.0131652080904949243476790488282231640246, 8.0263304161809898486953580976564463280492, 1.6052660832361979697390716195312892656098, 3.2105321664723959394781432390625785312197, 6.4210643329447918789562864781251570624394, 1.2842128665889583757912572956250314124879, 2.5684257331779167515825145912500628249758, 5.1368514663558335031650291825001256499515, 1.0273702932711667006330058365000251299903, 2.0547405865423334012660116730000502599806, 4.1094811730846668025320233460001005199612, 8.2189623461693336050640466920002010399224, 1.6437924692338667210128093384000402079845, 3.2875849384677334420256186768000804159690, 6.5751698769354668840512373536001608319379, 1.3150339753870933768102474707200321663876, 2.6300679507741867536204949414400643327752, 5.2601359015483735072409898828801286655503, 1.0520271803096747014481979765760257331101, 2.1040543606193494028963959531520514662201, 4.2081087212386988057927919063041029324403, 8.4162174424773976115855838126082058648805, 1.6832434884954795223171167625216411729761, 3.3664869769909590446342335250432823459522, 6.7329739539819180892684670500865646919044, 1.3465947907963836178536934100173129383809, 2.6931895815927672357073868200346258767618, 5.3863791631855344714147736400692517535235, 1.0772758326371068942829547280138503507047, 
2.1545516652742137885659094560277007014094, 4.3091033305484275771318189120554014028188, 8.6182066610968551542636378241108028056377, 1.7236413322193710308527275648221605611275, 3.4472826644387420617054551296443211222551, 6.8945653288774841234109102592886422445101, 1.3789130657754968246821820518577284489020, 2.7578261315509936493643641037154568978041, 5.5156522631019872987287282074309137956081, 1.1031304526203974597457456414861827591216, 2.2062609052407949194914912829723655182432, 4.4125218104815898389829825659447310364865, 8.8250436209631796779659651318894620729730, 1.7650087241926359355931930263778924145946, 3.5300174483852718711863860527557848291892, 7.0600348967705437423727721055115696583784, 1.4120069793541087484745544211023139316757, 2.8240139587082174969491088422046278633514, 5.6480279174164349938982176844092557267027, 1.1296055834832869987796435368818511453405, 2.2592111669665739975592870737637022906811, 4.5184223339331479951185741475274045813622, 9.0368446678662959902371482950548091627243, 1.8073689335732591980474296590109618325449, 3.6147378671465183960948593180219236650897, 7.2294757342930367921897186360438473301795, 1.4458951468586073584379437272087694660359, 2.8917902937172147168758874544175389320718, 5.7835805874344294337517749088350778641436, 1.1567161174868858867503549817670155728287, 2.3134322349737717735007099635340311456574, 4.6268644699475435470014199270680622913149, 9.2537289398950870940028398541361245826297, 1.8507457879790174188005679708272249165259, 3.7014915759580348376011359416544498330519, 7.4029831519160696752022718833088996661038, 1.4805966303832139350404543766617799332208, 2.9611932607664278700809087533235598664415, 5.9223865215328557401618175066471197328830, 1.1844773043065711480323635013294239465766, 2.3689546086131422960647270026588478931532, 4.7379092172262845921294540053176957863064, 9.4758184344525691842589080106353915726128, 1.8951636868905138368517816021270783145226, 3.7903273737810276737035632042541566290451, 
7.5806547475620553474071264085083132580903, 1.5161309495124110694814252817016626516181, 3.0322618990248221389628505634033253032361, 6.0645237980496442779257011268066506064722, 1.2129047596099288555851402253613301212944, 2.4258095192198577111702804507226602425889, 4.8516190384397154223405609014453204851778, 9.7032380768794308446811218028906409703555, 1.9406476153758861689362243605781281940711, 3.8812952307517723378724487211562563881422, 7.7625904615035446757448974423125127762844, 1.5525180923007089351489794884625025552569, 3.1050361846014178702979589769250051105138, 6.2100723692028357405959179538500102210275, 1.2420144738405671481191835907700020442055, 2.4840289476811342962383671815400040884110, 4.9680578953622685924767343630800081768220, 9.9361157907245371849534687261600163536441, 1.9872231581449074369906937452320032707288, 3.9744463162898148739813874904640065414576, 7.9488926325796297479627749809280130829153, 1.5897785265159259495925549961856026165831, 3.1795570530318518991851099923712052331661, 6.3591141060637037983702199847424104663322, 1.2718228212127407596740439969484820932664, 2.5436456424254815193480879938969641865329, 5.0872912848509630386961759877939283730658, 1.0174582569701926077392351975587856746132, 2.0349165139403852154784703951175713492263, 4.0698330278807704309569407902351426984526, 8.1396660557615408619138815804702853969052, 1.6279332111523081723827763160940570793810, 3.2558664223046163447655526321881141587621, 6.5117328446092326895311052643762283175242, 1.3023465689218465379062210528752456635048, 2.6046931378436930758124421057504913270097, 5.2093862756873861516248842115009826540193, 1.0418772551374772303249768423001965308039, 2.0837545102749544606499536846003930616077, 4.1675090205499089212999073692007861232155, 8.3350180410998178425998147384015722464309, 1.6670036082199635685199629476803144492862, 3.3340072164399271370399258953606288985724, 6.6680144328798542740798517907212577971448, 1.3336028865759708548159703581442515594290, 
2.6672057731519417096319407162885031188579, 5.3344115463038834192638814325770062377158, 1.0668823092607766838527762865154012475432, 2.1337646185215533677055525730308024950863, 4.2675292370431067354111051460616049901726, 8.5350584740862134708222102921232099803453, 1.7070116948172426941644420584246419960691, 3.4140233896344853883288841168492839921381, 6.8280467792689707766577682336985679842762, 1.3656093558537941553315536467397135968552, 2.7312187117075883106631072934794271937105, 5.4624374234151766213262145869588543874210, 1.0924874846830353242652429173917708774842, 2.1849749693660706485304858347835417549684, 4.3699499387321412970609716695670835099368, 8.7398998774642825941219433391341670198736, 1.7479799754928565188243886678268334039747, 3.4959599509857130376487773356536668079494, 6.9919199019714260752975546713073336158989, 1.3983839803942852150595109342614667231798, 2.7967679607885704301190218685229334463595, 5.5935359215771408602380437370458668927191, 1.1187071843154281720476087474091733785438, 2.2374143686308563440952174948183467570876, 4.4748287372617126881904349896366935141753, 8.9496574745234253763808699792733870283505, 1.7899314949046850752761739958546774056701, 3.5798629898093701505523479917093548113402, 7.1597259796187403011046959834187096226804, 1.4319451959237480602209391966837419245361, 2.8638903918474961204418783933674838490722, 5.7277807836949922408837567867349676981443, 1.1455561567389984481767513573469935396289, 2.2911123134779968963535027146939870792577, 4.5822246269559937927070054293879741585155, 9.1644492539119875854140108587759483170310, 1.8328898507823975170828021717551896634062, 3.6657797015647950341656043435103793268124, 7.3315594031295900683312086870207586536248, 1.4663118806259180136662417374041517307250, 2.9326237612518360273324834748083034614499, 5.8652475225036720546649669496166069228998, 1.1730495045007344109329933899233213845800, 2.3460990090014688218659867798466427691599, 4.6921980180029376437319735596932855383198, 
9.3843960360058752874639471193865710766397, 1.8768792072011750574927894238773142153279, 3.7537584144023501149855788477546284306559, 7.5075168288047002299711576955092568613118, 1.5015033657609400459942315391018513722624, 3.0030067315218800919884630782037027445247, 6.0060134630437601839769261564074054890494, 1.2012026926087520367953852312814810978099, 2.4024053852175040735907704625629621956198, 4.8048107704350081471815409251259243912395, 9.6096215408700162943630818502518487824791, 1.9219243081740032588726163700503697564958, 3.8438486163480065177452327401007395129916, 7.6876972326960130354904654802014790259832, 1.5375394465392026070980930960402958051966, 3.0750788930784052141961861920805916103933, 6.1501577861568104283923723841611832207866, 1.2300315572313620856784744768322366441573, 2.4600631144627241713569489536644732883146, 4.9201262289254483427138979073289465766293, 9.8402524578508966854277958146578931532585, 1.9680504915701793370855591629315786306517, 3.9361009831403586741711183258631572613034, 7.8722019662807173483422366517263145226068, 1.5744403932561434696684473303452629045214, 3.1488807865122869393368946606905258090427, 6.2977615730245738786737893213810516180855, 1.2595523146049147757347578642762103236171, 2.5191046292098295514695157285524206472342, 5.0382092584196591029390314571048412944684, 1.0076418516839318205878062914209682588937, 2.0152837033678636411756125828419365177874, 4.0305674067357272823512251656838730355747, 8.0611348134714545647024503313677460711494, 1.6122269626942909129404900662735492142299, 3.2244539253885818258809801325470984284598, 6.4489078507771636517619602650941968569195, 1.2897815701554327303523920530188393713839, 2.5795631403108654607047841060376787427678, 5.1591262806217309214095682120753574855356, 1.0318252561243461842819136424150714971071, 2.0636505122486923685638272848301429942142, 4.1273010244973847371276545696602859884285, 8.2546020489947694742553091393205719768570, 1.6509204097989538948510618278641143953714, 
3.3018408195979077897021236557282287907428, 6.6036816391958155794042473114564575814856, 1.3207363278391631158808494622912915162971, 2.6414726556783262317616989245825830325942, 5.2829453113566524635233978491651660651885, 1.0565890622713304927046795698330332130377, 2.1131781245426609854093591396660664260754, 4.2263562490853219708187182793321328521508, 8.4527124981706439416374365586642657043016, 1.6905424996341287883274873117328531408603, 3.3810849992682575766549746234657062817206, 6.7621699985365151533099492469314125634412, 1.3524339997073030306619898493862825126882, 2.7048679994146060613239796987725650253765, 5.4097359988292121226479593975451300507530, 1.0819471997658424245295918795090260101506, 2.1638943995316848490591837590180520203012, 4.3277887990633696981183675180361040406024, 8.6555775981267393962367350360722080812048, 1.7311155196253478792473470072144416162410, 3.4622310392506957584946940144288832324819, 6.9244620785013915169893880288577664649638, 1.3848924157002783033978776057715532929928, 2.7697848314005566067957552115431065859855, 5.5395696628011132135915104230862131719711, 1.1079139325602226427183020846172426343942, 2.2158278651204452854366041692344852687884, 4.4316557302408905708732083384689705375769, 8.8633114604817811417464166769379410751537, 1.7726622920963562283492833353875882150307, 3.5453245841927124566985666707751764300615, 7.0906491683854249133971333415503528601230, 1.4181298336770849826794266683100705720246, 2.8362596673541699653588533366201411440492, 5.6725193347083399307177066732402822880984, 1.1345038669416679861435413346480564576197, 2.2690077338833359722870826692961129152393, 4.5380154677666719445741653385922258304787, 9.0760309355333438891483306771844516609574, 1.8152061871066687778296661354368903321915, 3.6304123742133375556593322708737806643830, 7.2608247484266751113186645417475613287659, 1.4521649496853350222637329083495122657532, 2.9043298993706700445274658166990245315064, 5.8086597987413400890549316333980490630127, 
1.1617319597482680178109863266796098126025, 2.3234639194965360356219726533592196252051, 4.6469278389930720712439453067184392504102, 9.2938556779861441424878906134368785008204, 1.8587711355972288284975781226873757001641, 3.7175422711944576569951562453747514003282, 7.4350845423889153139903124907495028006563, 1.4870169084777830627980624981499005601313, 2.9740338169555661255961249962998011202625, 5.9480676339111322511922499925996022405250, 1.1896135267822264502384499985199204481050, 2.3792270535644529004768999970398408962100, 4.7584541071289058009537999940796817924200, 9.5169082142578116019075999881593635848401, 1.9033816428515623203815199976318727169680, 3.8067632857031246407630399952637454339360, 7.6135265714062492815260799905274908678721, 1.5227053142812498563052159981054981735744, 3.0454106285624997126104319962109963471488, 6.0908212571249994252208639924219926942976, 1.2181642514249998850441727984843985388595, 2.4363285028499997700883455969687970777191, 4.8726570056999995401766911939375941554381, 9.7453140113999990803533823878751883108762, 1.9490628022799998160706764775750376621752, 3.8981256045599996321413529551500753243505, 7.7962512091199992642827059103001506487010, 1.5592502418239998528565411820600301297402, 3.1185004836479997057130823641200602594804, 6.2370009672959994114261647282401205189608, 1.2474001934591998822852329456480241037922, 2.4948003869183997645704658912960482075843, 4.9896007738367995291409317825920964151686, 9.9792015476735990582818635651841928303373, 1.9958403095347198116563727130368385660675, 3.9916806190694396233127454260736771321349, 7.9833612381388792466254908521473542642698, 1.5966722476277758493250981704294708528540, 3.1933444952555516986501963408589417057079, 6.3866889905111033973003926817178834114158, 1.2773377981022206794600785363435766822832, 2.5546755962044413589201570726871533645663, 5.1093511924088827178403141453743067291327, 1.0218702384817765435680628290748613458265, 2.0437404769635530871361256581497226916531, 
4.0874809539271061742722513162994453833061, 8.1749619078542123485445026325988907666123, 1.6349923815708424697089005265197781533225, 3.2699847631416849394178010530395563066449, 6.5399695262833698788356021060791126132898, 1.3079939052566739757671204212158225226580, 2.6159878105133479515342408424316450453159, 5.2319756210266959030684816848632900906319, 1.0463951242053391806136963369726580181264, 2.0927902484106783612273926739453160362527, 4.1855804968213567224547853478906320725055, 8.3711609936427134449095706957812641450110, 1.6742321987285426889819141391562528290022, 3.3484643974570853779638282783125056580044, 6.6969287949141707559276565566250113160088, 1.3393857589828341511855313113250022632018, 2.6787715179656683023710626226500045264035, 5.3575430359313366047421252453000090528070, 1.0715086071862673209484250490600018105614, 2.1430172143725346418968500981200036211228, 4.2860344287450692837937001962400072422456, 8.5720688574901385675874003924800144844912, 1.7144137714980277135174800784960028968982, 3.4288275429960554270349601569920057937965, 6.8576550859921108540699203139840115875930, 1.3715310171984221708139840627968023175186, 2.7430620343968443416279681255936046350372, 5.4861240687936886832559362511872092700744, 1.0972248137587377366511872502374418540149, 2.1944496275174754733023745004748837080298, 4.3888992550349509466047490009497674160595, 8.7777985100699018932094980018995348321190, 1.7555597020139803786418996003799069664238, 3.5111194040279607572837992007598139328476, 7.0222388080559215145675984015196278656952, 1.4044477616111843029135196803039255731390, 2.8088955232223686058270393606078511462781, 5.6177910464447372116540787212157022925562, 1.1235582092889474423308157442431404585112, 2.2471164185778948846616314884862809170225, 4.4942328371557897693232629769725618340449, 8.9884656743115795386465259539451236680899, 1.7976931348623159077293051907890247336180 }; int exppow[2048] = { 
-308,-308,-308,-308,-307,-307,-307,-306,-306,-306,-305,-305,-305,-305,-304,-304,-304,-303,-303,-303,-302,-302,-302,-302,-301,-301,-301,-300,-300,-300,-299,-299,-299,-299,-298,-298,-298,-297,-297,-297,-296,-296,-296,-296,-295,-295,-295,-294,-294,-294,-293,-293,-293,-292,-292,-292,-292,-291,-291,-291,-290,-290,-290,-289,-289,-289,-289,-288,-288,-288,-287,-287,-287,-286,-286,-286,-286,-285,-285,-285,-284,-284,-284,-283,-283,-283,-283,-282,-282,-282,-281,-281,-281,-280,-280,-280,-280,-279,-279,-279,-278,-278,-278,-277,-277,-277,-277,-276,-276,-276,-275,-275,-275,-274,-274,-274,-274,-273,-273,-273,-272,-272,-272,-271,-271,-271,-271,-270,-270,-270,-269,-269,-269,-268,-268,-268,-268,-267,-267,-267,-266,-266,-266,-265,-265,-265,-265,-264,-264,-264,-263,-263,-263,-262,-262,-262,-261,-261,-261,-261,-260,-260,-260,-259,-259,-259,-258,-258,-258,-258,-257,-257,-257,-256,-256,-256,-255,-255,-255,-255,-254,-254,-254,-253,-253,-253,-252,-252,-252,-252,-251,-251,-251,-250,-250,-250,-249,-249,-249,-249,-248,-248,-248,-247,-247,-247,-246,-246,-246,-246,-245,-245,-245,-244,-244,-244,-243,-243,-243,-243,-242,-242,-242,-241,-241,-241,-240,-240,-240,-240,-239,-239,-239,-238,-238,-238,-237,-237,-237,-237,-236,-236,-236,-235,-235,-235,-234,-234,-234,-233,-233,-233,-233,-232,-232,-232,-231,-231,-231,-230,-230,-230,-230,-229,-229,-229,-228,-228,-228,-227,-227,-227,-227,-226,-226,-226,-225,-225,-225,-224,-224,-224,-224,-223,-223,-223,-222,-222,-222,-221,-221,-221,-221,-220,-220,-220,-219,-219,-219,-218,-218,-218,-218,-217,-217,-217,-216,-216,-216,-215,-215,-215,-215,-214,-214,-214,-213,-213,-213,-212,-212,-212,-212,-211,-211,-211,-210,-210,-210,-209,-209,-209,-209,-208,-208,-208,-207,-207,-207,-206,-206,-206,-206,-205,-205,-205,-204,-204,-204,-203,-203,-203,-202,-202,-202,-202,-201,-201,-201,-200,-200,-200,-199,-199,-199,-199,-198,-198,-198,-197,-197,-197,-196,-196,-196,-196,-195,-195,-195,-194,-194,-194,-193,-193,-193,-193,-192,-192,-192,-191,-191,-191,-190,-190,-190,-190,-189,-189,-189,-188,
-188,-188,-187,-187,-187,-187,-186,-186,-186,-185,-185,-185,-184,-184,-184,-184,-183,-183,-183,-182,-182,-182,-181,-181,-181,-181,-180,-180,-180,-179,-179,-179,-178,-178,-178,-178,-177,-177,-177,-176,-176,-176,-175,-175,-175,-174,-174,-174,-174,-173,-173,-173,-172,-172,-172,-171,-171,-171,-171,-170,-170,-170,-169,-169,-169,-168,-168,-168,-168,-167,-167,-167,-166,-166,-166,-165,-165,-165,-165,-164,-164,-164,-163,-163,-163,-162,-162,-162,-162,-161,-161,-161,-160,-160,-160,-159,-159,-159,-159,-158,-158,-158,-157,-157,-157,-156,-156,-156,-156,-155,-155,-155,-154,-154,-154,-153,-153,-153,-153,-152,-152,-152,-151,-151,-151,-150,-150,-150,-150,-149,-149,-149,-148,-148,-148,-147,-147,-147,-146,-146,-146,-146,-145,-145,-145,-144,-144,-144,-143,-143,-143,-143,-142,-142,-142,-141,-141,-141,-140,-140,-140,-140,-139,-139,-139,-138,-138,-138,-137,-137,-137,-137,-136,-136,-136,-135,-135,-135,-134,-134,-134,-134,-133,-133,-133,-132,-132,-132,-131,-131,-131,-131,-130,-130,-130,-129,-129,-129,-128,-128,-128,-128,-127,-127,-127,-126,-126,-126,-125,-125,-125,-125,-124,-124,-124,-123,-123,-123,-122,-122,-122,-122,-121,-121,-121,-120,-120,-120,-119,-119,-119,-119,-118,-118,-118,-117,-117,-117,-116,-116,-116,-115,-115,-115,-115,-114,-114,-114,-113,-113,-113,-112,-112,-112,-112,-111,-111,-111,-110,-110,-110,-109,-109,-109,-109,-108,-108,-108,-107,-107,-107,-106,-106,-106,-106,-105,-105,-105,-104,-104,-104,-103,-103,-103,-103,-102,-102,-102,-101,-101,-101,-100,-100,-100,-100,-99,-99,-99,-98,-98,-98,-97,-97,-97,-97,-96,-96,-96,-95,-95,-95,-94,-94,-94,-94,-93,-93,-93,-92,-92,-92,-91,-91,-91,-91,-90,-90,-90,-89,-89,-89,-88,-88,-88,-87,-87,-87,-87,-86,-86,-86,-85,-85,-85,-84,-84,-84,-84,-83,-83,-83,-82,-82,-82,-81,-81,-81,-81,-80,-80,-80,-79,-79,-79,-78,-78,-78,-78,-77,-77,-77,-76,-76,-76,-75,-75,-75,-75,-74,-74,-74,-73,-73,-73,-72,-72,-72,-72,-71,-71,-71,-70,-70,-70,-69,-69,-69,-69,-68,-68,-68,-67,-67,-67,-66,-66,-66,-66,-65,-65,-65,-64,-64,-64,-63,-63,-63,-63,-62,-62,-62,-61,-61,-61,-60,-60,-
60,-60,-59,-59,-59,-58,-58,-58,-57,-57,-57,-56,-56,-56,-56,-55,-55,-55,-54,-54,-54,-53,-53,-53,-53,-52,-52,-52,-51,-51,-51,-50,-50,-50,-50,-49,-49,-49,-48,-48,-48,-47,-47,-47,-47,-46,-46,-46,-45,-45,-45,-44,-44,-44,-44,-43,-43,-43,-42,-42,-42,-41,-41,-41,-41,-40,-40,-40,-39,-39,-39,-38,-38,-38,-38,-37,-37,-37,-36,-36,-36,-35,-35,-35,-35,-34,-34,-34,-33,-33,-33,-32,-32,-32,-32,-31,-31,-31,-30,-30,-30,-29,-29,-29,-28,-28,-28,-28,-27,-27,-27,-26,-26,-26,-25,-25,-25,-25,-24,-24,-24,-23,-23,-23,-22,-22,-22,-22,-21,-21,-21,-20,-20,-20,-19,-19,-19,-19,-18,-18,-18,-17,-17,-17,-16,-16,-16,-16,-15,-15,-15,-14,-14,-14,-13,-13,-13,-13,-12,-12,-12,-11,-11,-11,-10,-10,-10,-10,-9,-9,-9,-8,-8,-8,-7,-7,-7,-7,-6,-6,-6,-5,-5,-5,-4,-4,-4,-4,-3,-3,-3,-2,-2,-2,-1,-1,-1,0,0,0,0,1,1,1,2,2,2,3,3,3,3,4,4,4,5,5,5,6,6,6,6,7,7,7,8,8,8,9,9,9,9,10,10,10,11,11,11,12,12,12,12,13,13,13,14,14,14,15,15,15,15,16,16,16,17,17,17,18,18,18,18,19,19,19,20,20,20,21,21,21,21,22,22,22,23,23,23,24,24,24,24,25,25,25,26,26,26,27,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31,31,32,32,32,33,33,33,34,34,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,39,39,39,40,40,40,40,41,41,41,42,42,42,43,43,43,43,44,44,44,45,45,45,46,46,46,46,47,47,47,48,48,48,49,49,49,49,50,50,50,51,51,51,52,52,52,52,53,53,53,54,54,54,55,55,55,55,56,56,56,57,57,57,58,58,58,59,59,59,59,60,60,60,61,61,61,62,62,62,62,63,63,63,64,64,64,65,65,65,65,66,66,66,67,67,67,68,68,68,68,69,69,69,70,70,70,71,71,71,71,72,72,72,73,73,73,74,74,74,74,75,75,75,76,76,76,77,77,77,77,78,78,78,79,79,79,80,80,80,80,81,81,81,82,82,82,83,83,83,83,84,84,84,85,85,85,86,86,86,86,87,87,87,88,88,88,89,89,89,90,90,90,90,91,91,91,92,92,92,93,93,93,93,94,94,94,95,95,95,96,96,96,96,97,97,97,98,98,98,99,99,99,99,100,100,100,101,101,101,102,102,102,102,103,103,103,104,104,104,105,105,105,105,106,106,106,107,107,107,108,108,108,108,109,109,109,110,110,110,111,111,111,111,112,112,112,113,113,113,114,114,114,114,115,115,115,116,116,116,117,117,117,118,118,118,118,119,119,119,120,120,120,1
21,121,121,121,122,122,122,123,123,123,124,124,124,124,125,125,125,126,126,126,127,127,127,127,128,128,128,129,129,129,130,130,130,130,131,131,131,132,132,132,133,133,133,133,134,134,134,135,135,135,136,136,136,136,137,137,137,138,138,138,139,139,139,139,140,140,140,141,141,141,142,142,142,142,143,143,143,144,144,144,145,145,145,145,146,146,146,147,147,147,148,148,148,149,149,149,149,150,150,150,151,151,151,152,152,152,152,153,153,153,154,154,154,155,155,155,155,156,156,156,157,157,157,158,158,158,158,159,159,159,160,160,160,161,161,161,161,162,162,162,163,163,163,164,164,164,164,165,165,165,166,166,166,167,167,167,167,168,168,168,169,169,169,170,170,170,170,171,171,171,172,172,172,173,173,173,173,174,174,174,175,175,175,176,176,176,177,177,177,177,178,178,178,179,179,179,180,180,180,180,181,181,181,182,182,182,183,183,183,183,184,184,184,185,185,185,186,186,186,186,187,187,187,188,188,188,189,189,189,189,190,190,190,191,191,191,192,192,192,192,193,193,193,194,194,194,195,195,195,195,196,196,196,197,197,197,198,198,198,198,199,199,199,200,200,200,201,201,201,201,202,202,202,203,203,203,204,204,204,205,205,205,205,206,206,206,207,207,207,208,208,208,208,209,209,209,210,210,210,211,211,211,211,212,212,212,213,213,213,214,214,214,214,215,215,215,216,216,216,217,217,217,217,218,218,218,219,219,219,220,220,220,220,221,221,221,222,222,222,223,223,223,223,224,224,224,225,225,225,226,226,226,226,227,227,227,228,228,228,229,229,229,229,230,230,230,231,231,231,232,232,232,232,233,233,233,234,234,234,235,235,235,236,236,236,236,237,237,237,238,238,238,239,239,239,239,240,240,240,241,241,241,242,242,242,242,243,243,243,244,244,244,245,245,245,245,246,246,246,247,247,247,248,248,248,248,249,249,249,250,250,250,251,251,251,251,252,252,252,253,253,253,254,254,254,254,255,255,255,256,256,256,257,257,257,257,258,258,258,259,259,259,260,260,260,260,261,261,261,262,262,262,263,263,263,264,264,264,264,265,265,265,266,266,266,267,267,267,267,268,268,268,269,269,269,270,270,270,270,271,2
71,271,272,272,272,273,273,273,273,274,274,274,275,275,275,276,276,276,276,277,277,277,278,278,278,279,279,279,279,280,280,280,281,281,281,282,282,282,282,283,283,283,284,284,284,285,285,285,285,286,286,286,287,287,287,288,288,288,288,289,289,289,290,290,290,291,291,291,291,292,292,292,293,293,293,294,294,294,295,295,295,295,296,296,296,297,297,297,298,298,298,298,299,299,299,300,300,300,301,301,301,301,302,302,302,303,303,303,304,304,304,304,305,305,305,306,306,306,307,307,307,307,308}; int monthday[366] = { 229,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326, 327,328,329,330,331,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424, 425,426,427,428,429,430,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523, 524,525,526,527,528,529,530,531,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621, 622,623,624,625,626,627,628,629,630,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720, 721,722,723,724,725,726,727,728,729,730,731,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818, 819,820,821,822,823,824,825,826,827,828,829,830,831,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916, 917,918,919,920,921,922,923,924,925,926,927,928,929,930,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012, 1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1030,1031,1101,1102,1103,1104, 1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1118,1119,1120,1121,1122,1123,1124,1125,1126,1127, 1128,1129,1130,1201,1202,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1218,1219,1220, 1221,1222,1223,1224,1225,1226,1227,1228,1229,1230,1231,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115, 
116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,201,202,203,204,205,206,207,208,209,210,211,212,213, 214,215,216,217,218,219,220,221,222,223,224,225,226,227,228 }; data.table/src/dogroups.c0000644000175100001440000010056013172212367015072 0ustar hornikusers#include "data.table.h" #include //#include #include #include #include void setSizes() { // called by init.c int i; for (i=0;i<100;i++) sizes[i]=0; // only these types are currently allowed as column types : sizes[INTSXP] = sizeof(int); // integer and factor sizes[LGLSXP] = sizeof(int); // logical sizes[REALSXP] = sizeof(double); // numeric sizes[STRSXP] = sizeof(SEXP *); // character sizes[VECSXP] = sizeof(SEXP *); // a column itself can be a list() for (i=0;i<100;i++) { if (sizes[i]>8) error("Type %d is sizeof() greater than 8 bytes on this machine. We haven't tested on any architecture greater than 64bit, yet.", i); // One place we need the largest sizeof (assumed to be 8 bytes) is the working memory malloc in reorder.c } SelfRefSymbol = install(".internal.selfref"); } SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verbose) { R_len_t i, j, k, rownum, ngrp, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, r, thisansloc, grpn, thislen, igrp, vlen, origIlen=0, origSDnrow=0; int protecti=0; SEXP names, names2, xknames, bynames, dtnames, ans=NULL, jval, thiscol, SDall, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, listwrap, target, source, tmp; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE, recycleWarn=TRUE; size_t size; // must be size_t, otherwise bug #5305 (integer overflow in memcpy) clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; if (!isInteger(order)) error("Internal error: order not integer vector"); //if (TYPEOF(starts) != INTSXP) error("Internal error: starts not integer"); //if (TYPEOF(lens) != INTSXP) error("Internal error: 
lens not integer"); // starts can now be NA (<0): if (INTEGER(starts)[0]<0 || INTEGER(lens)[0]<0) error("starts[1]<0 or lens[1]<0"); if (!isNull(jiscols) && LENGTH(order) && !LOGICAL(on)[0]) error("Internal error: jiscols not NULL but o__ has length"); if (!isNull(xjiscols) && LENGTH(order) && !LOGICAL(on)[0]) error("Internal error: xjiscols not NULL but o__ has length"); if(!isEnvironment(env)) error("’env’ should be an environment"); ngrp = length(starts); // the number of groups (nrow(groups) will be larger when by) ngrpcols = length(grpcols); // fix for longstanding FR/bug, #495. E.g., DT[, c(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:v3] resulted in error.. the idea is, 1) we create .SDall, which is normally == .SD. But if extra vars are detected in jexp other than .SD, then .SD becomes a shallow copy of .SDall with only .SDcols in .SD. Since internally, we don't make a copy, changing .SDall will reflect in .SD. Hopefully this'll workout :-). SDall = findVar(install(".SDall"), env); defineVar(sym_BY, BY = allocVector(VECSXP, ngrpcols), env); bynames = PROTECT(allocVector(STRSXP, ngrpcols)); protecti++; // TO DO: do we really need bynames, can we assign names afterwards in one step? for (i=0; i