lmbench-3.0-a9/0000775000076400007640000000000010723011657013167 5ustar staelinstaelinlmbench-3.0-a9/doc/0000775000076400007640000000000010723011655013732 5ustar staelinstaelinlmbench-3.0-a9/doc/Makefile0000664000076400007640000000575407633046051015411 0ustar staelinstaelin# Makefile for lmbench doc subdir. # $Id$ SHELL=/bin/sh DESC = description.ms USENIX = tmac.usenix usenix96.ms PIC = ctx.pic mem.pic SCRIPTS = ../scripts/ BASE=/usr/local MANDIR=${BASE}/man MAN = \ bargraph.1 graph.1 \ lmbench.3 reporting.3 results.3 timing.3 \ lmbench.8 mhz.8 cache.8 line.8 tlb.8 lmdd.8 \ lat_proc.8 lat_mmap.8 lat_ctx.8 lat_syscall.8 lat_pipe.8 \ lat_http.8 lat_tcp.8 lat_udp.8 lat_rpc.8 lat_connect.8 lat_fs.8 \ lat_ops.8 lat_pagefault.8 lat_mem_rd.8 lat_select.8 \ lat_fifo.8 lat_fcntl.8 lat_sig.8 lat_unix.8 lat_unix_connect.8 \ bw_file_rd.8 bw_mem.8 bw_mmap_rd.8 \ bw_pipe.8 bw_tcp.8 bw_unix.8 \ par_ops.8 par_mem.8 ALL = $(DESC) $(USENIX) $(PIC) $(MAN) $(REFER) references .SUFFIXES: .pic .fig .fig.pic: fig2dev -L pic $< $*.pic PS ps: $(ALL) gindxbib references groff -t -e -G -s -p -R $(USENIX) > USENIX.PS #groff -s -p -mgs $(DESC) > DESC.PS #groff -fH -man $(MAN) > MAN.PS X x: $(ALL) gindxbib references $(SCRIPTS)xroff -t -e -s -p -R $(USENIX) #$(SCRIPTS)xroff -s -p -mgs $(DESC) #$(SCRIPTS)xroff -man -fH $(MAN) text: $(ALL) gindxbib references gsoelim usenix96.ms | sed "s/expand doublebox/center/" | \ sed s/doublebox// > Fixed.ms groff -Tascii -t -e -s -p -R -mgs Fixed.ms 2>/dev/null | colcrt - | more userguide.ps: $(ALL) references-userguide userguide.ms \ lmbench3_arch.pic lmbench3_signals.pic ctx.tbl \ bw_allmem.tbl bw_ipc.tbl bw_reread2.tbl bw_tcp.tbl \ lat_allmem.tbl lat_allproc.tbl lat_connect.tbl \ lat_disk.tbl lat_fs.tbl lat_ipc.tbl lat_nullsys.tbl \ lat_pipe.tbl lat_signal.tbl lat_tcp.tbl lat_udp.tbl gindxbib references-userguide groff -t -e -G -s -p -R tmac.usenix userguide.ms > userguide.ps memhier.ps: $(ALL) memhier-color.d memhier-tlb.d memhier-line.d references-memhier memhier.ms gindxbib references-memhier groff -G -t -e -s -p -R tmac.usenix memhier.ms > memhier.ps # ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-color.graph > memhier-color.pic # ../scripts/graph -xm -logx -small -below -nomarks -nospace memhier-line.graph > memhier-line.pic # ../scripts/graph -logx -small -below -nomarks -nospace memhier-tlb.graph > memhier-tlb.pic lmbench3.ps: $(ALL) references-lmbench3 lmbench3.ms \ lmbench3_arch.pic lmbench3_signals.pic gindxbib references-lmbench3 groff -G -t -e -s -p -R tmac.usenix lmbench3.ms > lmbench3.ps parallel.ps: $(ALL) references-parallel parallel.ms gindxbib references-parallel groff -G -t -e -s -p -R tmac.usenix parallel.ms > parallel.ps install: $(MAN) for f in $(MAN); do \ for s in 1 2 3 4 5 6 7 8 9; do \ if [ ! -d ${MANDIR}/man$${s} ]; then \ mkdir -p ${MANDIR}/man$${s}; \ fi; \ base=`basename $${f} .$${s}`; \ if [ "$${base}.$${s}" = "$$f" ]; then \ cp $$f ${MANDIR}/man$${s}/; \ fi; \ done; \ done get: $(ALL) edit: get -e -s $(ALL) $(MAN): get -s $(MAN) $(PIC): get -s $(PIC) $(DESC): get -s $(DESC) $(USENIX): get -s $(USENIX) clean: -bk clean /bin/rm -f *.PS XXX bw.pic memrd_bcopy_comp.pic references.i lmbench-3.0-a9/doc/bargraph.10000664000076400007640000000710407045412511015603 0ustar staelinstaelin.\" $Id$ .TH BARGRAPH 1 .SH NAME bargraph \- compile bar graphs into pic input .SH SYNOPSIS .B bargraph [ .I filename \&.\|.\|. 
] .SH DESCRIPTION .LP .B bargraph is a perl script which takes a set of Y data with labels and generates a (human readable) pic script that will produce the bar graph. The output (pic input) is commented and is designed such that you should be able to go in and adjust it to fit your document should you need to do so. .LP The input data format is: .sp .nf .in +4 3 foo bar 9 bigger foo "Silly example .in .fi .sp with output like .sp .nf .in +2 .ft CW bigger foo +----------+ | | foo | | bar | | +----------+ | | | | | | +----------+ +----------+ ------------------------------- 3 9 Silly example .ft .in .fi .SH OPTIONS The following command line options are available .TP 10 -big Make the x/y defaults be 7.5 inches, crank up the title size, and don't put a spacer at the top. Used for printing a graph on a full page. .TP -nobox Do not put an outline box around the bargraph. .SH "CONTROL OPTIONS" The following may be included in the graph to control the format of the graph. They must be at the beginning of a line and by themselves. .TP 18 %ps point size. Default is 10. .TP %ft font. Default is CB. .TP %labelgap the space in inches between fill labels. The bars may be filled with different fill values (no patterns yet, pic doesn't do that). If you want to label these, the labels are labelgap inches apart. Default is 1.5 inches. .TP %xsize the width of the graph in inches. Default is 7 inches. .TP %ysize the height of the graph in inches. Default is 6 inches. .TP %Title n|s the title of the bargraph. The title option is followed by a a "n"orth (top) or "s"outh (bottom) indicator which controls placement of the title. No default. .TP %titleplus <val> increases the size of the title in pointsize. Default is 0. .TP %boxpercent <val> a value between 0 and 100 that controls how closely the bars are to each other. A value of 100 means the bars touch. Default is 75. .TP %worse <D> <W> An idiot arrow is drawn to indicate which way is worse. <D> is the direction and must be "up" or "down". <W> is the location specifier and must be one of "n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. .TP %better <D> <W> An idiot arrow is drawn to indicate which way is better. <D> is the direction and must be "up" or "down". <W> is the location specifier and must be one of "n"orth, "w"est, "e"ast, "s"outh, "nw" northwest, ne, sw, se, etc. .TP %fakemax pretend that one data point was this big when autoscaling. THis is used to make a series of bargraphs be all drawn to the same scale. .SH "FILL CONTROL" Each datum may be follwed by a fill specifier as follows .sp .5 .ti +.5i 3 foo bar %fill.5 .sp .5 Labels may be specified to group a set of data that all have the same data. If a line appears like .sp .5 .ti +.5i %label.5 The foo bar data .sp .5 then you get a label below the graph. .SH "SEE ALSO" .BR gtroff (1), .BR graph (1), .BR gpic (1) .SH TODO Make a -horizontal option that prints the graphs the other way. .LP Hack pick to get access to postscripts stipple patterns. .SH BUGS This isn't done. It isn't integrated with the groff preprocessor yet. It doesn't know about .GS/.GE thingys. I use it to manually generate a pic file and then include that. I have to talk to James to see if he wants it as part of the gpic stuff. 
������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/benchmarks�����������������������������������������������������������������������0000664�0000764�0000764�00000003653�07045412511�016000� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������Theme Data movement and the cost thereof Latency Time per operation CPU cycles per operation Bandwidth MB / sec CPU cycles / MB Media Memory (load, bcopy) Disk (randoms, sequentials) File system (directory ops, sequential) Network (hot potato, transfer) Pipes (hot potato, transfer) VM system (mmaps/munmaps, bcopy) Systems All Unix systems Windows NT VMS (?) Mainframes (?) Memory Small transfers (randoms) Load latency Large transfers (sequential) Bcopy bandwidth Processes Null process execution time Context switching Misc Null entry into the system Networking Small transfers (randoms) Transfers per second CPU cycles per transfer socket/bind/close per second Large transfers (sequential) MB per second CPU cycles per MB Disks Small transfers (randoms) Transfers per second CPU cycles per transfer Large transfers (sequential) MB per second CPU cycles per MB File system Small transfers (randoms) Creates / second Removes / second Random I/O's per second in large file CPU cycles per transfer MB / sec when reading many related small files Large files MB / second read/write CPU cycles per MB Hardness Measure fsck time? Virtual memory system Creation mmaps per second munmaps per second Also vary size of mapped region Small transfers (randoms) Random reads per second of large mmaped file CPU cycles per read Large transfers (cached sequential) MB per second read rate CPU cycles per MB �������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_allmem.tbl��������������������������������������������������������������������0000664�0000764�0000764�00000002661�07507604561�016413� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; c|c s|c s l|c c|c c l|r r|r r. 
Bcopy Memory System \fBunrolled\fP libc read write = DEC Alpha 41 39 76 78\ DEC Alpha 46 46 88 91\ DEC Alpha 46 45 79 91\ DEC Alpha 38 40 69 84\ SunOS-5.4 sun4d 22 21 47 38\ DEC Alpha 36 36 55 72\ DEC Alpha 38 38 64 79\ SunOS-5.4 sun4m 25 23 64 51\ SunOS-5.4 sun4m 24 23 59 40\ SunOS-5.4 sun4d 16 14 36 28\ SunOS-5.4 sun4m 31 26 80 62\ Sun SC1000 17 15 38 31\ Sun Ultra1 85 167 129 152\ Linux alpha 40 40 74 72\ Linux i686 42 57 205 56\ Linux i586 30 31 61 50\ Linux alpha 39 39 73 71\ Unixware/i686 65 55 214 86\ Linux i586 38 42 74 75\ IBM Power2 242 171 205 364\ IBM PowerPC 21 21 63 26\ dgux mc88110 17 17 37 19\ DEC Alpha 15 15 46 20\ IRIX64 IP21 68 70 92 90\ IRIX64-601 IP26 41 32 65 61\ Linux i586 38 41 74 75\ Linux i586 20 21 60 31\ Linux i586 20 21 58 30\ Linux i586 20 21 60 31\ Linux i486 16 17 33 41\ HP-UX 9000/819 55 48 97 89\ FreeBSD/i586 39 42 73 83\ FreeBSD/i586 38 41 65 83\ FreeBSD/i586 38 41 65 83\ HP-UX 9000/735 32 26 55 52\ HP-UX 9000/735 32 26 54 51\ FreeBSD/i586 36 40 62 83\ IRIX64 IP25 53 41 87 72\ IRIX64 IP19 32 34 65 67\ HP-UX 9000/735 31 26 53 51\ HP-UX 9000/735 32 26 53 51\ HP-UX 9000/755 31 25 49 52\ HP-UX 9000/770 31 33 56 61\ HP-UX 9000/897 19 19 40 37\ IRIX64 IP19 35 36 65 67\ IRIX IP19 33 34 67 72\ IRIX5.3 IP19 32 34 65 68\ IRIX IP22 32 33 68 72\ IRIX5.3 IP22 31 32 69 66\ FreeBSD/i586 39 42 65 83\ .TE .KE �������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_file_rd.8���������������������������������������������������������������������0000664�0000764�0000764�00000003553�07172615471�016137� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH BW_FILE_RD 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME bw_file_rd \- time the reading and summing of a file .SH SYNOPSIS .B bw_file_rd [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I size .I file .SH DESCRIPTION .B bw_file_rd times the read of the specified file in 64KB blocks. Each block is summed up as a seried of 4 byte integers in an unrolled loop. Results are reported in megabytes read per second. .LP The data is not accessed in the user program; the benchmark relies on the operating systems read interface to have actually moved the data. Systems that implement page flipping may fool this benchmark. .LP The benchmark is intended to be used on a file that is in memory, i.e., the benchmark is a reread benchmark. Other file benchmarking can be done with .BR lmdd (8). .LP The size specification may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). .SH OUTPUT Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., .sp .ft CB 8.00 25.33 .ft .SH MEMORY UTILIZATION This benchmark can move up to three times the requested memory. Most Unix systems implement the read system call as a bcopy from kernel space to user space. Bcopy will use 2-3 times as much memory bandwidth: there is one read from the source and a write to the destionation. The write usually results in a cache line read and then a write back of the cache line at some later point. 
Memory utilization might be reduced by 1/3 if the processor architecture implemented ``load cache line'' and ``store cache line'' instructions (as well as ``getcachelinesize''). .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �����������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_ipc.tbl�����������������������������������������������������������������������0000664�0000764�0000764�00000002105�07507604561�015710� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c c l r r r. System bcopy \fBpipe\fP TCP = DEC Alpha 36 32 9\ DEC Alpha 46 54 11\ DEC Alpha 38 23 7\ DEC Alpha 45 35 9\ DEC Alpha 39 32 12\ Linux alpha 39 73 9\ Sun Ultra1 167 61 51\ SunOS-5.4 sun4m 26 11 11\ SunOS-5.4 sun4m 23 24 19\ DEC Alpha 40 24 6\ DEC Alpha 15 17 4\ Linux alpha 40 73 9\ Linux i586 42 34 7\ Linux i486 17 16 6\ Linux i586 31 24 3\ IBM Power2 171 84 10\ IBM PowerPC 21 30 17\ SunOS-5.4 sun4d 14 7 8\ HP-UX 9000/735 26 37 24\ SunOS-5.4 sun4m 23 7 9\ Linux i686 57 73 15\ Linux i586 41 22 5\ Linux i586 21 19 3\ Linux i586 21 18 3\ Linux i586 21 12 3\ Sun SC1000 15 9 11\ SunOS-5.4 sun4d 21 8 9\ IRIX5.3 IP22 32 34 22\ IRIX64-601 IP26 32 37 22\ HP-UX 9000/770 33 53 21\ HP-UX 9000/819 48 37 28\ HP-UX 9000/755 25 38 35\ IRIX IP22 33 32 7\ IRIX64 IP21 70 28 19\ HP-UX 9000/735 26 44 20\ HP-UX 9000/735 26 42 18\ HP-UX 9000/735 26 39 19\ IRIX64 IP25 41 40 26\ IRIX64 IP19 34 27 19\ IRIX64 IP19 36 17 31\ IRIX IP19 34 14 16\ IRIX5.3 IP19 34 12 12\ HP-UX 9000/897 19 26 17\ dgux mc88110 17 8 5\ .TE .KE �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_mem.8�������������������������������������������������������������������������0000664�0000764�0000764�00000005220�07172615471�015302� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH BW_MEM 8 "$Date$" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" .SH NAME bw_mem \- time memory bandwidth .SH SYNOPSIS .B bw_mem_cp [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I size .I rd|wr|rdwr|cp|fwr|frd|bzero|bcopy .I [align] .SH DESCRIPTION .B bw_mem allocates twice the specified amount of memory, zeros it, and then times the copying of the first half to the second half. 
Results are reported in megabytes moved per second.
.LP
The size specification may end with ``k'' or ``m'' to mean
kilobytes (* 1024) or megabytes (* 1024 * 1024).
.SH OUTPUT
Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP,
i.e.,
.sp
.ft CB
8.00 25.33
.ft
.LP
There are nine different memory benchmarks in
.BR bw_mem .
They each measure a slightly different method for reading, writing,
or copying data.
.TP
.B "rd"
measures the time to read data into the processor.
It computes the sum of an array of integer values.
It accesses every fourth word.
.TP
.B "wr"
measures the time to write data to memory.
It assigns a constant value to each member of an array of integer values.
It accesses every fourth word.
.TP
.B "rdwr"
measures the time to read data into memory and then write data to
the same memory location.
For each element in an array it adds the current value to a running
sum before assigning a new (constant) value to the element.
It accesses every fourth word.
.TP
.B "cp"
measures the time to copy data from one location to another.
It does an array copy: dest[i] = source[i].
It accesses every fourth word.
.TP
.B "frd"
measures the time to read data into the processor.
It computes the sum of an array of integer values.
.TP
.B "fwr"
measures the time to write data to memory.
It assigns a constant value to each member of an array of integer values.
.TP
.B "fcp"
measures the time to copy data from one location to another.
It does an array copy: dest[i] = source[i].
.TP
.B "bzero"
measures how fast the system can
.I bzero
memory.
.TP
.B "bcopy"
measures how fast the system can
.I bcopy
data.
.SH MEMORY UTILIZATION
This benchmark can move up to three times the requested memory.
Bcopy will use 2-3 times as much memory bandwidth:
there is one read from the source and a write to the destination.
The write usually results in a cache line read and then a write back
of the cache line at some later point.
Memory utilization might be reduced by 1/3 if the processor
architecture implemented ``load cache line'' and ``store cache line''
instructions (as well as ``getcachelinesize'').
.SH "SEE ALSO"
lmbench(8).
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
lmbench-3.0-a9/doc/bw_mem_rd.8
.\" $Id$
.TH BW_MEM_RD 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH"
.SH NAME
bw_mem_rd \- time memory read rate (with overhead)
.SH SYNOPSIS
.B bw_mem_rd
.I size
.SH DESCRIPTION
.B bw_mem_rd
allocates the specified amount of memory, zeros it, and then times
the reading of that memory as a series of integer loads and adds.
Each four byte integer is loaded and added to the accumulator.
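.LP
The measured inner loop is, in spirit, an unrolled load-and-sum pass over
the buffer.
A minimal sketch of that idea (an illustration only, not the actual
.B bw_mem_rd
source) is:
.sp
.nf
.ft CB
/* Sketch only: sum an integer array with a hand-unrolled loop. */
int
sum(int *p, int n)	/* n assumed to be a multiple of 8 */
{
	int	i, s = 0;

	for (i = 0; i < n; i += 8) {
		s += p[i+0] + p[i+1] + p[i+2] + p[i+3]
		   + p[i+4] + p[i+5] + p[i+6] + p[i+7];
	}
	return (s);
}
.ft
.fi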
.LP The size specification may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). .SH OUTPUT Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., .sp .ft CB 8.00 25.33 .ft .SH MEMORY UTILIZATION This benchmark should move approximately the reported amount of memory. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_mmap_rd.8���������������������������������������������������������������������0000664�0000764�0000764�00000002162�07172615471�016145� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH BW_MMAP_RD 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME bw_mmap_rd \- time the reading and summing of a file .SH SYNOPSIS .B bw_mmap_rd [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I size .I file .SH DESCRIPTION .B bw_mmap_rd creates a memory mapping to the file and then reads the mapping in an unrolled loop similar to that used in bw_mem_rd(8). The benchmark is intended to be used on a file that is in memory, i.e., the benchmark is a reread benchmark. Other file benchmarking can be done with .BR lmdd (8). .LP The size specification may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). .SH OUTPUT Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., .sp .ft CB 8.00 25.33 .ft .SH MEMORY UTILIZATION This benchmark should move approximately the reported amount of memory. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
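.LP
As a rough illustration of the access pattern described above
(a sketch only, not the actual
.B bw_mmap_rd
source; error checking and the timing harness are omitted),
the measured work is conceptually equivalent to:
.sp
.nf
.ft CB
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Sketch only: map a file and read it as a series of integer loads. */
int
sum_mapped_file(char *file)
{
	struct stat sb;
	int	fd, i, n, sum = 0;
	int	*p;

	fd = open(file, O_RDONLY);
	fstat(fd, &sb);
	n = sb.st_size / sizeof(int);
	p = (int *)mmap(0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
	for (i = 0; i < n; ++i)	/* the real benchmark unrolls this loop */
		sum += p[i];
	munmap((void *)p, sb.st_size);
	close(fd);
	return (sum);
}
.ft
.fi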
lmbench-3.0-a9/doc/bw_pipe.8
.\" $Id$
.TH BW_PIPE 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH"
.SH NAME
bw_pipe \- time data movement through pipes
.SH SYNOPSIS
.B bw_pipe
[
.I "-m <message size>"
]
[
.I "-M <total bytes>"
]
[
.I "-P <parallelism>"
]
[
.I "-W <warmups>"
]
[
.I "-N <repetitions>"
]
.SH DESCRIPTION
.B bw_pipe
creates a Unix pipe between two processes and moves
.I "total bytes"
through the pipe in
.I "message size"
chunks (note that pipes are typically sized smaller than that).
The default
.I "total bytes"
is 10MB and the default
.I "message size"
is 64KB.
.SH OUTPUT
Output format is \f(CB"Pipe bandwidth: %0.2f MB/sec\\n", megabytes_per_second\fP,
i.e.,
.sp
.ft CB
Pipe bandwidth: 4.87 MB/sec
.ft
.SH MEMORY UTILIZATION
This benchmark can move up to six times the requested memory per process.
There are two processes, the sender and the receiver.
Most Unix systems implement the read/write system calls as a bcopy
from/to kernel space to/from user space.
Bcopy will use 2-3 times as much memory bandwidth:
there is one read from the source and a write to the destination.
The write usually results in a cache line read and then a write back
of the cache line at some later point.
Memory utilization might be reduced by 1/3 if the processor
architecture implemented "load cache line" and "store cache line"
instructions (as well as getcachelinesize).
.SH ACKNOWLEDGEMENT
Funding for the development of this tool was provided by
Sun Microsystems Computer Corporation.
.SH "SEE ALSO"
lmbench(8).
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
lmbench-3.0-a9/doc/bw_reread2.tbl
.KS
.TS
expand doublebox;
c|c c|c c
l|c c|c c
l|r r|r r.
Libc \fBFile\fP Memory File System bcopy \fBread\fP read mmap = DEC Alpha 38 37 64 12\ DEC Alpha 45 40 79 50\ DEC Alpha 36 36 55 19\ DEC Alpha 40 44 69 14\ DEC Alpha 46 48 88 26\ DEC Alpha 39 39 76 23\ SunOS-5.4 sun4m 23 31 59 31\ SunOS-5.4 sun4m 26 23 80 30\ SunOS-5.4 sun4d 14 23 36 25\ SunOS-5.4 sun4d 21 23 47 17\ Sun SC1000 15 20 38 28\ DEC Alpha 15 20 46 14\ Sun Ultra1 167 85 129 101\ Linux alpha 40 25 74 23\ Linux i586 31 17 61 14\ SunOS-5.4 sun4m 23 21 64 39\ Linux alpha 39 24 73 18\ Unixware/i686 55 53 214 198\ Linux i586 42 23 74 9\ IBM Power2 171 187 205 106\ IBM PowerPC 21 40 63 51\ Linux i486 17 9 33 10\ IRIX64 IP21 70 65 92 72\ Linux i686 57 46 205 34\ IRIX64-601 IP26 32 75 65 56\ Linux i586 41 21 74 13\ Linux i586 21 14 60 11\ Linux i586 21 14 58 10\ Linux i586 21 13 60 8\ HP-UX 9000/735 26 47 55 36\ HP-UX 9000/819 48 64 97 41\ HP-UX 9000/755 25 45 49 32\ FreeBSD/i586 42 38 65 49\ FreeBSD/i586 42 30 73 54\ FreeBSD/i586 41 29 65 46\ IRIX64 IP19 34 34 65 56\ FreeBSD/i586 40 28 62 47\ IRIX64 IP25 41 60 87 76\ HP-UX 9000/735 26 43 53 33\ HP-UX 9000/735 26 43 54 34\ HP-UX 9000/735 26 43 53 35\ HP-UX 9000/770 33 43 56 37\ HP-UX 9000/897 19 39 40 28\ FreeBSD/i586 41 29 65 50\ dgux mc88110 17 16 37 13\ IRIX5.3 IP22 32 32 69 44\ IRIX IP19 34 39 67 43\ IRIX64 IP19 36 36 65 56\ IRIX5.3 IP19 34 36 65 43\ IRIX IP22 33 37 68 48\ .TE .KE ����������������������������������������������������������������������lmbench-3.0-a9/doc/bw_tcp.8�������������������������������������������������������������������������0000664�0000764�0000764�00000003610�07172615471�015313� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH BW_TCP 1 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME bw_tcp \- time data movement through TCP/IP sockets .SH SYNOPSIS .B bw_tcp [ .I "-m <message size>" ] [ .I "-M <total bytes>" ] [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I "server" .br or .B bw_tcp .I -s .br or .B bw_tcp .I "-S <server>" .SH DESCRIPTION .B bw_tcp is a client/server program that moves data over a TCP/IP socket. Nothing is done with the data on either side; .I "total bytes" of data is moved in .I "message size" chunks. .LP .B bw_tcp has three forms of usage: as a server (-s), as a client (bw_tcp localhost), and as a shutdown (bw_tcp -S localhost). .LP The default amount of data is 10MB. The client form may specify a different amount of data. Specifications may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). .SH OUTPUT Output format is .ft CB Socket bandwidth using localhost: 2.32 MB/sec .ft .SH MEMORY UTILIZATION This benchmark can move up to six times the requested memory per process when run through the loopback device. There are two processes, the sender and the receiver. Most Unix systems implement the read/write system calls as a bcopy from/to kernel space to/from user space. Bcopy will use 2-3 times as much memory bandwidth: there is one read from the source and a write to the destionation. The write usually results in a cache line read and then a write back of the cache line at some later point. 
Memory utilization might be reduced by 1/3 if the processor architecture implemented "load cache line" and "store cache line" instructions (as well as getcachelinesize). .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation and Silicon Graphics, Inc. .SH SEE ALSO lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_tcp.tbl�����������������������������������������������������������������������0000664�0000764�0000764�00000001620�07507607546�015732� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS center expand doublebox; l r. Linux alpha 8.9 Linux i486 5.5 Linux alpha 8.8 Linux i586 3.2 Linux i486 5.6 Linux i586 2.9 DEC Alpha 11.2 Linux i586 3.0 SunOS-5.4 sun4m 9.5 SunOS-5.4 sun4m 11.0 DEC Alpha 4.1 DEC Alpha 6.6 DEC Alpha 12.1 Linux i586 3.0 SunOS-5.4 sun4d 7.9 SunOS-5.4 sun4d 9.1 DEC Alpha 8.6 DEC Alpha 6.0 DEC Alpha 10.5 Sun SC1000 10.9 Linux i586 5.1 DEC Alpha 9.2 Linux i586 6.8 FreeBSD/i586 0.1 IRIX IP22 7.2 Linux i686 14.7 FreeBSD/i586 0.1 SunOS-5.4 sun4m 19.5 FreeBSD/i586 0.1 Sun Ultra1 51.3 FreeBSD/i586 0.2 FreeBSD/i586 0.2 IBM Power2 10.5 IBM PowerPC 16.6 dgux mc88110 4.6 IRIX64 IP21 18.8 IRIX IP19 16.4 HP-UX 9000/735 18.4 HP-UX 9000/735 19.0 HP-UX 9000/735 23.9 HP-UX 9000/897 16.9 IRIX64-601 IP26 21.5 IRIX5.3 IP22 22.1 IRIX5.3 IP19 12.2 IRIX64 IP19 18.8 IRIX64 IP25 26.1 IRIX64 IP19 30.8 HP-UX 9000/770 20.5 HP-UX 9000/819 27.7 HP-UX 9000/755 35.2 HP-UX 9000/735 19.6 .TE .KE ����������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/bw_unix.8������������������������������������������������������������������������0000664�0000764�0000764�00000002163�07172615471�015512� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH BW_UNIX 8 "$Date$" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" .SH NAME bw_unix \- UNIX pipe bandwidth .SH SYNOPSIS .B bw_unix [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I size .SH DESCRIPTION .B bw_unix creates a pipe and forks a child process which keeps writing data to the pipe as fast as it can. The benchmark measures how fast the parent process can .I read the data in .IR size -byte chunks from the pipe. Nothing is done with the data in either the parent (reader) or child (writer) processes. .LP The .I size specification may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). .SH OUTPUT Output format is \f(CB"%0.2f %.2f\\n", megabytes, megabytes_per_second\fP, i.e., .sp .ft CB 8.00 25.33 .ft .SH "MEMORY UTILIZATION" This benchmark should move approximately the reported amount of memory. 
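.LP
The structure described above can be sketched as follows (an illustration
only, not the actual
.B bw_unix
source; error handling, cleanup, and the timing harness are omitted):
.sp
.nf
.ft CB
#include <unistd.h>
#include <stdlib.h>

/* Sketch only: child floods a pipe, parent reads size-byte chunks. */
void
move_data(int size, int total)
{
	int	fds[2], moved, n;
	char	*buf = malloc(size);

	pipe(fds);
	if (fork() == 0) {		/* child: writer */
		close(fds[0]);
		for (;;)
			write(fds[1], buf, size);
	} else {			/* parent: reader (this side is timed) */
		close(fds[1]);
		for (moved = 0; moved < total; moved += n) {
			n = read(fds[0], buf, size);
			if (n <= 0)
				break;
		}
	}
}
.ft
.fi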
.SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/cache.8��������������������������������������������������������������������������0000664�0000764�0000764�00000002562�07172615471�015105� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH CACHE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME cache \- cache parameters .SH SYNOPSIS .B cache [ .I "-L <line size>" ] [ .I "-M <len>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B cache tries to determine the characteristics of the memory hierarchy. It attempts to determine the number of caches, the size of each cache, the line size for each cache, and the available memory parallelism at each level in the memory hierarchy. The largest amount of memory it will examine is .I len bytes. .LP .B cache first attempts to determine the number and size of caches by measuring the memory latency for various memory sizes. Once it has identified the various caches it then measures the latency, parallelism, and line size for each cache. Unfortunately, determining the cache size merely from latency is exceedingly difficult due to variations in cache replacement and prefetching strategies. .SH BUGS .B cache is an experimental benchmark and is known to fail on many processors. In particular there are a large number of machines with weird caching behavior that confuse .B cache and prevent it from accurately determining the number and size of the various caches. .SH "SEE ALSO" lmbench(8), line(8), tlb(8), par_mem(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ����������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/ctx.pic��������������������������������������������������������������������������0000664�0000764�0000764�00000014612�07045412511�015230� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.sp .10i .in +.07i .PS .ps 9 .vs 9 .ft CB [ # Variables, tweak these. 
xtick = 2.000000 # width of an X tick xlower = 0.000000 # where the xtick start xupper = 22.000000 # upper range of graph xn = 11 # number of ticks to do ytick = 50.000000 # width of an Y tick ylower = 0.000000 # where the ytick start yupper = 450.000000 # upper range of graph yn = 9 # number of ticks to do xsize = 2.05 # width of the graph ysize = 2.1 # height of the graph yscale = ysize / (yupper - ylower) # scale data to paper xscale = xsize / (xupper - xlower) # scale data to paper tick = 0.10000000000000001 # distance towards numbers gthk = .1 # thickness of grid lines thk = 0.75 # thickness of data lines qthk = 2.0 # thickness of quartile lines vs = .10 # works for 10 point fonts # Draw the graph borders and tick marks O: box thick 1.5 ht ysize wid xsize j = ylower t = tick * .5 for i = 0 to yn by 1 do { ys = j - ylower g = ys * yscale line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) if (i < yn) then { y2 = (ys + (ytick / 2)) * yscale line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) } if (yupper - ylower > 999) then { sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) } else { if (yupper - ylower > 10) then { sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) } else { if (yupper - ylower > 1) then { sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) } else { sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) }}} j = j + ytick } j = xlower for i = 0 to xn by 1 do { xs = j - xlower g = xs * xscale line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) if (i < xn) then { x2 = (xs + (xtick / 2)) * xscale line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) } if (xupper - xlower > 999) then { sprintf("%.0f", j) at O.sw + (g, -.25) } else { if (xupper - xlower > 10) then { sprintf("%.0f", j) at O.sw + (g, -.25) } else { if (xupper - xlower > 1) then { sprintf("%.1f", j) at O.sw + (g, -.25) } else { sprintf("%.2f", j) at O.sw + (g, -.25) }}} j = j + xtick } # DATASET: Process size=0 overhead=10, MARK 0 [ "\(ci" ] at O.sw + \ (xscale * (2 - xlower), yscale * (6 - ylower)) [ "\(ci" ] at O.sw + \ (xscale * (4 - xlower), yscale * (7 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(ci" ] at O.sw + \ (xscale * (8 - xlower), yscale * (7 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(ci" ] at O.sw + \ (xscale * (16 - xlower), yscale * (8 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(ci" ] at O.sw + \ (xscale * (20 - xlower), yscale * (8 - ylower)) line thick thk from 2nd last [].c to last [].c # DATASET: Process size=4 overhead=19, MARK 1 [ "\(sq" ] at O.sw + \ (xscale * (2 - xlower), yscale * (7 - ylower)) [ "\(sq" ] at O.sw + \ (xscale * (4 - xlower), yscale * (8 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(sq" ] at O.sw + \ (xscale * (8 - xlower), yscale * (9 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(sq" ] at O.sw + \ (xscale * (16 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(sq" ] at O.sw + \ (xscale * (20 - xlower), yscale * (12 - ylower)) line thick thk from 2nd last [].c to last [].c # DATASET: Process size=16 overhead=66, MARK 2 [ "\(*D" ] at O.sw + \ (xscale * (2 - xlower), yscale * (14 - ylower)) [ "\(*D" ] at O.sw + \ (xscale * (4 - xlower), yscale * (15 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(*D" ] at O.sw + \ (xscale * (8 - xlower), yscale * (18 - ylower)) ".12M" at O.sw + \ (xscale * (8 - xlower), .12 + yscale * (18 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(*D" ] at O.sw + \ (xscale * (16 - xlower), yscale * 
(46 - ylower)) ".25M" at O.sw + \ (xscale * (16 - xlower), .12 + yscale * (46 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(*D" ] at O.sw + \ (xscale * (20 - xlower), yscale * (88 - ylower)) line thick thk from 2nd last [].c to last [].c # DATASET: Process size=32 overhead=129, MARK 3 [ "\(mu" ] at O.sw + \ (xscale * (2 - xlower), yscale * (22 - ylower)) [ "\(mu" ] at O.sw + \ (xscale * (4 - xlower), yscale * (24 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(mu" ] at O.sw + \ (xscale * (8 - xlower), yscale * (107 - ylower)) ".25M" at O.sw + \ (xscale * (8 - xlower), .12 + yscale * (107 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(mu" ] at O.sw + \ (xscale * (16 - xlower), yscale * (187 - ylower)) ".5M" at O.sw + \ (xscale * (16 - xlower), .12 + yscale * (187 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(mu" ] at O.sw + \ (xscale * (20 - xlower), yscale * (188 - ylower)) line thick thk from 2nd last [].c to last [].c # DATASET: Process size=64 overhead=255, MARK 4 [ "\s+4\(bu\s0" ] at O.sw + \ (xscale * (2 - xlower), yscale * (38 - ylower)) ".12M" at O.sw + \ (xscale * (2 - xlower), .12 + yscale * (38 - ylower)) [ "\s+4\(bu\s0" ] at O.sw + \ (xscale * (4 - xlower), yscale * (140 - ylower)) ".25M" at O.sw + \ (xscale * (4 - xlower) - .14, .12 + yscale * (140 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\s+4\(bu\s0" ] at O.sw + \ (xscale * (8 - xlower), yscale * (363 - ylower)) ".5M" at O.sw + \ (xscale * (8 - xlower), .12 + yscale * (363 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\s+4\(bu\s0" ] at O.sw + \ (xscale * (16 - xlower), yscale * (367 - ylower)) "1M" at O.sw + \ (xscale * (16 - xlower), .12 + yscale * (367 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\s+4\(bu\s0" ] at O.sw + \ (xscale * (20 - xlower), yscale * (367 - ylower)) line thick thk from 2nd last [].c to last [].c # Xaxis title. "\s+1Processes\s0" rjust at O.se - (-.15, .6) # Yaxis title (Time in microseconds) .ps +1 "T" "i" "m" "e" " " "i" "n" at O.w - (.85, 0) "m" "i" "c" "r" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.68, 0) .ps # Graph title. .vs 12 "\s+2Context switches for" "Linux i686@167Mhz\s0" at O.n + (-.5, .4) .vs # Title. [ "\(ci" ] at O.sw - (.80, .50 + 0 * vs) "size=0KB \ overhead=10" ljust at last [].e + (.1, 0) [ "\(sq" ] at last [] - (0, vs) "size=4KB \ overhead=19" ljust at last [].e + (.1, 0) [ "\(*D" ] at last [] - (0, vs) "size=16KB overhead=66" ljust at last [].e + (.1, 0) [ "\(mu" ] at last [] - (0, vs) "size=32KB overhead=129" ljust at last [].e + (.1, 0) [ "\s+4\(bu\s0" ] at last [] - (0, vs) "size=64KB overhead=255" ljust at last [].e + (.1, 0) ] .ft .ps .in .PE ����������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/ctx.tbl��������������������������������������������������������������������������0000664�0000764�0000764�00000003030�07507604561�015241� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; c|c s|c s l|c c|c c l|r r|r r. 
2 processes 8 processes System \fB0KB\fP 32KB 0KB 32KB = Linux alpha 10 17 13 41\ Linux i486 11 394 18 594\ Linux alpha 11 73 13 92\ Linux i486 -1 70 -1 78\ Linux i586 10 163 13 215\ DEC Alpha 25 18 42 21\ SunOS-5.4 sun4m 37 128 52 73\ DEC Alpha 39 55 46 112\ DEC Alpha 53 50 56 62\ DEC Alpha 53 66 59 93\ DEC Alpha 59 68 115 134\ DEC Alpha 14 27 22 159\ DEC Alpha 40 42 46 205\ Sun Ultra1 14 27 20 73\ Unixware/i686 21 22 \ DEC Alpha 43 142 45 197\ SunOS-5.4 sun4m 54 65 85 102\ SunOS-5.4 sun4m 75 31 110 102\ IBM Power2 13 16 18 43\ HP-UX 9000/819 13 41 15 109\ HP-UX 9000/755 25 29 29 220\ HP-UX 9000/735 29 39 31 204\ HP-UX 9000/735 29 42 34 205\ HP-UX 9000/735 29 32 30 164\ Linux i586 36 163 47 222\ Linux i686 6 22 7 107\ Linux i586 13 178 20 273\ Linux i586 13 182 21 232\ Linux i586 16 218 22 266\ Linux i586 66 240 83 347\ Sun SC1000 107 135 104 362\ SunOS-5.4 sun4d 137 245 164 486\ SunOS-5.4 sun4d 224 113 245 134\ FreeBSD/i586 28 67 34 158\ IRIX5.3 IP22 40 47 38 104\ IBM PowerPC 16 87 26 144\ FreeBSD/i586 30 54 36 137\ FreeBSD/i586 24 54 28 137\ IRIX64 IP21 84 104 87 101\ dgux mc88110 89 119 122 263\ HP-UX 9000/897 20 39 23 111\ HP-UX 9000/735 27 37 30 222\ FreeBSD/i586 29 41 35 123\ FreeBSD/i586 29 -13 36 78\ IRIX IP22 38 50 42 74\ IRIX64-601 IP26 72 92 74 93\ IRIX64 IP19 59 68 79 91\ IRIX64 IP25 55 77 59 85\ IRIX64 IP19 63 80 69 93\ IRIX IP19 141 150 96 115\ HP-UX 9000/770 21 24 21 218\ IRIX5.3 IP19 150 157 102 167\ .TE .KE ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/description.ms�������������������������������������������������������������������0000664�0000764�0000764�00000053055�07045412511�016625� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $X$ xroff -mgs $file .\" $tty$ groff -mgs $file | colcrt - | more .\" $lpr$ groff -mgs $file > ${file}.lpr .\" Define a page top that looks cool .de PT .if \\n%>1 \{\ . sp -.1i . ps 14 . ft 3 . nr big 24 . nr space \\w'XXX' . nr titlewid \\w'\\*[title]' . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 . ce 1 \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] . ps . sp -.70 . ps 12 \\l'\\n[LL]u' . ft . ps .\} .. .\" Define a page bottom that looks cool .de BT . ps 9 \v'-1'\\l'\\n(LLu' . sp -1 . tl '\(co 1994 \\*[author]'\\*(DY'%' . ps .. 
.\" Configuration .VARPS .nr HM 1.0i .nr FM 1i .if t .nr PO .75i .if t .nr LL 7.0i .if n .nr PO .25i .if n .nr LL 7.5i .nr PS 11 .nr VS \n(PS+2 .ds title Portable Tools for Performance Analysis .ds author Larry McVoy .TL lmbench: .sp .5 \*[title] .br \s8Revision $Revision$ of $Date$\s0 .AU \*[author] .AI .ps -2 lm@sgi.com\** (415) 390-1804 .ps +2 .AB A description of a set benchmarks for measuring system performance. The benchmarks include latency measurements of basic system operations such as memory, processes, networking, and disks, and bandwidth measurements of memory, disks, and networking. The benchmarks have been run under a wide variety of Unix systems. The benchmarks are freely distributed under the GNU General Public License, with the additional restriction that results may be reported only if the benchmarks are unmodified. .AE .sp 2 .if t .2C .FS This work was mostly done while the author was an employee of Sun Microsystems Computer Corporation. .FE .NH 1 Introduction .LP The purpose of this project is to provide the computer community with tools for performance analysis of basic operations of their computer systems. The tools are designed to be both portable and comparable over a wide set of Unix systems.\** .FS The tools have been run on AIX, BSDI, HP-UX, IRIX, Linux, NetBSD, OSF/1, Solaris, and SunOS by the author. .FE The interfaces that the tools use have been carefully chosen to be as portable and standard as possible. It is an explicit intent of the benchmark to measure standard interfaces. Users of this benchmark may not report results from modified versions of the benchmarks.\** .FS For example, the context switch benchmark may not use a \f(CWyield()\fP primitive instead of pipes; the networking benchmarks must use the socket interfaces, not TLI or some other interface. .FE .PP The purpose of this document is to describe each of the benchmarks. .PP The benchmarks are loosely divided into latency, bandwidth, and ``other'' categories. .NH 1 Latency measurements .LP The latency measurements included in this suite are process creation times (including address space extension via mmap()), basic operating system entry cost, context switching, inter process communication, file system latency, disk latency (you must be the super user to get disk latency results), and memory latency. .PP Process benchmarks are used to measure the basic process primitives, such as creating a new process, running a different program, and context switching. Process creation benchmarks are of particular interest to distributed systems since many remote operations include the creation of a remote process to shepherd the remote operation to completion. Context switching is important for the same reasons. .PP Inter process communication latency is important because many operations are control messages that tell another process (frequently on another system) to do something. The latency of telling the remote process to do something is pure overhead and is frequently in the critical path of important functions, such as distributed databases.\** .FS The performance of the TCP latency benchmark has proven to be a good estimate of the performance of the Oracle database lock manager. .FE .PP The inter process communication latency benchmarks are roughly the same idea: pass a small message (a byte or so) back and forth between two processes. The reported results are always the microseconds it takes to do one round trip. 
If you are interested in a one way timing, then about half the round trip is right (however, the CPU cycles tend to be somewhat asymmetric for a one trip). .NH 2 Process forks/exits .LP Create a child process which does nothing but terminate. Results are reported in creations per second. The benchmark is measuring how fast the OS can create a new address space and process context. The child process is spawned via the \f(CBfork\fP() interface, not the \f(CBvfork\fP() interface. .NH 2 Simple process creates I .LP Create a child process which then runs a new program that does nothing but print ``hello world'' and exit. The difference between this benchmark and the previous is the running of a new program. The time difference between this and the previous benchmark is the cost of starting a new (simple) program. That cost is especially noticeable on (some) systems that have shared libraries. Shared libraries can introduce a substantial (10s of milliseconds) start up cost. This benchmark is intended to quantify the time/space tradeoff of shared libraries. .NH 2 Simple process creates II .LP Create a child process which runs the same new program except that the program is started by the system shell. This is a clone of the C library \f(CBsystem\fP() interface. The intent is to educate users about the cost of this interface. I have long felt that using the Bourne shell, especially a dynamically linked Bourne shell, to start up processes is over kill; perhaps these numbers will convince others of the same thing. A better choice would be Plan 9's \f(CBrc\fP shell (which is, by the way, free software). .NH 2 Memory mapping .LP Memory mapping is the process of making a file part of a process' address space, allowing direct access to the file's pages. It is an alternative to the traditional read and write interfaces. Memory mapping is extensively used for linking in shared libraries at run time. This benchmark measures the speed at which mappings can be created as well as removed. Results are reported in mappings per second, and the results can be graphed as the test is run over a series of different sizes. .NH 2 Context switches .LP Measures process context switch time.\** A context switch is defined as the time it takes to save the state of one process and restore the state of another process. Typical context switch benchmarks measure just the minimal context switch time, i.e., the time to switch between two processes that are doing nothing but context switching. That approach is misleading because systems may have multiple active processes and the processes typically have more state (hot cache lines) than just the code required to force another context switch. This benchmark takes that into consideration and varies both the number and the size of the processes. .FS A previous version of this benchmark included several system calls in addition to the context switch, resulting in grossly over inflated context switch times. .FE .PP The benchmark is a ring of two to twenty processes that are connected with Unix pipes. A token is passed from process to process, forcing context switches. The benchmark measures the time it takes to pass the token two thousand times from process to process. Each hand off of the token has two costs: (a) the context switch, and (b) the cost of passing the token. In order to get just the context switching time, the benchmark first measures the cost of passing the token through a ring of pipes in a single process. 
This time is defined as the cost of passing the token and is not included in the reported context switch time. .PP When the processes are larger than the default baseline of ``zero'' (where zero means just big enough to do the benchmark), the cost of the context switch includes the cost of restoring user level state (cache lines). This is accomplished by having the process allocate an array of data and sum it as a series of integers after receiving the token but before passing the token to the next process. Note that the overhead mentioned above includes the cost of accessing the data but because it is measured in just one address space, the cost is typically the cost with hot caches. So the context switch time does not include anything other than the context switch provided that all the processes fit in the cache. If there are cache misses (as is common), the cost of the context switch includes the cost of those cache misses. .PP Results for an HP system running at 100 mhz are shown below. This is a particularly nice system for this benchmark because the results are quite close to what is expected from a machine with a 256KB cache. As the size and number of processes are both increased, processes start falling out of the cache, resulting in higher context switch times. .LP .so ctx.pic .NH 2 Null system calls .LP Measures the cost of entering and exiting (without pausing) the operating system. This is accomplished by repeatedly writing one byte to \f(CB/dev/null\fP, a pseudo device driver that does nothing but discard the data. Results are reported as system calls per second. .PP It is important to note that the system call chosen actually does the work on all systems, to the best of my knowledge. There are some systems that optimized trivial system calls, such as \f(CBgetpid\fP(), to return the answer without a true entry into the OS proper. Writing to \f(CB/dev/null\fP has not been optimized. .NH 2 Pipe latency .LP This benchmark measures the OS; there is almost no code executed at user level. The benchmark measures the round trip time of a small message being passed back and forth between two processes through a pair of Unix pipes. .NH 2 TCP/IP latency .LP This benchmark measures the OS networking code and the driver code; there is almost no code executed at user level. The benchmark measures the round trip time of a small message being passed back and forth between two processes through an AF_INET socket. Note that both remote and local results may be reported. .NH 2 UDP/IP latency .LP This benchmark measures the OS networking code and the driver code; there is almost no code executed at user level. The benchmark measures the round trip time of a small message being passed back and forth between two processes through an AF_INET socket. Note that both remote and local results may be reported. .LP It is interesting to note that the TCP performance is sometimes greater than the UDP performance. This is contrary to expectations since the TCP protocol is a reliable, connection oriented protocol, and as such is expected to carry more overhead. Why this is so is an exercise left to the reader. .NH 2 RPC latency (TCP and UDP) .LP Actually two latency benchmarks: Sun RPC over TCP/IP and over UDP/IP. This benchmark consists of the user level RPC code layered over the TCP or UDP sockets. The benchmark measures the round trip time of a small message being passed back and forth between two processes. Note that both remote and local results may be reported. 
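.PP
The pipe, TCP, and UDP latency benchmarks above share the same round-trip
structure; only the transport changes, and the RPC benchmarks wrap the same
round trip in a null remote procedure call.
As a sketch (an illustration of the idea, not the lmbench source;
\f(CWN\fP and the file descriptors are assumed to be set up elsewhere),
the timed client loop is:
.DS
.ft CB
char	c;
int	i;

for (i = 0; i < N; ++i) {
	write(wr, &c, 1);	/* send the token */
	read(rd, &c, 1);	/* wait for the echo */
}
/* round trip latency = elapsed time / N */
.ft
.DE
The server side simply reads each byte and writes it back.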
.LP Using the TCP or the UDP benchmarks as a baseline, it is possible to see how much the RPC code is costing. .NH 2 TCP/IP connect latency .LP This benchmarks measures the time it takes to get a TCP/IP socket and connect it to a remote server. .NH 2 File system latency .LP A benchmark that measures how fast the file system can do basic, common operations, such as creates and deletes of small files. .NH 2 Page fault latency .LP A benchmark that measures how fast the file system can pagefault in a page that is not in memory. .NH 2 Disk latency .LP A benchmark that is designed to measure the overhead of a disk operation. Results are reported as operations per second. .PP The benchmark is designed with SCSI disks in mind. It actually simulates a large number of disks in the following way. The benchmark reads 512 byte chunks sequentially from the raw disk device (raw disks are unbuffered and are not read ahead by Unix). The benchmark ``knows'' that most disks have read ahead buffers that read ahead the next 32-128 kilobytes. Furthermore, the benchmark ``knows'' that the disks rotate and read ahead faster than the processor can request the chunks of data.\** .FS This may not always be true - a processor could be fast enough to make the requests faster than the rotating disk. If we take 3MB/sec to be disk speed, a fair speed, and divide that by 512, that is 6144 IOs/second, or 163 microseconds per IO. I don't know of any processor/OS/io controller combinations that can do an IO in 163 microseconds. .FE So the benchmark is basically reading small chunks of data from the disks track buffer. Another way to look at this is that the benchmark is doing memory to memory transfers across a SCSI channel. .PP No matter how you look at it, the resulting number represents a \fBlower\fP bound on the overhead of a disk I/O. In point of fact, the real numbers will be higher on SCSI systems. Most SCSI controllers will not disconnect if the request can be satisfied immediately; that is the case here. In practice, the overhead numbers will be higher because the processor will send the request, disconnect, get interrupted, reconnect, and transfer. .PP It is possible to generate loads of upwards of 500 IOPs on a single SCSI disk using this technique. It is useful to do that to figure out how many drives could be supported on a system before there are no more processor cycles to handle the load. Using this trick, you do not have to hook up 30 drives, you simulate them. .NH 2 Memory read latency .LP This is perhaps the most interesting benchmark in the suite. The entire memory hierarchy is measured, including onboard cache latency and size, external cache latency and size, main memory latency, and TLB miss latency. .PP The benchmark varies two parameters, array size and array stride. For each size, a list of pointers is created for all of the different strides. Then the list is walked like so .DS .ft CB mov r0,(r0) # C code: p = *p; .DE The time to do about fifty thousand loads (the list wraps) is measured and reported. The time reported is pure latency time and may be zero even though the load instruction does not execute in zero time. Zero is defined as one clock cycle; in other words, the time reported is \fBonly\fP memory latency time, as it does not include the instruction execution time. It is assumed that all processors can do a load instruction (not counting stalls) in one processor cycle. 
In other words, if the processor cache load time is 60 nanoseconds on a 20 nanosecond processor, the load latency reported would be 40 nanoseconds; the missing 20 nanoseconds are for the load instruction itself. Processors that can manage to get the load address out to the address pins before the end of the load cycle get some free time in this benchmark (I don't think any processors can do that). .PP Note that this benchmark has been validated by logic analyzer measurements on an SGI Indy. The clever reader might realize that the last few nanoseconds of inaccuracy could be rounded off by realizing that the latency is always going to be a multiple of the processor clock rate. .PP The raw data is a series of data sets. Each data set is a stride size, with the array size varied from about one kilobyte up to eight megabytes. When these data sets are all plotted together (using a log base 2 scale for the size variable), the data will be seen to contain a series of horizontal plateaus. The first is the onboard data cache latency (if there is an onboard cache). The point where the lines start to go up marks the size of the cache. The second is the external cache, the third is the main memory, and the last is main memory plus TLB miss cost. In addition to this information, the cache line size can be derived by noticing which strides are faster than main memory times. The first stride that runs at main memory speed is likely to be the cache line size. The reason is that the strides that are faster than memory indicate that the benchmark is getting more than one hit per cache line. Note that prefetching may confuse you. .PP The graph below shows a particularly nicely made machine, a DEC Alpha. This machine is nice because (a) it shows the latencies and sizes of the on-chip level 1 and motherboard level 2 caches, and (b) it has the best all-around numbers, especially considering it can support a 4MB level 2 cache. Nice work, DEC. .so mem.pic .NH 1 Bandwidth measurements .LP One of my former managers\** once noted that ``Unix is Swahili for bcopy().'' I believe that he was indicating his belief that the operating system spent most of its time moving data from one place to another, via various means. I tend to agree and have measured the various ways that data can be moved. The ways that are measured are: through pipes, TCP sockets, library bcopy() and hand-unrolled bcopy(), the read() interface, the mmap() interface, and direct memory reads and writes (no copying). .FS Ken Okin .FE .NH 2 Pipe bandwidth .LP Bandwidth measurement between two local processes communicating through a Unix pipe. Results are in megabytes per second. .NH 2 TCP/IP socket bandwidth .LP Bandwidth measurement using TCP/IP sockets. Results are reported in megabytes per second. Results are reported for local, Ethernet, FDDI, and ATM, where possible. Results range from 1-10+ megabytes per second. Any system delivering more than 10 MB/second over TCP is doing very well by 1994 standards. .PP Note that for local measurements, the system is actually moving twice as much data, since the data is being moved to/from the same host. .PP Local bandwidths are (sometimes) useful for determining the overhead of the protocol stack (as well as other OS tasks, such as context switching). Note, however, that some implementations (such as Solaris 2.x) have ``fast pathed'' loopback IP, which skews the results. The fast path uses a larger MTU and does not do checksums.
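.PP
Measuring local TCP bandwidth boils down to one process streaming a large amount of data through a loopback connection while the other reads and discards it. The sketch below is not the actual lmbench source; it uses an arbitrary 64KB transfer buffer, omits error handling, and leaves the socket buffers at their defaults (the real benchmark enlarges them, as described next).
.DS
.ft CB
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define BUFSZ   (64 * 1024)
#define TOTAL   (64 * 1024 * 1024)

int
main(void)
{
        int     lfd, fd, n;
        long    moved = 0;
        char    buf[BUFSZ];
        struct sockaddr_in addr;
        socklen_t len = sizeof(addr);
        struct timeval start, stop;
        double  secs;

        lfd = socket(AF_INET, SOCK_STREAM, 0);  /* listener, ephemeral loopback port */
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        addr.sin_port = 0;
        bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
        listen(lfd, 1);
        getsockname(lfd, (struct sockaddr *)&addr, &len);

        if (fork() == 0) {      /* child: connect and stream TOTAL bytes */
                fd = socket(AF_INET, SOCK_STREAM, 0);
                connect(fd, (struct sockaddr *)&addr, sizeof(addr));
                for (moved = 0; moved < TOTAL; moved += BUFSZ)
                        write(fd, buf, BUFSZ);
                exit(0);
        }
        fd = accept(lfd, 0, 0);
        gettimeofday(&start, 0);
        while ((n = read(fd, buf, BUFSZ)) > 0)  /* read and discard */
                moved += n;
        gettimeofday(&stop, 0);
        secs = (stop.tv_sec - start.tv_sec) +
               (stop.tv_usec - start.tv_usec) / 1000000.0;
        printf("%.2f MB/sec\\n", moved / (1024.0 * 1024.0) / secs);
        return 0;
}
.ft
.DE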
.PP The sockets are configured to use the largest receive/send buffers that the OS will allow. This is done to allow maximum bandwidth. Sun's 4.x TCP/IP subsystem (and probably BSD's as well) defaults to 4KB send/receive buffers, which is too small. (It would be better if the OS noted that this was a high volume / high bandwidth connection and automatically grew the buffers. Hint, hint.) .NH 2 bcopy bandwidths .LP A simple benchmark that measures how fast data can be copied. A hand-unrolled version and the C library version are tested. Results are reported in megabytes per second. Note that a typical system is actually moving about three times as much data as the reported result. A copy is actually a read, a write which causes a cache line read, and a write back. .NH 2 Read bandwidth .LP Most VM systems cache file pages for reuse. This benchmark measures the speed at which those pages can be reused. It is important to notice that this is not a disk read measurement; it is a memory read measurement. Results are reported in megabytes per second. .NH 2 Mmap read bandwidth .LP The same measurement as the previous benchmark except that it maps the file, avoiding the copy from kernel to user buffer. Results are reported in megabytes per second. .NH 2 Memory read bandwidth .LP A large array is repeatedly read sequentially. Results are reported in megabytes per second. .NH 2 Memory write bandwidth .LP A large array is repeatedly written sequentially. Results are reported in megabytes per second. .NH 1 Other measurements .LP .NH 2 Processor cycle time mhz .LP Calculates the megahertz, and thus the clock cycle time, of the processor. This is the standard loop in which a series of interlocked operations are timed, and then the megahertz is derived from the timing. The operations are purposefully interlocked to overcome any superscalarness of the system under test. .PP There are actually three versions of mhz: a generic one that works on most systems, and two specific versions for SuperSPARC and rs6000 systems. .PP It turns out that the SuperSPARC processor has two ALU's that are run at twice the clock rate, allowing two interlocked operations to complete in one processor clock.\** .FS Credit and thanks to John Mashey of SGI/MIPS fame, who kindly took the time to figure out why the benchmark wasn't working on SuperSPARC systems. He explained the SuperSPARC pipeline and the solution to the problem. .FE Fortunately, the ALU's are asymmetric and cannot do two shifts in one processor clock, so shifts are used on SuperSPARC systems. .PP IBM rs6000 systems have a C compiler that does not honor the ``register'' directive in unoptimized code. The IBM loop looks like it is doing half as many instructions as the others. This is on purpose; each add on the IBM is actually two instructions (I think it is a load/add/store or something like that). .NH 1 Acknowledgments .LP I would like to acknowledge Sun Microsystems for supporting the development of this project. In particular, my personal thanks to Paul Borrill, Director of the Architecture and Performance group, for conceiving and supporting the development of these benchmarks. .PP My thanks to John Mashey and Neal Nuckolls of Silicon Graphics for reviews, comments, and explanations of the more obscure problems.
.PP My thanks to Satya Nishtala of Sun Microsystems for (a) listening to me complain about memory latencies over and over, (b) doing something about it in future SPARC systems, and (c) reviewing the memory latency results and explained IBM's sub blocking scheme (I still don't really understand it but he does. Ask him). .NH 1 Obtaining the benchmarks .LP The benchmarks will be posted to the Usenet comp.benchmarks group. In addition, mail sent to \f(CBarchives@slovax.engr.sgi.com\fP with a request for \f(CBlmbench.shar\fP sources will get the latest and greatest. �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/graph.1��������������������������������������������������������������������������0000664�0000764�0000764�00000006045�07045412511�015121� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .de DS . sp .5 . nf . in +4 . ft CW . vs -1 .. .de DE . sp .5 . fi . in . ft . vs .. .TH GRAPH 1 .SH NAME graph \- compile graphs into pic input .SH SYNOPSIS .B graph [ options ] [ .I filename \&.\|.\|. ] .SH DESCRIPTION .LP .B graph is a perl script which takes sets of X Y data and generates a (human readable) pic program that will produce the graphed data. The output is designed such that you can save it in a file and tweak it to make it fit your document. Try one and look at the output. The output is actually commented. .LP The graph is autosized and auto ticked. .LP The input data format is similar that of xgraph(1), i.e., .DS 1 1 2 2 3 3 "sloped across 1 4 2 4 3 4 "straight across .DE .SH "CONTROL OPTIONS" .LP You may set the graph title, the X title, and the Y title with the following control sequences in the data stream: .DS %T Graph title in +4 point font %X X axis title and/or units in +2 point font %Y Y axis title and/or units in +2 point font %fakemax-X <value> force graph to be that big %fakemax-Y <value> force graph to be that big %fakemin-X <value> force graph to be that small %fakemin-Y <value> force graph to be that small .DE .SH OPTIONS .IP -rev 12 reverse X/Y data sense (and titles). Note this is done after processing any fudging of the input data stream(s) (see -xk, -yk, -logx, etc below). .IP -below put data set titles below the graph rather than to the right. .IP -close no extra space around the data's endpoints. .IP -qline connect the quartile center points. .IP -grid dotted line grid marks. .IP -nobox no box around whole graph. .IP -big make the graph take the whole page. .IP -medium make the graph take about 1/2 the page. .IP -small make the graph be small. .IP -grapheach draw each data set in its own graph. .IP -nolabels no X/Y/Title labels. .IP -nodatal no data set labels. .IP -nomarks do not mark each data point with distinct markers (endpoints are still marked). .IP -k print values larger than 1000 as value/1000. 
.IP -xk multiply X input by 1024 (blech). .IP -yk multiply Y input by 1024 (blech). .IP -xm multiply X input by 1024*1024 (blech). .IP -ym multiply Y input by 1024*1024 (blech). .IP -logx convert X input into lag base 2 of X input. .IP -logy convert Y input into lag base 2 of Y input. .SH EXAMPLE Workstation price performance from a Digital ad. Process with .DS .ps -2 graph -rev workstations | groff -TX75 "%T Workstation Price / Performance, 6/93 "%X SPECINT 92 Performance "%Y Price in $1000's 35 5 65 10 78 15 110 70 "Dec AXP line 25 4 25 8 38 16 48 21 52 23 64 27 "Sun SPARC line .DE .ps .SH "QUARTILE FORMAT" Data points are \f(CBx y1 y2 y3 y4 y5\fP. You get a two lines from the first two y values, a mark at the third, and another line from the last two. .SH "SEE ALSO" .BR gtroff (1), .BR gpic (1), .BR perl (1). .SH BUGS This should probably be called pic_graph or something like that. .LP This isn't done as much as I would like. It isn't integrated with the groff preprocessor yet. It doesn't know about .GS/.GE things. I use it to manually generate a pic file and then include that. �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_allmem.tbl�������������������������������������������������������������������0000664�0000764�0000764�00000002541�07507607546�016566� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c c l c c c l r r r. 
Level 1 Level 2 Main System cache cache memory = Linux i586 8 103 151\ DEC Alpha 12 67 291\ Linux i586 8 107 150\ DEC Alpha 10 56 321\ Unixware/i686 14 34 196\ DEC Alpha 9 51 288\ DEC Alpha 7 47 458\ DEC Alpha 12 57 468\ SunOS-5.4 sun4m 13 -- 180\ SunOS-5.4 sun4m 20 -- 291\ SunOS-5.4 sun4m 16 115 816\ Sun Ultra1 6 42 270\ SunOS-5.4 sun4d 16 116 995\ IBM Power2 -- 13 141\ IBM PowerPC 6 164 394\ DEC Alpha 10 53 477\ FreeBSD/i586 10 115 179\ FreeBSD/i586 7 111 181\ DEC Alpha 13 104 957\ FreeBSD/i586 10 118 180\ FreeBSD/i586 10 101 180\ HP-UX 9000/735 -- 10 347\ Sun SC1000 20 140 1236\ HP-UX 9000/770 -- 9 376\ SunOS-5.4 sun4d 24 173 1246\ Linux i686 12 90 194\ Linux i586 10 190 320\ Linux i586 10 148 320\ Linux i586 10 198 321\ Linux i586 10 222 321\ Linux i486 12 234 336\ Linux alpha 3 83 354\ Linux alpha 3 43 361\ DEC Alpha 3 42 396\ HP-UX 9000/735 -- 10 348\ IRIX5.3 IP22 10 76 1018\ IRIX64 IP25 8 58 1134\ HP-UX 9000/735 -- 10 347\ HP-UX 9000/897 -- 11 424\ HP-UX 9000/819 -- 10 430\ IRIX64 IP21 11 100 709\ IRIX64 IP19 10 75 1150\ IRIX IP19 8 64 1189\ IRIX5.3 IP19 10 75 1149\ IRIX64 IP19 10 70 1152\ IRIX IP22 8 64 1170\ FreeBSD/i586 10 106 181\ HP-UX 9000/735 -- 10 348\ HP-UX 9000/755 -- 10 393\ dgux mc88110 22 319 753\ IRIX64-601 IP26 13 120 1244\ .TE .KE ���������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_allproc.tbl������������������������������������������������������������������0000664�0000764�0000764�00000002654�07507604561�016752� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l|c|c|c l|r|r|r. 
fork \fBfork, exec\fP fork, exec System & exit \fB& exit\fP sh -c & exit = DEC Alpha 4.6 13\ 42\ DEC Alpha 3.3 11\ 44\ Linux alpha 0.7 3\ 12\ Linux alpha 1.0 2\ 16\ DEC Alpha 2.0 6\ 43\ DEC Alpha 4.8 16\ 64\ Linux i686 0.5 5\ 17\ DEC Alpha 3.1 10\ 281\ Linux i586 0.9 5\ 16\ DEC Alpha 5.3 14\ 27\ DEC Alpha 5.1 15\ 89\ Sun Ultra1 3.7 20\ 10\ SunOS-5.4 sun4m 8.0 46\ 237\ SunOS-5.4 sun4m 18.0 83\ 37\ SunOS-5.4 sun4m 10.7 57\ 87\ Linux i486 3.3 10\ 112\ Linux i586 1.6 12\ 44\ SunOS-5.4 sun4d 13.7 75\ 113\ IBM Power2 1.2 8\ 16\ IBM PowerPC 2.9 8\ 50\ SunOS-5.4 sun4d 20.8 93\ 136\ HP-UX 9000/735 1.3 3\ 17\ IRIX5.3 IP19 4.3 8\ 20\ IRIX5.3 IP22 3.1 8\ 19\ IRIX64-601 IP26 4.6 24\ 39\ IRIX IP22 3.0 8\ 22\ Linux i586 2.4 9\ 26\ Linux i586 1.8 15\ 30\ Linux i586 1.9 15\ 30\ Linux i586 3.1 24\ 73\ DEC Alpha 13.4 33\ 39\ Sun SC1000 14.0 69\ 175\ FreeBSD/i586 2.9 14\ 22\ FreeBSD/i586 2.7 13\ 21\ IRIX64 IP21 4.2 14\ 30\ HP-UX 9000/770 3.1 9\ 18\ FreeBSD/i586 2.8 13\ 22\ HP-UX 9000/735 3.5 10\ 20\ HP-UX 9000/735 3.5 10\ 19\ IRIX64 IP19 4.5 19\ 37\ HP-UX 9000/819 4.2 67\ 118\ HP-UX 9000/755 3.6 10\ 18\ HP-UX 9000/897 6.7 15\ 37\ IRIX IP19 6.2 19\ 46\ HP-UX 9000/735 3.5 10\ 20\ FreeBSD/i586 2.7 12\ 20\ FreeBSD/i586 3.0 14\ 23\ IRIX64 IP25 3.3 12\ 24\ IRIX64 IP19 4.0 14\ 24\ dgux mc88110 8.8 13\ 67\ .TE .KE ������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_connect.8��������������������������������������������������������������������0000664�0000764�0000764�00000002231�07172615471�016324� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_CONNECT 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_connect \- measure interprocess connection latency via TCP/IP .SH SYNOPSIS .B lat_connect .I -s .sp .5 .B lat_connect [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I hostname .sp .5 .B lat_connect .I "-S hostname" .SH DESCRIPTION .B lat_connect is a client/server program that measures interprocess connection latencies. The benchmark times the creation and connection of an AF_INET (aka TCP/IP) socket to a remote server. Care is take that the connection time does not include any other overhead, such as the \fIgethostbyname()\fP or remote port lookups since these add more overhead than the connection establishment itself. .LP .B lat_connect has three forms of usage: as a server (-s), as a client (lat_connect localhost), and as a shutdown (lat_connect -S localhost). .SH OUTPUT The reported time is in microseconds per connection. Output format is like so .sp .ft CB TCP/IP connection cost to localhost: 1006 microseconds .ft .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
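.SH EXAMPLE
The measurement is conceptually a timed \f(CBsocket\fP()/\f(CBconnect\fP() pair.
The sketch below is not the lmbench implementation; the server address and
port are placeholders, a listening server is assumed to already exist, and
error handling is omitted. A real benchmark must also worry about TIME_WAIT
buildup and listen backlog limits.
.sp
.ft CB
.nf
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define SERVER_IP   "127.0.0.1"     /* placeholder address */
#define SERVER_PORT 8080            /* placeholder port */
#define N           100

int
main(void)
{
        int     i, fd;
        struct sockaddr_in addr;
        struct timeval start, stop;
        double  usecs;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(SERVER_PORT);
        addr.sin_addr.s_addr = inet_addr(SERVER_IP);

        gettimeofday(&start, 0);
        for (i = 0; i < N; i++) {
                fd = socket(AF_INET, SOCK_STREAM, 0);
                connect(fd, (struct sockaddr *)&addr, sizeof(addr));
                close(fd);
        }
        gettimeofday(&stop, 0);
        usecs = (stop.tv_sec - start.tv_sec) * 1000000.0 +
                (stop.tv_usec - start.tv_usec);
        printf("TCP/IP connection cost: %.0f microseconds\\n", usecs / N);
        return 0;
}
.fi
.ft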
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_connect.tbl������������������������������������������������������������������0000664�0000764�0000764�00000001256�07507604561�016744� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS center expand doublebox; l r. DEC Alpha 976 Linux i586 606 IRIX IP22 470 SunOS-5.4 sun4d 852 SunOS-5.4 sun4d 3123 Sun SC1000 4594 IRIX64-601 IP26 316 Linux i586 1155 IRIX5.3 IP22 349 IRIX64 IP21 667 IBM Power2 339 dgux mc88110 4635 DEC Alpha 4700 HP-UX 9000/770 319 HP-UX 9000/755 384 HP-UX 9000/735 389 IRIX64 IP25 716 IRIX64 IP19 763 IRIX5.3 IP19 694 Linux i686 746 Linux i586 775 Linux i586 779 Linux i586 835 Linux i586 1348 Linux i486 1439 DEC Alpha 3047 FreeBSD/i586 454 HP-UX 9000/897 765 FreeBSD/i586 465 FreeBSD/i586 454 FreeBSD/i586 397 IRIX IP19 697 HP-UX 9000/735 388 IRIX64 IP19 805 HP-UX 9000/735 459 HP-UX 9000/819 585 HP-UX 9000/735 740 FreeBSD/i586 481 .TE .KE ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_ctx.8������������������������������������������������������������������������0000664�0000764�0000764�00000006516�07172615471�015503� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_CTX 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_ctx \- context switching benchmark .SH SYNOPSIS .B lat_ctx [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] [ .I "-s <size_in_kbytes>" ] .I "#procs" [ .I "#procs ..." ] .SH DESCRIPTION .B lat_ctx measures context switching time for any reasonable number of processes of any reasonable size. The processes are connected in a ring of Unix pipes. Each process reads a token from its pipe, possibly does some work, and then writes the token to the next process. .LP Processes may vary in number. Smaller numbers of processes result in faster context switches. More than 20 processes is not supported. .LP Processes may vary in size. A size of zero is the baseline process that does nothing except pass the token on to the next process. A process size of greater than zero means that the process does some work before passing on the token. The work is simulated as the summing up of an array of the specified size. The summing is an unrolled loop of about a 2.7 thousand instructions. 
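.LP
To make the structure concrete, the following is a rough sketch of the ring
and the per-process loop. It is not the actual lmbench code: the timing and
the overhead measurement are omitted, error handling is skipped, and unused
pipe ends are left open for brevity.
.sp
.ft CB
.nf
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NPROC   4       /* ring size; lat_ctx takes this on the command line */
#define KB      16      /* simulated process "size" in kilobytes */
#define LAPS    1000

int
main(void)
{
        int     p[NPROC][2];    /* pipe i feeds process i */
        int     i, n, lap, me = 0;
        char    token = 't';
        long    sum = 0;
        int     *data;

        for (i = 0; i < NPROC; i++)
                pipe(p[i]);
        for (i = 1; i < NPROC; i++)
                if (fork() == 0) {
                        me = i;         /* I am process i in the ring */
                        break;
                }
        n = KB * 1024 / sizeof(int);
        data = calloc(n, sizeof(int));

        if (me == 0)                    /* process 0 starts the token moving */
                write(p[1 % NPROC][1], &token, 1);
        for (lap = 0; lap < LAPS; lap++) {
                read(p[me][0], &token, 1);
                for (i = 0; i < n; i++)
                        sum += data[i]; /* the cache-polluting "work" */
                write(p[(me + 1) % NPROC][1], &token, 1);
        }
        return sum == 42;       /* use sum so it is not optimized away */
}
.fi
.ft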
.LP The effect is that both the data and the instruction cache get polluted by some amount before the token is passed on. The data cache gets polluted by approximately the process ``size''. The instruction cache gets polluted by a constant amount, approximately 2.7 thousand instructions. .LP The pollution of the caches results in larger context switching times for the larger processes. This may be confusing because the benchmark takes pains to measure only the context switch time, not including the overhead of doing the work. The subtle point is that the overhead is measured using hot caches. As the number and size of the processes increases, the caches are more and more polluted until the set of processes do not fit. The context switch times go up because a context switch is defined as the switch time plus the time it takes to restore all of the process state, including cache state. This means that the switch includes the time for the cache misses on larger processes. .SH OUTPUT Output format is intended as input to \fBxgraph\fP or some similar program. The format is multi line, the first line is a title that specifies the size and non-context switching overhead of the test. Each subsequent line is a pair of numbers that indicates the number of processes and the cost of a context switch. The overhead and the context switch times are in micro second units. The numbers below are for a SPARCstation 2. .sp .ft CB .nf "size=0 ovr=179 2 71 4 104 8 134 16 333 20 438 .br .fi .ft .SH BUGS The numbers produced by this benchmark are somewhat inaccurate; they vary by about 10 to 15% from run to run. A series of runs may be done and the lowest numbers reported. The lower the number the more accurate the results. .LP The reasons for the inaccuracies are possibly interaction between the VM system and the processor caches. It is possible that sometimes the benchmark processes are laid out in memory such that there are fewer TLB/cache conflicts than other times. This is pure speculation on our part. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_disk.tbl���������������������������������������������������������������������0000664�0000764�0000764�00000000510�07507604561�016235� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS center expand doublebox; l r. 
SunOS-5.4 sun4m 2876 Sun SC1000 1466 DEC Alpha 1436 DEC Alpha 1995 IRIX IP22 984 Sun Ultra1 2242 HP-UX 9000/770 732 IRIX IP19 920 IRIX5.3 IP22 1265 IRIX5.3 IP19 991 DEC Alpha 2057 DEC Alpha 3729 FreeBSD/i586 297 FreeBSD/i586 306 FreeBSD/i586 2314 FreeBSD/i586 2284 FreeBSD/i586 310 .TE .KE lmbench-3.0-a9/doc/lat_fcntl.80000664000076400007640000000151007172615471016000 0ustar staelinstaelin .\" $Id$ .TH LAT_FCNTL 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_fcntl \- fcntl file locking benchmark .SH SYNOPSIS .B lat_fcntl [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B lat_fcntl is a client/server program that measures file locking latencies. The benchmark alternately locks and unlocks files so that only one of the client or server is running at a time, similar to ``hot potato'' message passing benchmarks. No other work is done in the processes. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. lmbench-3.0-a9/doc/lat_fifo.80000664000076400007640000000155307172615471015624 0ustar staelinstaelin .\" $Id$ .TH LAT_FIFO 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_fifo \- FIFO benchmark .SH SYNOPSIS .B lat_fifo [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B lat_fifo is a client/server program that measures interprocess communication latencies. The benchmark passes a message back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. The message is passed back and forth using FIFOs. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_fcntl(8), lat_tcp(8), lat_udp(8), lat_unix(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome.
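.SH EXAMPLE
The measurement is the same ``hot potato'' exchange done over two named
FIFOs. The sketch below is not the lmbench source; the FIFO path names are
arbitrary and error handling is omitted.
.sp
.ft CB
.nf
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>

#define ROUNDS  10000

int
main(void)
{
        int     c2s, s2c, i;
        char    c = 'x';
        struct timeval start, stop;
        double  usecs;

        /* two rendezvous files; any writable directory will do */
        mkfifo("/tmp/lat_fifo.c2s", 0600);
        mkfifo("/tmp/lat_fifo.s2c", 0600);

        if (fork() == 0) {              /* "server": echo each byte back */
                c2s = open("/tmp/lat_fifo.c2s", O_RDONLY);
                s2c = open("/tmp/lat_fifo.s2c", O_WRONLY);
                while (read(c2s, &c, 1) == 1)
                        write(s2c, &c, 1);
                exit(0);
        }
        c2s = open("/tmp/lat_fifo.c2s", O_WRONLY);      /* "client" */
        s2c = open("/tmp/lat_fifo.s2c", O_RDONLY);
        gettimeofday(&start, 0);
        for (i = 0; i < ROUNDS; i++) {
                write(c2s, &c, 1);
                read(s2c, &c, 1);
        }
        gettimeofday(&stop, 0);
        usecs = (stop.tv_sec - start.tv_sec) * 1000000.0 +
                (stop.tv_usec - start.tv_usec);
        printf("FIFO latency: %.0f microseconds\\n", usecs / ROUNDS);
        unlink("/tmp/lat_fifo.c2s");
        unlink("/tmp/lat_fifo.s2c");
        return 0;
}
.fi
.ft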
�����������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_fs.8�������������������������������������������������������������������������0000664�0000764�0000764�00000001704�07045412511�015274� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_FS 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_fs \- measure file system create/delete performance .SH SYNOPSIS .B lat_fs [ .I dir ] .SH DESCRIPTION .B lat_fs is a program that creates a number of small files in the current working directory and then removes the files. Both the creation and removal of the files is timed. .SH OPTIONS If .I dir is specified, .B lat_fs will change to that directory first and do the creates and deletes there. Otherwise the creates and deletes are done in $PWD. .SH OUTPUT The results are in terms of creates per second and deletes per second as a function of file size. The output is in 4 column form and is the size of the file, the number created, the creations per second, and the removals per second. Output format looks like: .sp .ft CB .nf 0k 500 1304 2740 1k 500 904 1663 4k 500 861 1647 10k 500 674 1516 .fi .ft .SH "SEE ALSO" lmbench(8). ������������������������������������������������������������lmbench-3.0-a9/doc/lat_fs.tbl�����������������������������������������������������������������������0000664�0000764�0000764�00000002062�07507607546�015725� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c l r r. 
System Create \fBDelete\fP = Linux i586 1.4 0.1 IRIX64-601 IP26 0.9 0.1 Linux i586 1.5 0.1 Linux i586 1.1 0.1 Linux i586 1.4 0.1 Linux i686 1.2 0.1 SunOS-5.4 sun4d 0.7 0.4 SunOS-5.4 sun4d 18.2 8.3 Linux i586 1.4 0.1 Linux i486 0.8 0.1 Linux i486 0.8 0.1 Linux i586 2.7 0.2 Sun SC1000 3.7 1.3 Linux alpha 4.3 4.2 DEC Alpha 25.0 11.4 DEC Alpha 25.0 11.1 DEC Alpha 0.8 0.3 DEC Alpha 1.3 0.5 DEC Alpha 38.5 12.3 DEC Alpha 33.3 11.9 DEC Alpha 23.3 11.5 IRIX64 IP25 3.5 4.0 IRIX64 IP19 3.1 5.0 IRIX IP22 13.3 8.4 Linux alpha 25.0 11.5 DEC Alpha 25.6 14.1 dgux mc88110 2.4 0.5 HP-UX 9000/735 2.8 3.9 FreeBSD/i586 20.0 8.3 FreeBSD/i586 20.4 8.3 FreeBSD/i586 22.7 8.3 FreeBSD/i586 22.7 8.3 FreeBSD/i586 19.6 8.3 IRIX IP19 12.0 11.8 IRIX5.3 IP19 11.5 11.2 IBM Power2 13.3 12.8 IRIX5.3 IP22 9.4 8.5 HP-UX 9000/735 28.6 11.5 IRIX64 IP21 11.9 11.5 IBM PowerPC 12.7 12.7 HP-UX 9000/770 20.0 11.1 HP-UX 9000/735 15.4 11.1 HP-UX 9000/819 3.7 11.8 HP-UX 9000/897 58.8 17.2 HP-UX 9000/755 26.3 11.2 IRIX64 IP19 12.5 9.8 HP-UX 9000/735 26.3 12.0 .TE .KE ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_http.8�����������������������������������������������������������������������0000664�0000764�0000764�00000002021�07172615471�015647� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_FCNTL 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_fcntl \- fcntl file locking benchmark .SH SYNOPSIS .B lat_ctx [ .I "-d" ] [ .I "-e" ] [ .I "-S" ] .I serverhost [ .I port ] .SH DESCRIPTION .B lat_http is a client/server program that measures simple http transaction latencies. It has its own HTTP server, and it is meant to simply measure the minimum overall costs of simple HTTP ``GET'' transactions. It does not measure the performance of third-party HTTP servers. .LP The client simply makes a series of HTTP GET requests for files. The files are a fixed set of files included with the benchmark. No special care was made to ensure that the file sizes match and predetermined distribution. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_connect(8), lat_tcp(8), lat_sig(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
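.SH EXAMPLE
One transaction is conceptually a timed connect/GET/read-to-EOF cycle.
The sketch below is not the lmbench implementation; the host, port, and
document name are placeholders, and error handling is omitted.
.sp
.ft CB
.nf
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
        int     fd, n;
        char    buf[4096];
        char    req[] = "GET /index.html HTTP/1.0\\r\\n\\r\\n";  /* placeholder document */
        struct sockaddr_in addr;
        struct timeval start, stop;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(8080);                    /* placeholder port */
        addr.sin_addr.s_addr = inet_addr("127.0.0.1");  /* placeholder host */

        gettimeofday(&start, 0);
        fd = socket(AF_INET, SOCK_STREAM, 0);
        connect(fd, (struct sockaddr *)&addr, sizeof(addr));
        write(fd, req, strlen(req));
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                ;                                       /* discard the reply */
        close(fd);
        gettimeofday(&stop, 0);
        printf("GET took %.0f microseconds\\n",
            (stop.tv_sec - start.tv_sec) * 1000000.0 +
            (stop.tv_usec - start.tv_usec));
        return 0;
}
.fi
.ft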
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_ipc.tbl����������������������������������������������������������������������0000664�0000764�0000764�00000000514�07507607546�016070� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l l c c c l l r r r. System Network \fBTCP bw\fP TCP latency UDP latency = IRIX IP21 hippi 62 1068 1099 SunOS-5.5 sun4u@167 100baseT 9.5 280 308 HP-UX 9000/735 fddi 8.8 425 441 IRIX IP22 10baseT .9 543 602 IRIX IP21 10baseT .9 1463 1376 HP-UX 9000/735 10baseT .9 592 603 Linux 10baseT .7 2954 1912 .TE .KE ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_mem_rd.8���������������������������������������������������������������������0000664�0000764�0000764�00000006662�07172615471�016152� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_MEM_RD 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_mem_rd \- memory read latency benchmark .SH SYNOPSIS .B lat_mem_rd [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I "size_in_megabytes" .I "stride" [ .I "stride stride..." ] .SH DESCRIPTION .B lat_mem_rd measures memory read latency for varying memory sizes and strides. The results are reported in nanoseconds per load and have been verified accurate to within a few nanoseconds on an SGI Indy. .LP The entire memory hierarchy is measured, including onboard cache latency and size, external cache latency and size, main memory latency, and TLB miss latency. .LP Only data accesses are measured; the instruction cache is not measured. .LP The benchmark runs as two nested loops. The outer loop is the stride size. The inner loop is the array size. For each array size, the benchmark creates a ring of pointers that point backward one stride. Traversing the array is done by .sp .ft CB p = (char **)*p; .ft .sp in a for loop (the over head of the for loop is not significant; the loop is an unrolled loop 100 loads long). .LP The size of the array varies from 512 bytes to (typically) eight megabytes. For the small sizes, the cache will have an effect, and the loads will be much faster. This becomes much more apparent when the data is plotted. .LP Since this benchmark uses fixed-stride offsets in the pointer chain, it may be vulnerable to smart, stride-sensitive cache prefetching policies. 
Older machines were typically able to prefetch for sequential access patterns, and some were able to prefetch for strided forward access patterns, but only a few could prefetch for backward strided patterns. These capabilities are becoming more widespread in newer processors. .SH OUTPUT Output format is intended as input to \fBxgraph\fP or some similar program (we use a perl script that produces pic input). There is a set of data produced for each stride. The data set title is the stride size and the data points are the array size in megabytes (floating point value) and the load latency over all points in that array. .SH "INTERPRETING THE OUTPUT" The output is best examined in a graph where you typically get a graph that has four plateaus. The graph should plotted in log base 2 of the array size on the X axis and the latency on the Y axis. Each stride is then plotted as a curve. The plateaus that appear correspond to the onboard cache (if present), external cache (if present), main memory latency, and TLB miss latency. .LP As a rough guide, you may be able to extract the latencies of the various parts as follows, but you should really look at the graphs, since these rules of thumb do not always work (some systems do not have onboard cache, for example). .IP "onboard cache" 16 Try stride of 128 and array size of .00098. .IP "external cache" Try stride of 128 and array size of .125. .IP "main memory" Try stride of 128 and array size of 8. .IP "TLB miss" Try the largest stride and the largest array. .SH BUGS This program is dependent on the correct operation of .BR mhz (8). If you are getting numbers that seem off, check that .BR mhz (8) is giving you a clock rate that you believe. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), tlb(8), cache(8), line(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_mmap.8�����������������������������������������������������������������������0000664�0000764�0000764�00000002141�07172615471�015625� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_MMAP 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_mmap \- costs of mmapping and unmmapping varying file sizes .SH SYNOPSIS .B lat_mmap [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I size .I file .SH DESCRIPTION .B lat_mmap times how fast a mapping can be made and unmade. This is useful because it is a fundemental part of processes that use SunOS style shared libraries (the libraries are mapped in at process start up time and unmapped at process exit). .LP The benchmark maps in and unmaps the first \fIsize\fP bytes of the file repeatedly and reports the average time for one mapping/unmapping. .LP The size specification may end with ``k'' or ``m'' to mean kilobytes (* 1024) or megabytes (* 1024 * 1024). 
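.LP
The heart of the measurement is simply a timed map/unmap pair over an
already-open file. The following sketch is not the actual lat_mmap source;
it takes the size and file on the command line, does no error checking,
and reports the average cost.
.sp
.ft CB
.nf
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>

#define N 1000

int
main(int argc, char **argv)
{
        /* usage: a.out size_in_bytes file */
        size_t  size = atol(argv[1]);
        int     fd = open(argv[2], O_RDONLY);
        int     i;
        void    *p;
        struct timeval start, stop;
        double  usecs;

        gettimeofday(&start, 0);
        for (i = 0; i < N; i++) {
                p = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
                munmap(p, size);
        }
        gettimeofday(&stop, 0);
        usecs = (stop.tv_sec - start.tv_sec) * 1000000.0 +
                (stop.tv_usec - start.tv_usec);
        printf("%.2f %.0f\\n", size / (1024.0 * 1024.0), usecs / N);
        return 0;
}
.fi
.ft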
.SH OUTPUT Output format is \f(CB"%0.2f %d\\n", megabytes, usecs\fP, i.e., .sp .ft CB 8.00 1200 .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_nullsys.tbl������������������������������������������������������������������0000664�0000764�0000764�00000001476�07507604561�017030� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS center expand doublebox; l r. SunOS-5.4 sun4m 7 Sun SC1000 9 SunOS-5.4 sun4d 12 SunOS-5.4 sun4m 9 SunOS-5.4 sun4m 13 Linux alpha 2 Linux i586 2 Linux i586 2 Unixware/i686 5 Sun Ultra1 5 DEC Alpha 9 Linux i586 3 Linux i586 3 Linux alpha 3 DEC Alpha 11 DEC Alpha 12 DEC Alpha 15 IBM PowerPC 12 DEC Alpha 17 FreeBSD/i586 7 FreeBSD/i586 9 FreeBSD/i586 10 DEC Alpha 17 FreeBSD/i586 7 SunOS-5.4 sun4d 26 Linux i686 4 Linux i586 5 Linux i586 5 Linux i486 6 Linux i486 6 DEC Alpha 9 DEC Alpha 13 HP-UX 9000/735 12 HP-UX 9000/735 13 HP-UX 9000/735 14 IRIX5.3 IP19 20 HP-UX 9000/755 14 HP-UX 9000/819 19 IRIX64 IP25 23 IRIX IP22 10 IRIX IP19 16 IRIX64 IP19 18 IRIX64 IP19 24 FreeBSD/i586 9 HP-UX 9000/770 11 HP-UX 9000/897 92 HP-UX 9000/735 12 dgux mc88110 75 IBM Power2 16 IRIX64-601 IP26 20 IRIX64 IP21 25 IRIX5.3 IP22 11 .TE .KE ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_ops.8������������������������������������������������������������������������0000664�0000764�0000764�00000001675�07172615471�015507� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_ops \- basic CPU operation parallelism .SH SYNOPSIS .B lat_ops [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B lat_ops measures the latency of basic CPU operations, such as integer ADD. .TP integer bit, add, mul, div, mod operations maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations. .TP uint64 bit, add, mul, div, mod operations maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations. .TP float add, mul, div operations maximum parallelism for flot ADD, MUL, DIV operations. .TP double add, mul, div operations maximum parallelism for flot ADD, MUL, DIV operations. 
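.LP
To give the flavor of the measurement, operation latency can be estimated by
timing a long chain of operations in which each result feeds the next, so the
operations cannot overlap. The sketch below is not the lat_ops implementation
(which uses much longer unrolled chains and accounts for loop overhead); it
only illustrates the dependent-chain idea for integer add.
.sp
.ft CB
.nf
#include <stdio.h>
#include <sys/time.h>

#define ITERS 1000000

int
main(void)
{
        register int r = 1;
        long    i;
        struct timeval start, stop;
        double  nsecs;

        gettimeofday(&start, 0);
        for (i = 0; i < ITERS; i++) {
                /* each add depends on the previous one, so they serialize */
                r += i; r += i; r += i; r += i;
                r += i; r += i; r += i; r += i;
        }
        gettimeofday(&stop, 0);
        nsecs = ((stop.tv_sec - start.tv_sec) * 1000000.0 +
                 (stop.tv_usec - start.tv_usec)) * 1000.0;
        printf("integer add: %.2f ns per op (r=%d)\\n",
            nsecs / (ITERS * 8.0), r);
        return 0;
}
.fi
.ft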
.SH BUGS This benchmark is highly experimental and may sometimes (frequently?) give erroneous results. .SH "SEE ALSO" lmbench(8), par_ops(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �������������������������������������������������������������������lmbench-3.0-a9/doc/lat_pagefault.8������������������������������������������������������������������0000664�0000764�0000764�00000002252�07172615471�016646� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_PAGEFAULT 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_pagefault \- measure the cost of pagefaulting pages from a file .SH SYNOPSIS .B lat_pagefault [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I file [ .I file.... ] .SH DESCRIPTION .B lat_pagefault times how fast a page of a file can be faulted in. The file is flushed from (local) memory by using the \f(CBmsync()\fP interface with the invalidate flag set. (Note that NFS does not send this over the wire so this makes for a handy way to measure the cost of going across the wire.) .LP The benchmark maps in the entire file and the access pages backwards using a stride of 256K kilobytes. .SH OUTPUT Output format is below; it prints the average cost of page faulting a page. .sp .ft CB Pagefaults on <file>: <d> usecs .ft .SH BUGS Using a stride of 256K may be a bad idea because SCSI controllers may have caches bigger than that. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_pipe.8�����������������������������������������������������������������������0000664�0000764�0000764�00000002005�07172615471�015627� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_PIPE 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_pipe \- measure interprocess communication latency through pipes .SH SYNOPSIS .B lat_pipe [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B lat_pipe uses two processes communicating through a Unix pipe to measure interprocess communication latencies. The benchmark passes a token back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. 
.SH OUTPUT The reported time is in microseconds per round trip and includes the total time, i.e., the context switching overhead is includeded. Output format is like so .sp .ft CB Pipe latency: 491 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_pipe.tbl���������������������������������������������������������������������0000664�0000764�0000764�00000001571�07507604561�016250� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS center expand doublebox; l r. SunOS-5.4 sun4m 194 SunOS-5.4 sun4m 150 DEC Alpha 141 Linux alpha 34 Linux i486 56 Linux i486 56 Unixware/i686 86 Linux i586 33 Sun Ultra1 62 SunOS-5.4 sun4m 372 Linux alpha 34 DEC Alpha 162 DEC Alpha 191 Linux i586 42 DEC Alpha 71 DEC Alpha 179 Sun SC1000 278 IBM PowerPC 65 dgux mc88110 474 SunOS-5.4 sun4d 519 FreeBSD/i586 104 FreeBSD/i586 111 FreeBSD/i586 115 SunOS-5.4 sun4d 671 Linux i586 84 Linux i686 31 Linux i586 43 Linux i586 43 Linux i586 140 DEC Alpha 185 DEC Alpha 198 DEC Alpha 278 HP-UX 9000/755 193 HP-UX 9000/897 118 IRIX64 IP19 187 HP-UX 9000/770 148 HP-UX 9000/819 113 HP-UX 9000/735 181 FreeBSD/i586 115 IRIX IP22 118 HP-UX 9000/735 178 HP-UX 9000/735 169 HP-UX 9000/735 172 IRIX64 IP21 264 IRIX5.3 IP19 366 IBM Power2 91 IRIX64 IP25 230 IRIX64-601 IP26 222 IRIX64 IP19 251 IRIX IP19 333 FreeBSD/i586 127 IRIX5.3 IP22 131 .TE .KE ���������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_proc.8�����������������������������������������������������������������������0000664�0000764�0000764�00000003137�07172615471�015644� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_PROC 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_proc \- process creation tests .SH SYNOPSIS .B lat_proc [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I "procedure|fork|exec|shell" .SH DESCRIPTION .B lat_proc creates processes in three different forms, each more expensive than the last. The purposes is to measure the time that it takes to create a basic thread of control. 
.LP The forms are listed and described below: .TP 20 Process fork+exit The time it takes to split a process into two (nearly) identical copies and have one exit. This is how new processes are created but is not very useful since both processes are doing the same thing. .TP Process fork+execve The time it takes to create a new process and have that new process run a new program. This is the inner loop of all shells (command interpreters). .TP Process fork+/bin/sh -c The time it takes to create a new process and have that new process run a new program by asking the system shell to find that program and run it. This is how the C library interface called \f(CBsystem\fP is implemented. It is the most general and the most expensive. .SH OUTPUT Output is in microseconds per operation like so: .sp .ft CB .nf Process fork+exit: 6054 microseconds Process fork+execve: 11212 microseconds Process fork+/bin/sh -c: 44346 microseconds .br .fi .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_rpc.8������������������������������������������������������������������������0000664�0000764�0000764�00000003335�07172615471�015465� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_RPC 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_rpc \- measure interprocess communication latency via Sun RPC .SH SYNOPSIS .B lat_rpc .I -s .sp .5 .B lat_rpc [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] [ .I "-p tcp|udp" ] .I hostname [ .I "udp|tcp" ] .sp .5 .B lat_rpc .I "-S hostname" .SH DESCRIPTION .B lat_rpc is a client/server program that measures interprocess communication latencies. The benchmark passes a token back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. .LP This benchmark may be compared to the TCP and UDP forms of the same benchmark to accurately see the cost of using RPC versus the cost of using plain old TCP or UDP sockets. It is worth noting that the RPC form is passing back and forth a single byte, not some long complicated record. .LP .B lat_rpc has three forms of usage: as a server (-s), as a client (lat_rpc localhost), and as a shutdown (lat_rpc -S localhost). .LP The client form may specify the protocol over which the RPCs are performed. The default is to measure performance for both .I udp and .IR tcp . .SH OUTPUT The reported time is in microseconds per round trip and includes the total time, i.e., the context switching overhead is includeded. 
Output format is like so .sp .ft CB RPC/udp latency using localhost: 1344 microseconds .br RPC/tcp latency using localhost: 2089 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_select.8���������������������������������������������������������������������0000664�0000764�0000764�00000001102�07172615471�016146� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_SELECT 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_select \- select benchmark .SH SYNOPSIS .B lat_ctx [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] [ .I "n" ] .SH DESCRIPTION .B lat_select measures the time to do a select on .I n file descriptors. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_sig.8������������������������������������������������������������������������0000664�0000764�0000764�00000001203�07172615471�015453� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_SIG 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_sig \- select benchmark .SH SYNOPSIS .B lat_ctx [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I "install|catch|prot" [ .I "file" ] .SH DESCRIPTION .B lat_sig measures the time to install and catch signals. It can also measure the time to catch a protection fault. .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
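.SH EXAMPLE
A rough sketch of the ``catch'' case, assuming \f(CBSIGUSR1\fP and
\f(CBgettimeofday\fP() for timing. This is not the lmbench implementation,
and it does not separate out the cost of \f(CBkill\fP() itself; error
handling is omitted.
.sp
.ft CB
.nf
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>

#define N 100000

static volatile long caught;

static void
handler(int sig)
{
        (void)sig;
        caught++;
}

int
main(void)
{
        long    i;
        struct sigaction sa;
        struct timeval start, stop;
        double  usecs;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, 0);

        gettimeofday(&start, 0);
        for (i = 0; i < N; i++)
                kill(getpid(), SIGUSR1);  /* delivered to ourselves before kill() returns */
        gettimeofday(&stop, 0);
        usecs = (stop.tv_sec - start.tv_sec) * 1000000.0 +
                (stop.tv_usec - start.tv_usec);
        printf("caught %ld signals, %.2f usecs each\\n", caught, usecs / N);
        return 0;
}
.fi
.ft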
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_signal.tbl�������������������������������������������������������������������0000664�0000764�0000764�00000001414�07507604561�016564� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c l r r. System sigaction \fBsig handler\fP = DEC Alpha 20 30 IRIX5.3 IP22 5 9 IRIX IP22 10 12 IRIX64-601 IP26 11 10 Linux i586 11 22 Linux i586 12 22 DEC Alpha 5 101 Linux alpha 13 38 Linux i486 6 45 Linux alpha 18 37 Linux i586 9 25 Linux i586 8 50 dgux mc88110 5 16 FreeBSD/i586 4 16 FreeBSD/i586 10 34 Linux i486 7 52 FreeBSD/i586 9 34 DEC Alpha 6 138 IRIX64 IP19 6 9 IRIX5.3 IP19 4 8 IRIX64 IP21 5 13 Linux i686 4 14 Linux i586 4 23 Linux i586 6 23 HP-UX 9000/897 10 38 IRIX64 IP19 4 35 HP-UX 9000/770 10 37 HP-UX 9000/819 11 54 HP-UX 9000/755 10 52 HP-UX 9000/735 10 38 HP-UX 9000/735 6 32 IRIX IP19 6 79 HP-UX 9000/735 5 55 IRIX64 IP25 5 55 IBM PowerPC 5 19 FreeBSD/i586 13 56 IBM Power2 52 355 HP-UX 9000/735 15 47 FreeBSD/i586 18 52 .TE .KE ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_syscall.8��������������������������������������������������������������������0000664�0000764�0000764�00000002661�07172615471�016354� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_SYSCALL 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_syscall - time simple entry into the operating system .SH SYNOPSIS .B lat_syscall [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I "null|read|write|stat|fstat|open" [ .I file ] .SH DESCRIPTION .TP null measures how long it takes to do .IR getppid (). We chose .IR getppid () because in all UNIX variants we are aware of, it requires a round-trip to/from kernel space and the actual work required inside the kernel is small and bounded. .TP read measures how long it takes to read one byte from \f(CB/dev/zero\fP. Note that some operating systems do not support \f(CB/dev/zero\fP. .TP write times how long it takes to write one byte to \f(CB/dev/null\fP. This is useful as a lower bound cost on anything that has to interact with the operating system. .TP stat measures how long it takes to .IR stat () a file whose inode is already cached. .TP fstat measures how long it takes to .IR fstat () an open file whose inode is already cached. .TP open measures how long it takes to .IR open () and then .IR close() a file. 
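.LP
As an illustration, the ``null'' case is essentially a timed
\f(CBgetppid\fP() loop. The sketch below is not the lmbench source (which
uses a shared timing harness); it simply averages over a fixed number of
calls.
.sp
.ft CB
.nf
#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

#define N 1000000

int
main(void)
{
        long    i;
        struct timeval start, stop;
        double  usecs;

        gettimeofday(&start, 0);
        for (i = 0; i < N; i++)
                getppid();      /* small, bounded amount of work in the kernel */
        gettimeofday(&stop, 0);
        usecs = (stop.tv_sec - start.tv_sec) * 1000000.0 +
                (stop.tv_usec - start.tv_usec);
        printf("Null syscall: %.2f microseconds\\n", usecs / N);
        return 0;
}
.fi
.ft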
.SH OUTPUT Output format is .sp .ft CB Null syscall: 67 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_tcp.8������������������������������������������������������������������������0000664�0000764�0000764�00000002440�07172615471�015463� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_TCP 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_tcp \- measure interprocess communication latency via TCP/IP .SH SYNOPSIS .B lat_tcp .I -s .sp .5 .B lat_tcp [ .I "-m <message size>" ] [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I hostname .sp .5 .B lat_tcp .I "-S hostname" .SH DESCRIPTION .B lat_tcp is a client/server program that measures interprocess communication latencies. The benchmark passes a message back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. .LP .B lat_tcp has three forms of usage: as a server (-s), as a client (lat_tcp localhost), and as a shutdown (lat_tcp -S localhost). .SH OUTPUT The reported time is in microseconds per round trip and includes the total time, i.e., the context switching overhead is includeded. Output format is like so .sp .ft CB TCP latency using localhost: 700 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8), lat_unix(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_tcp.tbl����������������������������������������������������������������������0000664�0000764�0000764�00000002141�07507604561�016073� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c l r r. 
System TCP \fBRPC/TCP\fP = DEC Alpha 485 788 DEC Alpha 581 822 Linux alpha 419 617 DEC Alpha 629 994 DEC Alpha 428 851 DEC Alpha 267 371 DEC Alpha 526 872 DEC Alpha 412 673 Linux i686 263 427 Sun SC1000 855 1386 DEC Alpha 826 1451 Sun Ultra1 162 346 Linux alpha 429 602 Linux i586 1149 1434 SunOS-5.4 sun4m 560 1196 SunOS-5.4 sun4d 1006 1584 SunOS-5.4 sun4m 826 1631 SunOS-5.4 sun4m 335 784 SunOS-5.4 sun4d 1211 1847 Linux i586 467 713 Linux i486 1592 2147 FreeBSD/i586 264 450 FreeBSD/i586 297 510 IRIX5.3 IP22 278 641 IRIX64-601 IP26 467 1018 IRIX IP22 279 580 Linux i586 477 718 Linux i586 1196 1506 Linux i586 1291 1668 Linux i486 1465 2078 IBM PowerPC 299 698 FreeBSD/i586 312 548 HP-UX 9000/735 222 707 FreeBSD/i586 290 532 HP-UX 9000/770 186 712 FreeBSD/i586 295 535 HP-UX 9000/819 393 668 HP-UX 9000/735 257 805 HP-UX 9000/755 262 812 HP-UX 9000/735 245 800 HP-UX 9000/897 286 854 dgux mc88110 1381 1851 IBM Power2 332 649 IRIX64 IP25 482 806 IRIX IP19 766 913 IRIX64 IP21 643 974 IRIX64 IP19 886 957 HP-UX 9000/735 248 820 IRIX64 IP19 546 900 IRIX5.3 IP19 815 1006 .TE .KE �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_udp.8������������������������������������������������������������������������0000664�0000764�0000764�00000002422�07172615471�015465� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_UDP 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_udp \- measure interprocess communication latency via UDP/IP .SH SYNOPSIS .B lat_udp .I -s .sp .5 .B lat_udp [ .I "-m <message size>" ] [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .I hostname .sp .5 .B lat_udp .I "-S hostname" .SH DESCRIPTION .B lat_udp is a client/server program that measures interprocess communication latencies. The benchmark passes a message back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. .LP .B lat_udp has three forms of usage: as a server (-s), as a client (lat_udp localhost), and as a shutdown (lat_udp -S localhost). .SH OUTPUT The reported time is in microseconds per round trip and includes the total time, i.e., the context switching overhead is included. Output format is like so .sp .ft CB UDP latency using localhost: 650 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_unix(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
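.SH EXAMPLE
The three forms are typically used in sequence, for instance against
\f(CWlocalhost\fP as in the description above:
.sp
.nf
.ft CW
lat_udp -s              # start the server
lat_udp localhost       # run the client; report round-trip latency
lat_udp -S localhost    # shut the server down
.ft
.fi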
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_udp.tbl����������������������������������������������������������������������0000664�0000764�0000764�00000002033�07507604561�016075� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.KS .TS expand doublebox; l c c l r r. System UDP \fBRPC/UDP\fP = DEC Alpha 404 718 Linux alpha 180 317 Linux alpha 199 330 DEC Alpha 259 358 Linux i686 112 217 Linux i486 368 770 Linux i586 187 366 Linux i586 276 538 DEC Alpha 379 717 DEC Alpha 676 765 DEC Alpha 489 834 Sun Ultra1 197 267 Linux i586 281 552 Linux i586 272 553 SunOS-5.4 sun4m 414 622 SunOS-5.4 sun4m 914 1290 DEC Alpha 569 836 Sun SC1000 739 1101 SunOS-5.4 sun4m 590 935 FreeBSD/i586 213 387 FreeBSD/i586 249 408 HP-UX 9000/819 413 655 IRIX5.3 IP22 313 671 IRIX64-601 IP26 474 1008 IRIX IP22 261 562 Linux i486 351 831 DEC Alpha 709 1109 SunOS-5.4 sun4d 1084 1430 SunOS-5.4 sun4d 1180 1562 IRIX IP19 796 903 FreeBSD/i586 240 420 IBM Power2 254 531 IBM PowerPC 206 536 FreeBSD/i586 265 459 IRIX64 IP21 660 783 dgux mc88110 1373 2175 HP-UX 9000/897 289 673 HP-UX 9000/770 185 657 HP-UX 9000/735 244 742 IRIX5.3 IP19 785 960 IRIX64 IP25 486 740 HP-UX 9000/735 248 759 HP-UX 9000/735 246 768 HP-UX 9000/735 252 786 IRIX64 IP19 814 964 HP-UX 9000/755 244 832 IRIX64 IP19 678 893 .TE .KE �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_unix.8�����������������������������������������������������������������������0000664�0000764�0000764�00000002146�07172615471�015663� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_UNIX 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME lat_unix \- measure interprocess communication latency via UNIX sockets .SH SYNOPSIS .B lat_unix [ .I "-m <message size>" ] [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B lat_unix is a client/server program that measures interprocess communication latencies. The benchmark passes a message back and forth between the two processes (this sort of benchmark is frequently referred to as a ``hot potato'' benchmark). No other work is done in the processes. .SH OUTPUT The reported time is in microseconds per round trip and includes the total time, i.e., the context switching overhead is includeded. 
Output format is like so .sp .ft CB AF_UNIX sock stream latency: 700 microseconds .ft .SH ACKNOWLEDGEMENT Funding for the development of this tool was provided by Sun Microsystems Computer Corporation. .SH "SEE ALSO" lmbench(8), lat_fcntl(8), lat_fifo(8), lat_tcp(8), lat_udp(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lat_unix_connect.8���������������������������������������������������������������0000664�0000764�0000764�00000001654�07172615471�017377� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LAT_UNIX_CONNECT 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lat_unix_connect \- measure interprocess connection latency via UNIX sockets .SH SYNOPSIS .B lat_unix_connect .I -s .sp .5 .B lat_unix_connect [ .I "-P <parallelism>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .sp .5 .B lat_unix_connect .I "-S" .SH DESCRIPTION .B lat_unix_connect is a client/server program that measures interprocess connection latencies. The benchmark times the creation and connection of an AF_UNIX socket to a local server. .LP .B lat_connect has three forms of usage: as a server (-s), as a client (lat_connect), and as a shutdown (lat_connect -S). .SH OUTPUT The reported time is in microseconds per connection. Output format is like so .sp .ft CB UNIX connection cost: 1006 microseconds .ft .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ������������������������������������������������������������������������������������lmbench-3.0-a9/doc/line.8���������������������������������������������������������������������������0000664�0000764�0000764�00000002645�10425064724�014765� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LINE 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME line \- cache line size .SH SYNOPSIS .B line [ .I "-M <len>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B line tries to determine the cache line size in bytes of the largest cache which is smaller than .I len bytes. .LP .B line creates pointer chains which access the first word on each cache line on a page (randomly meandering through all the lines in a page before jumping to the next page). It measures the average memory latency for a variety of line sizes, starting with a line size of one word. 
When it finds an increase in the average latency that is significantly larger than the latency for the smaller line size then it assumes that it has found the line size. .LP This algorithm works because for line sizes less than the true line size, at least two .B line cache lines fit in the space of a true cache line. Since that cache line will be accessed twice, the first access will cause an expensive cache miss, while the second access will be a cache hit. Once the .B line cache line is equal to the true cache line size, then all accesses will cause cache misses. .SH BUGS .B line is an experimental benchmark, but it seems to work well on most systems. .SH "SEE ALSO" lmbench(8), tlb(8), cache(8), par_mem(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. �������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmbench.3������������������������������������������������������������������������0000664�0000764�0000764�00000016015�07632601624�015437� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" .\" @(#)lmbench.man 3.0 2000/10/12 .\" .\" lmbench - benchmarking toolbox .\" .\" Copyright (C) 1998-2000 Carl Staelin and Larry McVoy .\" E-mail: staelin@hpl.hp.com .\" .TH "LMBENCH" 3 "$Date:$" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH" .SH "NAME" lmbench \- benchmarking toolbox .SH "SYNOPSIS" .B "#include ``lmbench.h''" .LP .B "typedef u_long iter_t" .LP .B "typedef (*benchmp_f)(iter_t iterations, void* cookie)" .LP .B "void benchmp(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie)" .LP .B "uint64 get_n()" .LP .B "void milli(char *s, uint64 n)" .LP .B "void micro(char *s, uint64 n)" .LP .B "void nano(char *s, uint64 n)" .lP .B "void mb(uint64 bytes)" .LP .B "void kb(uint64 bytes)" .SH "DESCRIPTION" Creating benchmarks using the .I lmbench timing harness is easy. Since it is so easy to measure performance using .I lmbench , it is possible to quickly answer questions that arise during system design, development, or tuning. For example, image processing .LP There are two attributes that are critical for performance, latency and bandwidth, and .I lmbench\'s timing harness makes it easy to measure and report results for both. Latency is usually important for frequently executed operations, and bandwidth is usually important when moving large chunks of data. .LP There are a number of factors to consider when building benchmarks. .LP The timing harness requires that the benchmarked operation be idempotent so that it can be repeated indefinitely. .LP The timing subsystem, .BR benchmp , is passed up to three function pointers. Some benchmarks may need as few as one function pointer (for .IR benchmark ). .TP .B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" measures the performance of .I benchmark repeatedly and reports the median result. .I benchmp creates .I parallel sub-processes which run .I benchmark in parallel. This allows lmbench to measure the system's ability to scale as the number of client processes increases. 
Each sub-process executes .I initialize before starting the benchmarking cycle with .I iterations set to 0. It will call .I initialize , .I benchmark , and .I cleanup with .I iterations set to the number of iterations in the timing loop several times in order to collect .I repetitions results. The calls to .I benchmark are surrounded by .I start and .I stop call to time the amount of time it takes to do the benchmarked operation .I iterations times. After all the benchmark results have been collected, .I cleanup is called with .I iterations set to 0 to cleanup any resources which may have been allocated by .I initialize or .IR benchmark . .I cookie is a void pointer to a hunk of memory that can be used to store any parameters or state that is needed by the benchmark. .TP .B "void benchmp_getstate()" returns a void pointer to the lmbench-internal state used during benchmarking. The state is not to be used or accessed directly by clients, but rather would be passed into .I benchmp_interval. .TP .B "iter_t benchmp_interval(void* state)" returns the number of times the benchmark should execute its benchmark loop during this timing interval. This is used only for weird benchmarks which cannot implement the benchmark body in a function which can return, such as the page fault handler. Please see .I lat_sig.c for sample usage. .TP .B "uint64 get_n()" returns the number of times .I loop_body was executed during the timing interval. .TP .B "void milli(char *s, uint64 n)" print out the time per operation in milli-seconds. .I n is the number of operations during the timing interval, which is passed as a parameter because each .I loop_body can contain several operations. .TP .B "void micro(char *s, uint64 n)" print the time per opertaion in micro-seconds. .TP .B "void nano(char *s, uint64 n)" print the time per operation in nano-seconds. .TP .B "void mb(uint64 bytes)" print the bandwidth in megabytes per second. .TP .B "void kb(uint64 bytes)" print the bandwidth in kilobytes per second. 
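.LP
Because each iteration of the
.I benchmark
loop body may perform several operations, the count passed to
.IR milli ,
.IR micro ,
or
.I nano
must be scaled by the number of operations per iteration.
For example, a variant of the
.B lrand48()
benchmark shown in the next section that performs ten calls per
iteration would report its result like this (a sketch, not part of
the distributed sources):
.sp
.nf
.ft CW
#include <stdlib.h>
#include "lmbench.h"

/* ten lrand48() calls per iteration of the timed loop */
void
benchmark_lrand48x10(iter_t iterations, void* cookie)
{
        while (iterations-- > 0) {
                lrand48(); lrand48(); lrand48(); lrand48(); lrand48();
                lrand48(); lrand48(); lrand48(); lrand48(); lrand48();
        }
}

int
main(int argc, char *argv[])
{
        benchmp(NULL, benchmark_lrand48x10, NULL, 0, 1, 0, TRIES, NULL);
        /* ten operations per iteration, so scale get_n() by ten */
        micro("lrand48()", 10 * get_n());
        exit(0);
}
.ft
.fi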
.SH "USING lmbench" Here is an example of a simple benchmark that measures the latency of the random number generator .BR lrand48() : .IP .B "#include ``lmbench.h''" .br .br .B void .br .B benchmark_lrand48(iter_t iterations, void* cookie) .B { .br .B " while(iterations-- > 0)" .br .B " lrand48();" .br .B } .br .br .B int .br .B "main(int argc, char *argv[])" .br .B { .br .B " benchmp(NULL, benchmark_lrand48, NULL, 0, 1, 0, TRIES, NULL);" .br .B " micro("lrand48()", get_n());" .br .B " exit(0);" .br .B } .br .LP Here is a simple benchmark that measures and reports the bandwidth of .BR bcopy : .IP .B "#include ``lmbench.h''" .br .br .B "#define MB (1024 * 1024) .br .B "#define SIZE (8 * MB)" .br .br .B "struct _state {" .br .B " int size;" .br .B " char* a;" .br .B " char* b;" .br .B "};" .br .br .B void .br .B initialize_bcopy(iter_t iterations, void* cookie) .B "{" .br .B " struct _state* state = (struct _state*)cookie;" .br .br .B " if (!iterations) return;" .br .B " state->a = malloc(state->size);" .br .B " state->b = malloc(state->size);" .br .B " if (state->a == NULL || state->b == NULL)" .br .B " exit(1);" .br .B "}" .br .br .B void .br .B benchmark_bcopy(iter_t iterations, void* cookie) .B "{" .br .B " struct _state* state = (struct _state*)cookie;" .br .br .B " while(iterations-- > 0)" .br .B " bcopy(state->a, state->b, state->size);" .br .B "}" .br .br .B void .br .B cleanup_bcopy(iter_t iterations, void* cookie) .B "{" .br .B " struct _state* state = (struct _state*)cookie;" .br .br .B " if (!iterations) return;" .br .B " free(state->a);" .br .B " free(state->b);" .br .B "}" .br .br .B int .br .B "main(int argc, char *argv[])" .br .B "{" .br .B " struct _state state;" .br .br .B " state.size = SIZE;" .br .B " benchmp(initialize_bcopy, benchmark_bcopy, cleanup_bcopy," .br .B " 0, 1, 0, TRIES, &state);" .br .B " mb(get_n() * state.size);" .br .B " exit(0);" .br .B "}" .br .LP A slightly more complex version of the .B bcopy benchmark might measure bandwidth as a function of memory size and parallelism. The main procedure in this case might look something like this: .IP .B int .br .B "main(int argc, char *argv[])" .br .B "{" .br .B " int size, par;" .br .B " struct _state state;" .br .br .B " for (size = 64; size <= SIZE; size <<= 1) {" .br .B " for (par = 1; par < 32; par <<= 1) {" .br .B " state.size = size;" .br .B " benchmp(initialize_bcopy, benchmark_bcopy," .br .B " cleanup_bcopy, 0, par, 0, TRIES, &state);" .br .B " fprintf(stderr, \%d\\t%d\\t\", size, par);" .br .B " mb(par * get_n() * state.size);" .br .B " }" .br .B " }" .br .B " exit(0);" .br .B "}" .SH "VARIABLES" There are three environment variables that can be used to modify the .I lmbench timing subsystem: ENOUGH, TIMING_O, and LOOP_O. .SH "FUTURES" Development of .I lmbench is continuing. .SH "SEE ALSO" lmbench(8), timing(3), reporting(3), results(3). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmbench.8������������������������������������������������������������������������0000664�0000764�0000764�00000013136�10425064646�015446� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LMBENCH 8 "$Date$" "(c)1994-2000 Larry McVoy and Carl Staelin" "LMBENCH" .SH NAME lmbench \- system benchmarks .SH DESCRIPTION .B lmbench is a series of micro benchmarks intended to measure basic operating system and hardware system metrics. The benchmarks fall into three general classes: bandwidth, latency, and ``other''. .LP Most of the .I lmbench benchmarks use a standard timing harness described in timing(3) and have a few standard options: .IR parallelism , .IR warmup , and .IR repetitions . .I Parallelism specifies the number of benchmark processes to run in parallel. This is primarily useful when measuring the performance of SMP or distributed computers and can be used to evaluate the system's performance scalability. .I Warmup is the number of minimum number of microseconds the benchmark should execute the benchmarked capability before it begins measuring performance. Again this is primarily useful for SMP or distributed systems and it is intended to give the process scheduler time to "settle" and migrate processes to other processors. By measuring performance over various .I warmup periods, users may evaulate the scheduler's responsiveness. .I Repetitions is the number of measurements that the benchmark should take. This allows lmbench to provide greater or lesser statistical strength to the results it reports. The default number of .I repetitions is 11. .SH BANDWIDTH MEASUREMENTS Data movement is fundemental to the performance on most computer systems. The bandwidth measurements are intended to show how the system can move data. The results of the bandwidth metrics can be compared but care must be taken to understand what it is that is being compared. The bandwidth benchmarks can be reduced to two main components: operating system overhead and memory speeds. The bandwidth benchmarks report their results as megabytes moved per second but please note that the data moved is \fBnot\fP necessarily the same as the memory bandwidth used to move the data. Consult the individual man pages for more information. .LP Each of the bandwidth benchmarks is listed below with a brief overview of the intent of the benchmark. .TP 14 bw_file_rd reading and summing of a file via the read(2) interface. .TP bw_mem_cp memory copy. .TP bw_mem_rd memory reading and summing. .TP bw_mem_wr memory writing. .TP bw_mmap_rd reading and summing of a file via the memory mapping mmap(2) interface. .TP bw_pipe reading of data via a pipe. .TP bw_tcp reading of data via a TCP/IP socket. 
.TP bw_unix reading data from a UNIX socket. .SH LATENCY MEASUREMENTS Control messages are also fundemental to the performance on most computer systems. The latency measurements are intended to show how fast a system can be told to do some operation. The results of the latency metrics can be compared to each other for the most part. In particular, the pipe, rpc, tcp, and udp transactions are all identical benchmarks carried out over different system abstractions. .LP Latency numbers here should mostly be in microseconds per operation. .TP 14 lat_connect the time it takes to establish a TCP/IP connection. .TP lat_ctx context switching; the number and size of processes is varied. .TP lat_fcntl fcntl file locking. .TP lat_fifo ``hot potato'' transaction through a UNIX FIFO. .TP lat_fs creating and deleting small files. .TP lat_pagefault the time it takes to fault in a page from a file. .TP lat_mem_rd memory read latency (accurate to the ~2-5 nanosecond range, reported in nanoseconds). .TP lat_mmap time to set up a memory mapping. .TP lat_ops basic processor operations, such as integer XOR, ADD, SUB, MUL, DIV, and MOD, and float ADD, MUL, DIV, and double ADD, MUL, DIV. .TP lat_pipe ``hot potato'' transaction through a Unix pipe. .TP lat_proc process creation times (various sorts). .TP lat_rpc ``hot potato'' transaction through Sun RPC over UDP or TCP. .TP lat_select select latency .TP lat_sig signal installation and catch latencies. Also protection fault signal latency. .TP lat_syscall non trivial entry into the system. .TP lat_tcp ``hot potato'' transaction through TCP. .TP lat_udp ``hot potato'' transaction through UDP. .TP lat_unix ``hot potato'' transaction through UNIX sockets. .TP lat_unix_connect the time it takes to establish a UNIX socket connection. .SH OTHER MEASUREMENTS .TP 14 mhz processor cycle time .TP tlb TLB size and TLB miss latency .TP line cache line size (in bytes) .TP cache cache statistics, such as line size, cache sizes, memory parallelism. .TP stream John McCalpin's stream benchmark .TP par_mem memory subsystem parallelism. How many requests can the memory subsystem service in parallel, which may depend on the location of the data in the memory hierarchy. .TP par_ops basic processor operation parallelism. .SH SEE ALSO bargraph(1), graph(1), lmbench(3), results(3), timing(3), bw_file_rd(8), bw_mem_cp(8), bw_mem_wr(8), bw_mmap_rd(8), bw_pipe(8), bw_tcp(8), bw_unix(8), lat_connect(8), lat_ctx(8), lat_fcntl(8), lat_fifo(8), lat_fs(8), lat_http(8), lat_mem_rd(8), lat_mmap(8), lat_ops(8), lat_pagefault(8), lat_pipe(8), lat_proc(8), lat_rpc(8), lat_select(8), lat_sig(8), lat_syscall(8), lat_tcp(8), lat_udp(8), lmdd(8), par_ops(8), par_mem(8), mhz(8), tlb(8), line(8), cache(8), stream(8) .SH ACKNOWLEDGEMENT Funding for the development of these tools was provided by Sun Microsystems Computer Corporation. .LP A large number of people have contributed to the testing and development of lmbench. .SH COPYING The benchmarking code is distributed under the GPL with additional restrictions, see the COPYING file. .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. 
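.SH EXAMPLE
As an illustration of the standard options, the null system call
benchmark from
.BR lat_syscall (8)
might be run with four parallel processes, a one-second
(1000000 microsecond) warmup, and the default eleven repetitions:
.sp
.ft CW
lat_syscall -P 4 -W 1000000 -N 11 null
.ft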
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmbench3.ms����������������������������������������������������������������������0000664�0000764�0000764�00000151211�07564162402�015775� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" This document is GNU groff -mgs -t -p -R -s .\" It will not print with normal troffs, it uses groff features, in particular, .\" long names for registers & strings. .\" Deal with it and use groff - it makes things portable. .\" .\" $X$ xroff -mgs -t -p -R -s $file .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr .VARPS .\" Define a page top that looks cool .\" HELLO CARL! To turn this off, s/PT/oldPT/ .de PT .\" .tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' .tl '''' .. .de lmPT .if \\n%>1 \{\ . sp -.1i . ps 14 . ft 3 . nr big 24 . nr space \\w'XXX' . nr titlewid \\w'\\*[title]' . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 . ce 1 \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] . ps . sp -.70 . ps 12 \\l'\\n[LL]u' . ft . ps .\} .. .\" Define a page bottom that looks cool .\" HELLO CARL! To turn this off, s/BT/oldBT/ .de BT .tl ''Page %'' .. .de lmBT . ps 9 \v'-1'\\l'\\n(LLu' . sp -1 . tl '\(co 2002 \\*[author]'\\*(DY'%' . ps .. .de SP . if t .sp .5 . if n .sp 1 .. .de BU . SP . ne 2 \(bu\ . if \\n[.$] \fB\\$1\fP\\$2 .. .nr FIGURE 0 .nr TABLE 0 .nr SMALL .25i .de TSTART . KF . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 . ps -1 . vs -1 .. .de TEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr TABLE \\n[TABLE]+1 . ce 1 \fBTable \\n[TABLE].\ \ \\$1\fP . SP . KE .. .de FEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr FIGURE \\n[FIGURE]+1 . ce 1 \fBFigure \\n[FIGURE].\ \ \\$1\fP . SP . KE .. 
.\" Configuration .nr PI 3n .nr HM 1i .nr FM 1i .nr PO 1i .if t .po 1i .nr LL 6.5i .if n .nr PO 0i .if n .nr LL 7.5i .nr PS 10 .nr VS \n(PS+1 .ds title Measuring scalability .ds author Carl Staelin .ds micro \(*m .ds lmbench \f(CWlmbench\fP .ds lmbench1 \f(CWlmbench1\fP .ds lmbench2 \f(CWlmbench2\fP .ds lmbench3 \f(CWlmbench3\fP .ds bcopy \f(CWbcopy\fP .ds benchmp \f(CWbenchmp\fP .ds bw_file_rd \f(CWbw_file_rd\fP .ds bw_mem \f(CWbw_mem\fP .ds bw_mmap_rd \f(CWbw_mmap_rd\fP .ds bw_pipe \f(CWbw_pipe\fP .ds bw_tcp \f(CWbw_tcp\fP .ds bw_udp \f(CWbw_udp\fP .ds bw_unix \f(CWbw_unix\fP .ds close \f(CWclose\fP .ds connect \f(CWconnect\fP .ds dd \f(CWdd\fP .ds execlp \f(CWexeclp\fP .ds execve \f(CWexecve\fP .ds exit \f(CWexit\fP .ds fcntl \f(CWfcntl\fP .ds fork \f(CWfork\fP .ds fstat \f(CWfstat\fP .ds gcc \f(CWgcc\fP .ds get_n \f(CWget_n\fP .ds getpid \f(CWgetpid\fP .ds getppid \f(CWgetppid\fP .ds gettime \f(CWgettime\fP .ds gettimeofday \f(CWgettimeofday\fP .ds kill \f(CWkill\fP .ds lat_connect \f(CWlat_connect\fP .ds lat_ctx \f(CWlat_ctx\fP .ds lat_dram_page \f(CWlat_dram_page\fP .ds lat_fcntl \f(CWlat_fcntl\fP .ds lat_fifo \f(CWlat_fifo\fP .ds lat_fs \f(CWlat_fs\fP .ds lat_http \f(CWlat_http\fP .ds lat_mem_rd \f(CWlat_mem_rd\fP .ds lat_mmap \f(CWlat_mmap\fP .ds lat_ops \f(CWlat_ops\fP .ds lat_pagefault \f(CWlat_pagefault\fP .ds lat_pipe \f(CWlat_pipe\fP .ds lat_proc \f(CWlat_proc\fP .ds lat_rpc \f(CWlat_rpc\fP .ds lat_select \f(CWlat_select\fP .ds lat_sem \f(CWlat_sem\fP .ds lat_sig \f(CWlat_sig\fP .ds lat_syscall \f(CWlat_syscall\fP .ds lat_tcp \f(CWlat_tcp\fP .ds lat_udp \f(CWlat_udp\fP .ds lat_unix \f(CWlat_unix\fP .ds lat_unix_connect \f(CWlat_unix_connect\fP .ds lat_usleep \f(CWlat_usleep\fP .ds line \f(CWline\fP .ds lmdd \f(CWlmdd\fP .ds lmdd \f(CWlmdd\fP .ds mb \f(CWmb\fP .ds memmove \f(CWmemmove\fP .ds mhz \f(CWmhz\fP .ds micro \f(CWmicro\fP .ds mmap \f(CWmmap\fP .ds nano \f(CWnano\fP .ds nanosleep \f(CWnanosleep\fP .ds open \f(CWopen\fP .ds par_mem \f(CWpar_mem\fP .ds par_ops \f(CWpar_ops\fP .ds pipe \f(CWpipe\fP .ds popen \f(CWpopen\fP .ds pselect \f(CWpselect\fP .ds read \f(CWread\fP .ds select \f(CWselect\fP .ds semop \f(CWsemop\fP .ds setitimer \f(CWsetitimer\fP .ds sh \f(CW/bin/sh\fP .ds stat \f(CWstat\fP .ds stream \f(CWstream\fP .ds system \f(CWsystem\fP .ds tlb \f(CWtlb\fP .ds uiomove \f(CWuiomove\fP .ds usleep \f(CWusleep\fP .ds write \f(CWwrite\fP .ds yield \f(CWyield\fP .\" References stuff .R1 accumulate sort A+DT database references-lmbench3 label-in-text bracket-label [ ] ", " .R2 .EQ delim $$ .EN .TL \s(14lmbench3: measuring scalability\s0 .AU \s+2\fR\*[author]\fP\s0 .AI \fI\s+2Hewlett-Packard Laboratories Israel\s0\fP .SP .AB \*[lmbench3] extends the \*[lmbench2] system to measure a system's performance under scalable load to make it possible to assess parallel and distributed computer performance with the same power and flexibility that \*[lmbench2] brought to uni-processor performance analysis. There is a new timing harness, \*[benchmp], designed to measure performance at specific levels of parallel (simultaneous) load, and most existing benchmarks have been converted to use the new harness. .SP \*[lmbench] is a micro-benchmark suite designed to focus attention on the basic building blocks of many common system applications, such as databases, simulations, software development, and networking. It is also designed to make it easy for users to create additional micro-benchmarks that can measure features, algorithms, or subsystems of particular interest to the user. 
.AE .if t .MC 3.05i .NH 1 Introduction .LP \*[lmbench] is a widely used suite of micro-benchmarks that measures important aspects of computer system performance, such as memory latency and bandwidth. Crucially, the suite is written in portable ANSI-C using POSIX interfaces and is intended to run on a wide range of systems without modification. .LP The benchmarks included in the suite were chosen because in the \*[lmbench] developer's experience, they each represent an aspect of system performance which has been crucial to an application's performance. Using this multi-dimensional performance analysis approach, it is possible to better predict and understand application performance because key aspects of application performance can often be understood as linear combinations of the elements measured by \*[lmbench] .[[ Brown97 .]]. .LP \*[lmbench3] extends the \*[lmbench] suite to encompass parallel and distributed system performance by measuring system performance under scalable load. This means that the user can specify the number of processes that will be executing the benchmarked feature in parallel during the measurements. It is possible to utilize this framework to develop benchmarks to measure distributed application performance, but it is primarily intended to measure the performance of multiple processes using the same system resource at the same time. .LP In general the benchmarks report either the latency or bandwidth of an operation or data pathway. The exceptions are generally those benchmarks that report on a specific aspect of the hardware, such as the processor clock rate, which is reported in MHz and nanoseconds. .LP \*[lmbench] consists of three major components: a timing harness, the individual benchmarks built on top of the timing harness, and the various scripts and glue that build and run the benchmarks and process the results. .NH 2 \*[lmbench] history .LP \*[lmbench1] was written by Larry McVoy while he was at Sun Microsystems. It focussed on two measures of system performance: latency and bandwidth. It measured a number of basic operating system functions, such as file system read/write bandwidth or file creation time. It also focussed a great deal of energy on measuring data transfer operations, such as \*[bcopy] and \*[pipe] latency and bandwidth as well as raw memory latency and bandwidth. .LP Shortly after the \*[lmbench1] paper .[[ McVoy96 .]] was published, Aaron Brown examined the \*[lmbench] benchmark suite and published a detailed critique of its strengths and weaknesses .[[ Brown97 .]]. Largely in response to these remarks, development of \*[lmbench2] began with a focus on improving the experimental design and statistical data analysis. The primary change was the development and adoption across all the benchmarks of a timing harness that incorporated loop-autosizing and clock resolution detection. In addition, each experiment was typically repeated eleven times with the median result reported to the user. .LP The \*[lmbench2] .[[ Staelin98 .]] timing harness was implemented through a new macro, BENCH(), that automatically manages nearly all aspects of accurately timing operations. For example, it automatically detects the minimal timing interval necessary to provide timing results within 1% accuracy, and it automatically repeats most experiments eleven times and reports the median result. 
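.LP
For illustration, a measurement of a simple system call in the
\*[lmbench2] harness looked roughly like the following (a sketch of
the macro interface, not an exact quotation of the sources):
.DS L
\f(CWBENCH(getppid(), 0);
micro("Simple syscall", get_n());\fP
.DE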
.LP \*[lmbench3] focussed on extending \*[lmbench]'s functionality along two dimensions: measuring multi-processor scalability and measuring basic aspects of processor micro-architecture. .LP An important feature of multi-processor systems is their ability to scale their performance. While \*[lmbench1] and \*[lmbench2] measure various important aspects of system performance, they cannot measure performance with more than one client process active at a time. Consequently, measuring performance of multi-processor and clustered systems as a function of scalable load was impossible using those tools. .LP \*[lmbench3] took the ideas and techniques developed in the earlier versions and extended them to create a new timing harness which can measure system performance under parallel, scalable loads. .LP \*[lmbench3] also includes a version of John McCalpin's STREAM benchmarks. Essentially the STREAM kernels were placed in the new \*[lmbench] timing harness. Since the new timing harness also measures scalability under parallel load, the \*[lmbench3] STREAM benchmarks include this capability automatically. .LP Finally, \*[lmbench3] includes a number of new benchmarks which measure various aspects of the processor architecture, such as basic operation latency and parallelism, to provide developers with a better understanding of system capabilities. The hope is that better informed developers will be able to better design and evaluate performance critical software in light of their increased understanding of basic system performance. .NH 1 Prior Work .LP Benchmarking is not a new field of endeavor. There are a wide variety of approaches to benchmarking, many of which differ greatly from that taken by \*[lmbench]. .LP One common form of benchmark is to take an important application or application and worklist, and to measure the time required to complete the entire task. This approach is particularly useful when evaluating the utility of systems for a single and well-known task. .LP Other benchmarks, such as SPECint, use a variation on this approach by measuring several applications and combining the results to predict overall performance. SPEChpc96 .[[ SPEChpc96 .]] extends this approach to the parallel and distributed domain by measuring the performance of a selected parallel applications built on top of MPI and/or PVM. .\" .LP .\" XXX Byte benchmark .LP Another variation takes the "kernel" of an important application and measures its performance, where the "kernel" is usually a simplification of the most expensive portion of a program. Dhrystone .[[ Weicker84 .]] is an example of this type of benchmark as it measures the performance of important matrix operations and was often used to predict system performance for numerical operations. .LP Banga developed a benchmark to measure HTTP server performance which can accurately measure server performance under high load .[[ Banga97 .]]. Due to the idiosyncracies of the HTTP protocol and TCP design and implementation, there are generally operating system limits on the rate at which a single system can generate independent HTTP requests. However, Banga developed a system which can scalably present load to HTTP servers in spite of this limitation .[[ Banga98 .]]. .LP John McCalpin's STREAM benchmark measures memory bandwidth during four common vector operations .[[ McCalpin95 .]]. It does not measure memory latency, and strictly speaking it does not measure raw memory bandwith although memory bandwidth is crucial to STREAM performance. 
More recently, STREAM has been extended to measure distributed application performance using MPI to measure scalable memory subsystem performance, particularly for multi-processor machines. .LP Prestor .[[ Prestor01 .]] and Saavedra .[[ Saavedra95 .]] have developed benchmarks which analyze memory subsystem performance. .LP Micro-benchmarking extends the "kernel" approach, by measuring the performance of operations or resources in isolation. \*[lmbench] and many other benchmarks, such as nfsstone .[[ Shein89 .]], measure the performance of key operations so users can predict performance for certain workloads and applications by combining the performance of these operations in the right mixture. .LP Saavedra .[[ Saavedra92 .]] takes the micro-benchmark approach and applies it to the problem of predicting application performance. They analyze applications or other benchmarks in terms of their ``narrow spectrum benchmarks'' to create a linear model of the application's computing requirements. They then measure the computer system's performance across this set of micro-benchmarks and use a linear model to predict the application's performance on the computer system. Seltzer .[[ Seltzer99 .]] applied this technique using the features measured by \*[lmbench] as the basis for application prediction. .LP Benchmarking I/O systems has proven particularly troublesome over the years, largely due to the strong non-linearities exhibited by disk systems. Sequential I/O provides much higher bandwidth than non-sequential I/O, so performance is highly dependent on the workload characteristics as well as the file system's ability to capitalize on available sequentiality by laying out data contiguously on disk. .LP I/O benchmarks have a tendency to age poorly. For example, IOStone .[[ Park90a .]], IOBench .[[ Wolman89 .]], and the Andrew benchmark .[[ Howard88 .]] used fixed size datasets, whose size was significant at the time, but which no longer measure I/O performance as the data can now fit in the processor cache of many modern machines. .LP The Andrew benchmark attempts to separately measure the time to create, write, re-read, and then delete a large number of files in a hierarchical file system. .LP Bonnie .[[ Bray90 .]] measures sequential, streaming I/O bandwidth for a single process, and random I/O latency for multiple processes. .LP Peter Chen developed an adaptive harness for I/O benchmarking .[[ Chen93d .]] .[[ Chen94a .]], which defines I/O load in terms of five parameters, uniqueBytes, sizeMean, readFrac, seqFrac, and processNum. The benchmark then explores the parameter space to measure file system performance in a scalable fashion. .LP Parkbench .[[ Parkbench .]] is a benchmark suite that can analyze parallel and distributed computer performance. It contains a variety of benchmarks that measure both aspects of system performance, such as communication overheads, and distributed application kernel performance. Parkbench contains benchmarks from both NAS .[[ NAS .]] and Genesis .[[ Glendinning94 .]]. .NH 1 Timing Harness .LP The first, and most crucial element in extending \*[lmbench2] so that it could measure scalable performance, was to develop a new timing harness that could accurately measure performance for any given load. Once this was done, then each benchmark would be migrated to the new timing harness. .LP The harness is designed to accomplish a number of goals: .IP 1. 
during any timing interval of any child it is guaranteed that all other child processes are also running the benchmark .IP 2. the timing intervals are long enough to average out most transient OS scheduler affects .IP 3. the timing intervals are long enough to ensure that error due to clock resolution is negligible .IP 4. timing measurements can be postponed to allow the OS scheduler to settle and adjust to the load .IP 5. the reported results should be representative and the data analysis should be robust .IP 6. timing intervals should be as short as possible while ensuring accurate results .LP Developing an accurate timing harness with a valid experimental design is more difficult than is generally supposed. Many programs incorporate elementary timing harnesses which may suffer from one or more defects, such as insufficient care taken to ensure that the benchmarked operation is run long enough to ensure that the error introduced by the clock resolution is insignificant. The basic elements of a good timing harness are discussed in Staelin .[[ Staelin98 .]]. .LP The new timing harness must also collect and process the timing results from all the child processes so that it can report the representative performance. It currently reports the median performance over all timing intervals from all child processes. It might perhaps be argued that it should report the median of the medians. .LP When running benchmarks with more than one child, the harness must first get a baseline estimate of performance by running the benchmark in only one process using the standard \*[lmbench] timing interval, which is often 5,000 microseconds. Using this information, the harness can compute the average time per iteration for a single process, and it uses this figure to compute the number of iterations necessary to ensure that each child runs for at least one second. .NH 2 Clock resolution .LP \*[lmbench] uses the \*[gettimeofday] clock, whose interface resolves time down to 1 microsecond. However, many system clock's resolution is only 10 milli-seconds, and there is no portable way to query the system to discover the true clock resolution. .LP The problem is that the timing intervals must be substantially larger than the clock resolution in order to ensure that the timing error doesn't impact the results. For example, the true duration of an event measured with a 10 milli-second clock can vary $+-$10 milli-seconds from the true time, assuming that the reported time is always a truncated version of the true time. If the clock itself is not updated precisely, the true error can be even larger. This implies that timing intervals on these systems should be at least 1 second. .LP However, the \*[gettimeofday] clock resolution in most modern systems is 1 microsecond, so timing intervals can as small as a few milli-seconds without incurring significant timing errors related to clock resolution. .LP Since there is no standard interface to query the operating system for the clock resolution, \*[lmbench] must experimentally determine the appropriate timing interval duration which provides results in a timely fashion with a negligible clock resolution error. .NH 2 Coordination .LP Developing a timing harness that correctly manages $N$ processes and accurately measures system performance over those same $N$ processes is significantly more difficult than simply measuring system performance with a single process because of the asynchronous nature of parallel programming. 
.LP In essence, the new timing harness needs to create $N$ jobs, and measure the average performance of the target subsystem while all $N$ jobs are running. This is a standard problem for parallel and distributed programming, and involves starting the child processes and then stepping through a handshaking process to ensure that all children have started executing the benchmarked operation before any child starts taking measurements. .TSTART .TS box tab (/) box expand ; c c l l . Parent/Child T{ \(bu start up P child processes T}/ T{ \(bu wait for P \fIready\fR signals T}/T{ \(bu run benchmark operation for a little while T} \(da/T{ \(bu send a \fIready\fR signal T} T{ \(bu on reciept of \fIready\fR signals, sleep for \fIwarmup\fR \*[micro]s T}/T{ \(bu run benchmark operation while polling for a \fIgo\fR signal T} T{ \(bu send \fIgo\fR signal to P children T}/\(da T{ \(bu wait for P \fIdone\fR signals T}/T{ \(bu on receipt of \fIgo\fR signal, begin timing benchmark operation T} \(da/T{ \(bu send a \fIdone\fR signal T} T{ \(bu one receipt of \fIdone\fR signals, iterate through children sending \fIresults\fR signal and gathering results T}/T{ \(bu run benchmark operation while polling for a \fIresults\fR signal T} T{ \(bu collate results T}/T{ \(bu on receipt of \fIresults\fR signal, send timing results and wait for \fIexit\fR signal T} T{ \(bu send \fIexit\fR signal T}/\(da /T{ \(bu exit T} .TE .TEND "Timing harness sequencing" .nr TABLEseq \n[TABLE] .LP Table \n[TABLEseq] shows how the parent and child processes coordinate their activities to ensure that all children are actively running the benchmark activity while any child could be taking timing measurements. .LP The reason for the separate "exit" signal is to ensure that all properly managed children are alive until the parent allows them to die. This means that any SIGCHLD events that occur before the "exit" signal indicate a child failure. .NH 2 Accuracy .LP The new timing harness also needs to ensure that the timing intervals are long enough for the results to be representative. The previous timing harness assumed that only single process results were important, and it was able to use timing intervals as short as possible while ensuring that errors introduced by the clock resolution were negligible. In many instances this meant that the timing intervals were smaller than a single scheduler time slice. The new timing harness must run benchmarked operations long enough to ensure that timing intervals are longer than a single scheduler time slice. Otherwise, you can get results which are complete nonsense. For example, running several copies of an \*[lmbench2] benchmark on a uni-processor machine will often report that the per-process performance with $N$ jobs running in parallel is equivalent to the performance with a single job running!\** .FS This was discovered by someone who naively attempted to parallelize \*[lmbench2] in this fashion, and I received a note from the dismayed developer describing the failed experiment. .FE .LP In addition, since the timing intervals now have to be longer than a single scheduler time slice, they also need to be long enough so that a single scheduler time slice is insignificant compared to the timing interval. Otherwise the timing results can be dramatically affected by small variations in the scheduler's behavior. 
.LP Currently \*[lmbench] does not measure the scheduler timeslice; the design blithely assumes that timeslices are generally on the order of 10-20ms, so one second timing intervals are sufficient. Some schedulers may utilize longer time slices, but this has not (yet) been a problem. .NH 2 Resource consumption .LP One important design goal was that resource consumption be constant with respect to the number of child processes. This is why the harness uses shared pipes to communicate with the children, rather than having a separate set of pipes to communicate with each child. An early design of the system utilized a pair of pipes per child for communication and synchronization between the master and slave processes. However, as the number of child processes grew, the fraction of system resources consumed by the harness grew and the additional system overhead could start to interfere with the accuracy of the measurements. .LP Additionally, if the master has to poll (\*[select]) $N$ pipes, then the system overhead of that operation also scales with the number of children. .NH 2 Pipe atomicity .LP Since all communication between the master process and the slave (child) processes is done via a set of shared pipes, we have to ensure that we never have a situation where the message can be garbled by the intermingling of two separate messages from two separate children. This is ensured by either using pipe operations that are guaranteed to be atomic on all machines, or by coordinating between processes so that at most one process is writing at a time. .LP The atomicity guarantees are provided by having each client communicate synchronization states in one-byte messages. For example, the signals from the master to each child are one-byte messages, so each child only reads a single byte from the pipe. Similarly, the responses from the children back to the master are also one-byte messages. In this way no child can receive partial messages, and no message can be interleaved with any other message. .LP However, using this design means that we need to have a separate pipe for each \fIbarrier\fR in the process, so the master uses three pipes to send messages to the children, namely: \fIstart_signal\fR, \fIresult_signal\fR, and \fIexit_signal\fR. If a single pipe was used for all three barrier events, then it is possible for a child to miss a signal, or if the signal is encoded into the message, then it is possible for a child to infinite loop pulling a signal off the pipe, recognizing that it has already received that signal so that it needs to push it back into the pipe, and then then re-receiving the same message it just re-sent. .LP However, all children share a single pipe to send data back to the master process. Usually the messages on this pipe are single-byte signals, such as \fIready\fR or \fIdone\fR. However, the timing data results need to be sent from the children to the master and they are (much) larger than a single-byte message. In this case, the timing harness sends a single-byte message on the \fIresult_signal\fR channel, which can be received by at most one child process. This child then knows that it has sole ownership of the response pipe, and it writes its entire set of timing results to this pipe. Once the master has received all of the timing results from a single child, it sends the next one-byte message on the \fIresult_signal\fR channel to gather the next set of timing results. 
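.LP
Purely as an illustration of this handshake (a sketch, not the actual
\*[lmbench] source, with invented names), the child side of the
protocol can be written in terms of one-byte pipe messages, where
\f(CWstart_rd\fP, \f(CWresult_rd\fP, and \f(CWexit_rd\fP are the read
ends of the three shared signal pipes and \f(CWresponse_wr\fP is the
shared response pipe:
.DS L
\f(CW#include <unistd.h>

/* Illustrative child-side handshake; single-byte messages are atomic. */
void
child_handshake(int start_rd, int result_rd, int exit_rd, int response_wr)
{
    char c = 'r';

    /* ... run the benchmarked operation briefly to warm up ... */
    write(response_wr, &c, 1);  /* one-byte "ready" */
    /* the real harness keeps benchmarking while it polls for "go" */
    read(start_rd, &c, 1);      /* "go": the timed interval begins */
    /* ... timed benchmark loop runs here ... */
    c = 'd';
    write(response_wr, &c, 1);  /* one-byte "done" */
    read(result_rd, &c, 1);     /* at most one child receives this byte */
    /* ... sole owner of response_wr: write the full timing results ... */
    read(exit_rd, &c, 1);       /* hold resources until "exit" */
}\fP
.DE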
.TSTART 1 .so lmbench3_signals.pic .FEND "Control signals" 1 .nr FIGUREsig \n[FIGURE] .LP The design of the signals is shown in Figure \n[FIGUREsig]. .NH 2 Benchmark initialization .LP By allowing the benchmark to specify an initialization routine that is run in the child processes, the new timing harness allows benchmarks to do either or both global initializations that are shared by all children and specific per-child initializations that are done independently by each child. Global initialization is done in the master process before the \*[benchmp] harness is called, so the state is preserved across the \*[fork] operations. Per-child initialization is done inside the \*[benchmp] harness by the optional initialization routine and is done after the \*[fork] operation. .LP Similarly, each benchmark is allowed to specify a cleanup routine that is run by the child processes just before exiting. This allows the benchmark routines to release any resources that they may have used during the benchmark. Most system resources would be automatically released on process exit, such as file descriptors and shared memory segments, but some resources such as temporary files might need to be explicitly released by the benchmark. .NH 2 Scheduler transients .LP Particularly on multi-processor systems, side-effects of process migration can dramatically affect program runtimes. For example, if the processes are all initially assigned to the same processor as the parent process, and the timing is done before the scheduler migrates the processes to other available processors, then the system performance will appear to be that of a uniprocessor. Similarly, if the scheduler is over-enthusiastic about re-assigning processes to processors, then performance will be worse than necessary because the processes will keep encountering cold caches and will pay exhorbitant memory access costs. .LP The first case is a scheduler transient, and users may not want to measure such transient phenomena if their primary interest is in predicting performance for long-running programs. Conversely, that same user would be extraordinarily interested in the second phenomena. The harness was designed to allow users to specify that the benchmarked processes are run for long enough to (hopefully) get the scheduler past the transient startup phase, so it can measure the steady-state behavior. .NH 2 Data analysis .LP Analyzing the data to produce representative results is a crucial step in the benchmarking process. \*[lmbench] generally reports the \fImedian\fP result for $11$ measurements. Most benchmarks report the results of a single measurement .[[ Howard88 .]], an average of several results .[[ McCalpin95 .]], or a trimmed mean .[[ Brown97 .]]. .\" XXX UNKNOWN: .\" .RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90 .LP Since \*[lmbench] is able to use timing intervals that are often smaller than a scheduler time slice when measuring single-process performance, the raw timing results are often severely skewed. Often most results cluster around a single value a small number of outliers with significantly larger values. The median is preferable to the mean when the data can be very skewed .[[ Jain91 .]]. Since the timing intervals are significantly longer when the desired load is larger than a single process, the results tend not to be as badly skewed. In these cases we could use the \fImean\fR instead, but we decide to use a uniform statistical framework, so we usually use the median. 
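.LP
As a concrete illustration (again, not the \*[lmbench] source),
selecting the median of the per-interval results amounts to sorting
them and taking the middle element:
.DS L
\f(CW#include <stdlib.h>

static int
cmp(const void *a, const void *b)
{
    unsigned long long x = *(const unsigned long long *)a;
    unsigned long long y = *(const unsigned long long *)b;
    return (x < y) ? -1 : (x > y);
}

/* median of n timing results, e.g. the 6th of 11 sorted values */
unsigned long long
median(unsigned long long *results, size_t n)
{
    qsort(results, n, sizeof(results[0]), cmp);
    return results[n / 2];
}\fP
.DE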
.LP In some instances, however, \*[lmbench] internally uses the \fIminimum\fP rather than the median, such as in \*[mhz]. In those instances, we are not trying to find the \fIrepresentative\fP value, but rather the \fIminimum\fP value. There are only a few sources of error which could cause the measured timing result to be shorter than the true elapsed time: the system clock being adjusted, or round-off error due to the clock resolution. The timing interval duration is set to ensure that the round-off error is bounded by 1% of the timing interval, and we blithely assume that people don't reset their system clocks while benchmarking their systems. .LP \*[lmbench] does not currently report any statistics representing measurement variation, such as the difference between the first and third quartiles. This is an enhancement under active consideration. .NH 1 Interface .LP Unfortunately, we had to move away from the macro-based timing harness used in \*[lmbench2] and migrate to a function-based system, because the macros were too large for some C pre-processors. .TSTART 1 .DS L \f(CWtypedef void (*bench_f)(iter_t iters, void* cookie); typedef void (*support_f)(void* cookie); extern void benchmp(support_f initialize, bench_f benchmark, support_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie); extern uint64 gettime(); extern uint64 get_n(); extern void nano(char* s, uint64 n); extern void micro(char* s, uint64 n); extern void mb(uint64 bytes);\fP .DE .FEND "Programming interface" 1 .nr FIGinterface \n[FIGURE] .LP Figure \n[FIGinterface] shows the key elements of the new timing harness and result reporting interface. A brief description of the \*[benchmp] parameters follows: .IP \fIenough\fR can be used to ensure that a timing interval is at least 'enough' microseconds in duration. For most benchmarks this should be zero, but some benchmarks have to run for more time due to startup effects or other transient behavior. .IP \fIparallel\fR is simply the number of instances of the benchmark that will be run in parallel on the system. .IP \fIwarmup\fR can be used to force the benchmark to run for \fIwarmup\fR microseconds before the system starts making timing measurements. Note that it is a lower bound, not a fixed value, since it is simply the time that the parent sleeps after receiving the "ready" signal from each child (and before it sends the "go" signal to the children). .IP \fIrepetitions\fR is the number of times the experiment should be repeated. The default is eleven. .IP \fIcookie\fR is a pointer that can be used by the benchmark writer to pass in configuration information, such as buffer size or other parameters needed by the inner loop. In \*[lmbench3] it is generally used to point to a structure containing the relevant configuration information. .LP \*[gettime] returns the median timing interval duration, while \*[get_n] returns the number of iterations executed during that timing interval. .LP \*[nano] and \*[micro] print the passed string followed by the latency in nanoseconds and microseconds, respectively. The latency is computed as $gettime()/n$, where $n$ is the passed parameter. The reason $n$ is passed as a parameter is that the benchmark can actually execute the operation of interest multiple times during a single iteration.
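.LP
As an illustration of these parameters, the following hypothetical benchmark
(it is not part of \*[lmbench3]; the structure, the routine names, and the
use of \fIbzero\fR as the measured operation are invented for this example)
passes its configuration through \fIcookie\fR, allocates a per-child buffer
in its initialization routine, and performs four operations per iteration,
so it scales the count passed to \*[micro] accordingly.
.DS L
\f(CW#include <stdlib.h>
#include <strings.h>
#include "bench.h"

typedef struct {
	int	nbytes;		/* configuration passed via cookie */
	char*	buf;		/* per-child buffer, allocated after fork */
} state_t;

void
init(void* cookie)
{
	state_t* state = (state_t*)cookie;
	state->buf = malloc(state->nbytes);
}

void
cleanup(void* cookie)
{
	state_t* state = (state_t*)cookie;
	free(state->buf);
}

void
bench(iter_t iters, void* cookie)
{
	state_t* state = (state_t*)cookie;
	while (iters-- > 0) {		/* four operations per iteration */
		bzero(state->buf, state->nbytes);
		bzero(state->buf, state->nbytes);
		bzero(state->buf, state->nbytes);
		bzero(state->buf, state->nbytes);
	}
}

int
main(int argc, char* argv[])
{
	state_t	state;

	state.nbytes = 64 * 1024;
	benchmp(init, bench, cleanup, 0, 1, 0, TRIES, &state);
	micro("64KB bzero", 4 * get_n());	/* four operations per iteration */
	return(0);
}\fP
.DE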
For example, the memory latency benchmarks typically repeat the memory load operation a hundred times inside the loop, so the actual number of operations is $100 times get_n()$, and it is this value that should be passed to \*[nano] or \*[micro]. .LP \*[mb] reports the bandwidth in MB/s when given the total number of bytes processed during the timing interval. Note that for scalable benchmarks that process $"size"$ bytes per iteration, the total number of bytes processed is $get_n() times parallel times "size"$. .TSTART 1 .DS L \f(CW#include "bench.h" void bench(iter_t iters, void* cookie) { while (iters-- > 0) { getppid(); } } int main(int argc, char* argv[]) { benchmp(NULL, bench, NULL, 0, 1, 0, TRIES, NULL); nano("getppid", get_n()); return(0); }\fP .DE .FEND "A sample benchmark" 1 .nr FIGsample \n[FIGURE] .LP Figure \n[FIGsample] shows a sample benchmark that measures the latency of the \*[getppid] system call using this timing harness. Since there is no setup or cleanup needed for this benchmark, the \fIinitialize\fR and \fIcleanup\fR parameters are NULL. The \fIbench\fR routine simply calls \*[getppid] as many times as requested, and the rest of the parameters, \fIenough\fR, \fIparallel\fR, \fIwarmup\fR, \fIrepetitions\fR, and \fIcookie\fR are given their default values. .NH 1 Benchmarks .LP \*[lmbench] contains a large number of micro-benchmarks that measure various aspects of hardware and operating system performance. The benchmarks generally measure latency or bandwidth, but some new benchmarks also measure instruction-level parallelism. .TSTART .TS center box tab (&); c c l & l . Name&Measures _ &\fBBandwidth\fR \fIbw_file_rd\fR&T{ \*[read] and then load into processor T} \fIbw_mem\fR&T{ read, write, and copy data to/from memory T} \fIbw_mmap_rd\fR&read from \*[mmap]'ed memory \fIbw_pipe\fR&\*[pipe] inter-process data copy \fIbw_tcp\fR&TCP inter-process data copy \fIbw_unix\fR&UNIX inter-process data copy _ &\fBLatency\fR lat_connect&TCP connection \fIlat_ctx\fR&T{ context switch via \*[pipe]-based ``hot-potato'' token passing T} lat_dram_page&T{ DRAM page open T} \fIlat_fcntl\fR&T{ \*[fcntl] file locking ``hot-potato'' token passing T} \fIlat_fifo\fR&T{ FIFO ``hot-potato'' token passing T} lat_fs&file creation and deletion lat_http&HTTP GET request latency \fIlat_mem_rd\fR&memory read \fIlat_mmap\fR&\*[mmap] operation \fIlat_ops\fR&T{ basic operations (\fIxor\fR, \fIadd\fR, \fImul\fR, \fIdiv\fR, \fImod\fR) on (relevant) basic data types (\fIint\fR, \fIint64\fR, \fIfloat\fR, \fIdouble\fR) T} \fIlat_pagefault\fR&page fault handler \fIlat_pipe\fR&\*[pipe] ``hot-potato'' token passing \fIlat_pmake\fR&T{ time to complete $N$ parallel jobs that each do $usecs$-worth of work T} \fIlat_proc\fR&T{ procedure call overhead and process creation using \*[fork], \*[fork] and \*[execve], and \*[fork] and \*[sh] T} \fIlat_rand\fR&T{ random number generator T} \fIlat_rpc\fR&SUN RPC procedure call \fIlat_select\fR&\*[select] operation \fIlat_sem\fR&T{ semaphore ``hot-potato'' token passing T} \fIlat_sig\fR&T{ signal handler installation and handling T} \fIlat_syscall\fR&T{ \*[open], \*[close], \*[getppid], \*[write], \*[stat], \*[fstat] T} \fIlat_tcp\fR&TCP ``hot-potato'' token passing \fIlat_udp\fR&UDP ``hot-potato'' token passing \fIlat_unix\fR&UNIX ``hot-potato'' token passing \fIlat_unix_connect\fR&UNIX socket connection \fIlat_usleep\fR&T{ \*[usleep], \*[select], \*[pselect], \*[nanosleep], \*[setitimer] timer resolution T} _ &\fBOther\fR disk&T{ zone bandwidths and seek times T} line&cache line
size lmdd&\fIdd\fR clone par_mem&memory subsystem ILP par_ops&basic operation ILP \fIstream\fR&STREAM clones tlb&TLB size .TE .TEND "\*[lmbench] micro-benchmarks" .nr TABLEbench \n[TABLE] .LP Table \n[TABLEbench] contains the full list of micro-benchmarks in \*[lmbench3]. Benchmarks that were converted to measure performance under scalable load are shown in italics, while the remaining benchmarks are shown in a normal typeface. A detailed description of most benchmarks can be found in .[[ McVoy96 .]]. .NH 1 Scaling Benchmarks .LP There are a number of issues associated with converting single-process benchmarks to scalable benchmarks with several independent processes, in addition to the various issues addressed by the timing harness. Many of the benchmarks consume system resources, such as memory or network bandwidth, and a careful assessment of the likely resource contention issues is necessary to ensure that the benchmarks measure important aspects of system performance and not artifacts of artificial resource contention. .LP For example, the Linux 2.2 and 2.4 kernels use a single lock to control access to the kernel data structures for a file. This means that multiple processes accessing that file will have their operations serialized by that lock. If one is interested in how well a system can handle multiple independent accesses to separate files, and if the child processes all access the same file, then this file sharing is an artificial source of contention with potentially dramatic effects on the benchmark results. .NH 2 File System .LP A number of the benchmarks measure aspects of file system performance, such as \*[bw_file_rd], \*[bw_mmap_rd], \*[lat_mmap], and \*[lat_pagefault]. It is not immediately apparent how these benchmarks should be extended to the parallel domain. For example, it may be important to know how file system performance scales when multiple processes are reading the same file, or when multiple processes are reading different files. The first case might be important for large, distributed scientific calculations, while the second might be more important for a web server. .LP However, for the operating system, the two cases are significantly different. When multiple processes access the same file, access to the kernel data structures for that file must be coordinated, and so contention and locking of those structures can impact performance, while this is less true when multiple processes access different files. .LP In addition, there are any number of issues associated with ensuring that the benchmarks are either measuring operating system overhead (e.g., that no I/O is actually done to disk), or actually measuring the system's I/O performance (e.g., that the data cannot be resident in the buffer cache). Especially with file-system-related benchmarks, it is very easy to develop benchmarks that compare apples and oranges (e.g., the benchmark includes the time to flush data to disk on one system, but only includes the time to flush a portion of data to disk on another system). .LP \*[lmbench3] allows the user to measure either case, controlled by a command-line switch. When measuring accesses to independent files, the benchmarks first create their own private copies of the file, one for each child process. Then each process accesses its private file. When measuring accesses to a single file, each child simply uses the designated file directly. .NH 2 Context Switching .LP Measuring context switching accurately is a difficult task.
\*[lmbench1] and \*[lmbench2] measured context switch times via a "hot-potato" approach using pipes connected in a ring. However, this experimental design heavily favors schedulers that do "hand-off" scheduling, since at most one process is active at a time. Consequently, it is not really a good benchmark for measuring scheduler overhead on multi-processor machines. .LP The design currently used in \*[lmbench3] is to create $N$ \*[lmbench2]-style process rings and to measure the context switch times with all $N$ rings running in parallel. This does extend the \*[lmbench2] context switch benchmark to a scalable form, but it still suffers from the same weaknesses. .LP One approach that was considered was to replace the ring with a star formation, so the master process would send tokens to each child and then wait for them all to be returned. This has the advantage that more than one process is active at a time, reducing the sensitivity to "hand-off" scheduling. However, this same feature can cause problems on a multi-processor system because several of the context switches and working set accesses can occur in parallel. .LP The design and methodology for measuring context switching and scheduler overhead need to be revisited so that they can more accurately measure performance for multi-processor machines. .NH 1 Stream .LP \*[lmbench3] includes a new micro-benchmark, \*[stream], which measures the performance of John McCalpin's STREAM benchmark kernels for both STREAM version 1 .[[ McCalpin95 .]] and version 2 .[[ McCalpin2002 .]]. This benchmark faithfully recreates each of the kernel operations from both STREAM benchmarks, and because of the powerful new timing harness it can easily measure memory system scalability. .TSTART .TS center box tab (|); c s s s s c | c | c s | c l | l | l | l | l . Stream _ Kernel|Code|Bytes|FL ||rd|wr|OPS _ COPY|$a[i]=b[i]$|8(+8)|8|0 SCALE|$a[i]=q times b[i]$|8(+8)|8|1 ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2(-1) .TE .TS center box tab (|); c s s s s c | c | c s | c l | l | l | l | l . Stream2 _ Kernel|Code|Bytes|FL ||rd|wr|OPS _ FILL|$a[i]=q$|0(+8)|8|0 COPY|$a[i]=b[i]$|8(+8)|8|0 DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2(-1) SUM|$sum=sum + a[i]$|8|0|1 .TE .TEND "Stream operations" .LP Table \n[TABLE] is based on McCalpin's tables .[[ McCalpin95 .]] .[[ McCalpin2002 .]] and shows the four kernels for each version of the \*[stream] benchmark. Note that the .I read columns include numbers in parentheses, which represent the average number of bytes read into the cache as a result of the write to that variable\**. .FS This number is independent of the cache line size because STREAM uses dense arrays, so the cost is amortized over the subsequent operations on the rest of the line. .FE Cache lines are almost invariably bigger than a single double, and so when a write miss occurs the cache will read the line from memory and then modify the selected bytes. Some vector instructions, such as SSE and 3DNow, can avoid this load by writing an entire cache line at once. .LP In addition, some architectures support multiply-add instructions, which can do both the multiply and add operations for TRIAD and DAXPY in a single operation, so the physical FLOPS count would be 1 for these kernels on those architectures. The numbers in parentheses in the .I FLOPS column reflect this reduction in the FLOPS count.
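.LP
The kernels themselves are simple vector loops.
The following sketch of the four STREAM version 1 kernels is included only to
make the table concrete; the array names follow the table, the array length
is arbitrary, and this is not the \*[lmbench] \*[stream] source.
.DS L
\f(CW#define N	(1 << 20)	/* illustrative array length */

static double	a[N], b[N], c[N];
static double	q = 3.0;

void copy(void)  { int i; for (i = 0; i < N; i++) a[i] = b[i]; }
void scale(void) { int i; for (i = 0; i < N; i++) a[i] = q * b[i]; }
void add(void)   { int i; for (i = 0; i < N; i++) a[i] = b[i] + c[i]; }
void triad(void) { int i; for (i = 0; i < N; i++) a[i] = b[i] + q * c[i]; }\fP
.DE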
.LP Following the STREAM bandwidth reporting conventions, the \*[lmbench] STREAM benchmarks report their results as bandwidths (MB/s) computed from the amount of data explicitly read or written by the benchmark. For example, \fIcopy\fR and \fIscale\fR copy data from one array to the other, so the bandwidth is computed from the amount of data read plus the amount of data written, or the sum of the two array sizes. Similarly, \fIsum\fR, \fItriad\fR, and \fIdaxpy\fR operate on three arrays, so the amount of data transferred is the sum of the sizes of the three arrays. Note that the actual amount of data that is transferred by the system may be larger because in the write path the cache may need to fetch (read) the cache line before a portion of it is overwritten by dirty data. .NH 1 Unscalable benchmarks .LP There are a number of benchmarks which either did not make sense for scalable load, such as \*[mhz], or which could not be extended to measure scalable load due to other constraints, such as \*[lat_connect]. .LP \*[mhz] measures the processor clock speed, which is not a scalable feature of the system, so it doesn't make any sense to create a version of it that measures scalable performance. .LP \*[lat_connect], on the other hand, measures the latency of connecting to a TCP socket. TCP implementations have a timeout on sockets, and there is generally a fixed-size queue for sockets in the TIMEOUT state. This means that once the queue has been filled by a program connecting and closing sockets as fast as possible, all new socket connections have to wait TIMEOUT seconds. Needless to say, this gives no insight into the latency of socket creation per se, but is rather a boring artifact. Since the \*[lmbench2] version of the benchmark can run for very short periods of time, it generally does not run into this problem and is able to correctly measure TCP connection latency. .LP Any scalable version of the benchmark needs each copy to run for at least a second, and there are $N$ copies creating connections as fast as possible, so it would essentially be guaranteed to run into the TIMEOUT problem. Consequently, \*[lat_connect] was not enhanced to measure scalable performance. .LP \*[lat_fs] has not yet been parallelized because of the difficulty in measuring file creation and file deletion times in the new timing harness. The timing harness assumes that it can ask the benchmarked operation to be repeated as many times as necessary. This would mean that the file creation benchmark could create any number of new files of a given size, which could well fill up the file system. The real problem lies in the file deletion benchmark. In order to delete files of a given size, they must have been created before the benchmark begins. However, the number of files is not known in advance, so the benchmark would have a difficult time ensuring that it has created enough files. .LP The benchmarks that measure aspects of memory-subsystem micro-architecture, \*[lat_dram_page], \*[line], \*[par_mem], and \*[tlb], were not parallelized because the multiple processes' memory access patterns would likely interfere with one another. For example, in \*[lat_dram_page], those accesses which were supposed to be to open DRAM pages could well be accessing closed DRAM pages, invalidating the benchmark. .LP \*[lmdd] was not parallelized because it is supposed to be a clone of \*[dd], and it wasn't clear what a parallel form of \*[dd] would look like.
.NH 1 Results .LP The results presented here were obtained using \*[lmbench] version 3.0-a2 under Linux 2.4.18-6mdk on a two processor 450MHz PIII running a stock Mandrake 8.2 Linux 2.4.18 kernel. .TSTART .TS center box tab (&); c | c s l | l | l. Benchmark&Latency ($mu$s) _ &1 process&2 processes _ null call&0.79&0.81 null I/O&1.39&2.39 stat&9.26&25.9 open/close&11.7&27.1 select (TCP)&55.3&58.6 signal install&1.89&1.95 signal handler&6.34&7.21 fork process&793.&868. exec process&2474&2622 sh process&24.K&25.K pipe&17.7&23.3 unix socket&51.6&37.6 UDP&70.2&70.6 TCP&91.2&92.3 rpc (UDP)&120.0&120.4 rpc (TCP)&157.1&159.1 .TE .TEND "Latency results" .nr TABLElatency \n[TABLE] .TSTART .TS center box tab (&); c | c s l | l | l. Benchmark&Bandwidth (MB/s) _ &1 process&2 processes _ pipe&155&268 unix socket&142&179 TCP&57.5&57.8 bcopy(libc)&134&175 bcopy(hand)&144&174 memory read&319&486 memory write&199&202 STREAM copy&288.68&367.99 STREAM scale&290.39&369.08 STREAM sum&337.75&415.54 STREAM triad&246.90&380.09 STREAM2 fill&198.96&276.28 STREAM2 copy&288.55&359.93 STREAM2 daxpy&318.98&493.79 STREAM2 sum&354.03&512.05 .TE .TEND "Bandwidth results" .nr TABLEbandwidth \n[TABLE] .TSTART .TS center box tab (&); c | c s s l | l | l | l. Benchmark&Load _ &1&2&2clone _ bw_file_rd&151.04&266.74&273.51 bw_mmap_rd&316.08&480.02&482.57 lat_mmap&615&878&786 lat_pagefault&2.9802&3.9159&3.4589 .TE .TEND "File bandwidth results" .nr TABLEfile \n[TABLE] .LP Table \n[TABLElatency] shows the latency of various system and communication operations for both 1 and 2 process loads, while Table \n[TABLEbandwidth] shows the bandwidth of various data operations and Table \n[TABLEfile] shows how various file system operations scale. Table \n[TABLEfile] shows system performance with one process, two processes sharing the same file, and two processes accessing their own files. 
.TSTART 1 .G1 label left "Latency (ns)" label bottom "Memory size (MB)" coord x 0.0004,32 y 5,300 log x draw solid 0.00049 6.680 0.00098 6.683 0.00195 6.680 0.00293 6.680 0.00391 6.681 0.00586 6.681 0.00781 6.681 0.00977 6.684 0.01172 6.683 0.01367 6.690 0.01562 6.725 0.01758 48.977 0.01953 49.051 0.02148 49.043 0.02344 49.025 0.02539 48.889 0.02734 48.880 0.02930 48.902 0.03125 49.020 0.03516 49.043 0.03906 48.904 0.04297 49.044 0.04688 49.027 0.05078 49.046 0.05469 48.889 0.05859 49.018 0.06250 49.012 0.07031 49.025 0.07812 49.030 0.08594 48.936 0.09375 49.042 0.10156 49.022 0.10938 48.889 0.11719 49.073 0.12500 48.998 0.14062 49.043 0.15625 49.125 0.17188 49.160 0.18750 49.113 0.20312 49.123 0.21875 48.991 0.23438 49.045 0.25000 49.184 0.28125 49.971 0.31250 57.735 0.34375 72.668 0.37500 79.106 0.40625 77.612 0.43750 78.764 0.46875 88.636 0.50000 104.024 1.00000 179.817 1.50000 182.297 2.00000 182.043 2.50000 182.902 3.00000 183.130 3.50000 184.333 4.00000 182.868 5.00000 183.319 6.00000 183.208 7.00000 183.688 8.00000 183.871 10.00000 183.659 12.00000 183.583 14.00000 183.773 16.00000 183.828 18.00000 183.894 20.00000 183.933 30.00000 183.971 new dashed 0.00049 6.811 0.00098 6.815 0.00195 6.825 0.00293 6.807 0.00391 6.803 0.00586 6.822 0.00781 6.826 0.00977 6.825 0.01172 6.922 0.01367 6.825 0.01562 6.866 0.01758 49.954 0.01953 49.989 0.02148 50.021 0.02344 50.019 0.02539 50.003 0.02734 50.085 0.02930 50.000 0.03125 50.187 0.03516 49.988 0.03906 50.032 0.04297 49.986 0.04688 50.186 0.05078 50.196 0.05469 50.107 0.05859 50.087 0.06250 49.983 0.07031 50.092 0.07812 50.135 0.08594 50.057 0.09375 50.188 0.10156 65.950 0.10938 55.614 0.11719 54.328 0.12500 61.700 0.14062 59.710 0.15625 52.637 0.17188 82.911 0.18750 74.304 0.20312 72.371 0.21875 78.124 0.23438 74.577 0.25000 96.374 0.28125 110.708 0.31250 97.832 0.34375 103.006 0.37500 129.292 0.40625 140.816 0.43750 165.255 0.46875 164.632 0.50000 170.912 1.00000 233.968 1.50000 285.445 2.00000 241.341 2.50000 263.436 3.00000 273.101 3.50000 269.926 4.00000 233.626 5.00000 222.305 6.00000 293.832 7.00000 238.863 8.00000 245.026 10.00000 282.297 12.00000 239.152 14.00000 274.218 16.00000 226.299 18.00000 284.183 20.00000 224.596 30.00000 236.416 "1 process" at 5,165 "2 processes" at 0.3,280 .G2 .FEND "Memory subsystem performance" 1 .nr FIGUREmem \n[FIGURE] .LP Figure \n[FIGUREmem] shows the memory latency curves with 32 byte strides for one and two process loads versus memory size. .NH 1 Conclusion .LP \*[lmbench] is a useful, portable micro-benchmark suite designed to measure important aspects of system performance. \*[lmbench3] adds a number of important extensions, such as the ability to measure system scalability. .LP The benchmarks are available via ftp from: .IP .I "http://ftp.bitmover.com/lmbench" .NH 1 Acknowledgments .LP Many people have provided invaluable help and insight into the benchmarks. We especially thank: Eric Anderson \s-1(HP)\s0, Bruce Chapman \s-1(SUN)\s0, Larry McVoy \s-1(BitMover)\s0, David Mosberger \s-1(HP)\s0, Wayne Scott \s-1(BitMover)\s0, John Wilkes \s-1(HP)\s0, and Mitch Wright \s-1(HP)\s0. .LP We would also like to thank all of the people that have run the benchmark and contributed their results; none of this would have been possible without their assistance. .LP Our thanks to all of the free software community for tools that were used during this project. 
.\" .R1 .\" bibliography references-lmbench3 .\" .R2 .\"******************************************************************** .\" Redefine the IP paragraph format so it won't insert a useless line .\" break when the paragraph tag is longer than the indent distance .\" .de @IP .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) .par*start \\n[\\n[.ev]:ai] 0 .if !'\\$1'' \{\ . \" Divert the label so as to freeze any spaces. . di par*label . in 0 . nf \&\\$1 . di . in . fi . chop par*label . ti -\\n[\\n[.ev]:ai]u . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c . el \{\ \\*[par*label] .\". br . \} . rm par*label .\} .. .\"******************************************************************** .\" redefine the way the reference tag is printed so it is enclosed in .\" square brackets .\" .de ref*end-print .ie d [F .IP "[\\*([F]" 2 .el .XP \\*[ref*string] .. .\"******************************************************************** .\" Get journal number entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-N .ref*field N "" ( ) .. .\"******************************************************************** .\" Get journal volume entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-V .ref*field V , "" "" "" .. .\"******************************************************************** .\" Get the date entry right. Should not be enclosed in parentheses. .\" .de ref*add-D .ref*field D "," .. .\" References .[ $LIST$ .] .\" .so bios ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmbench3_arch.fig����������������������������������������������������������������0000664�0000764�0000764�00000010155�07416411431�017114� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#FIG 3.2 Landscape Center Inches Letter 100.00 Single -2 1200 2 6 900 1425 2100 2400 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 900 1950 2100 1950 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 900 2025 2100 2025 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1350 1950 1350 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1500 1950 1500 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1650 1950 1650 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1800 1950 1800 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1950 1950 1950 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1200 1950 1200 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1050 1950 1050 2100 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 900 1425 2100 1425 2100 2400 900 2400 900 1425 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 900 2100 2100 2100 4 0 0 50 0 0 12 0.0000 4 135 480 1275 1575 Cache\001 -6 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 150 525 3450 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 300 750 300 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 600 750 600 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 900 750 900 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 150 2625 2250 2625 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 3075 75 3450 75 3450 300 
3075 300 3075 75 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2550 300 2550 525 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 2400 75 2775 75 2775 300 2400 300 2400 75 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 1950 75 2325 75 2325 300 1950 300 1950 75 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 3225 300 3225 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2100 300 2100 525 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 225 75 825 75 825 300 225 300 225 75 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 975 75 1575 75 1575 300 975 300 975 75 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1275 300 1275 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 525 300 525 525 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 600 2775 1800 2775 1800 3450 600 3450 600 2775 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1125 2625 1125 2775 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2775 525 2775 750 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1950 750 1950 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 2775 975 2775 1275 1500 1275 1500 1425 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 2925 1350 3375 1350 3375 1575 2925 1575 2925 1350 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 3000 900 3150 900 3150 1350 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 3150 1575 3150 1725 2100 1725 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 898 1940 675 1875 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 4 2175 1950 2250 1950 2250 2100 2175 2100 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2475 2025 2250 2025 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1500 2400 1500 2625 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 2625 750 3000 750 3000 975 2625 975 2625 750 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 1875 750 2250 750 2250 975 1875 975 1875 750 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 1125 750 1500 750 1500 975 1125 975 1125 750 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 675 750 1050 750 1050 975 675 975 675 750 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 225 750 600 750 600 975 225 975 225 750 4 0 0 50 0 0 12 0.0000 4 135 375 300 225 ALU\001 4 0 0 50 0 0 12 0.0000 4 135 270 75 450 bus\001 4 0 0 50 0 0 12 0.0000 4 135 150 3150 225 fn\001 4 0 0 50 0 0 12 0.0000 4 15 135 2850 225 ...\001 4 0 0 50 0 0 12 0.0000 4 180 1710 1725 450 floating point registers\001 4 0 0 50 0 0 12 0.0000 4 135 150 2475 225 f1\001 4 0 0 50 0 0 12 0.0000 4 135 150 2025 225 f0\001 4 0 0 50 0 0 12 0.0000 4 135 345 1050 225 FPU\001 4 0 0 50 0 0 12 0.0000 4 135 600 900 2925 memory\001 4 0 0 50 0 0 12 0.0000 4 135 300 2700 900 MA\001 4 0 0 50 0 0 12 0.0000 4 180 1500 1350 1275 physical addressing\001 4 0 0 50 0 0 12 0.0000 4 135 765 150 1875 cache line\001 4 0 0 50 0 0 12 0.0000 4 180 900 2550 2025 set (2-way)\001 4 0 0 50 0 0 12 0.0000 4 135 330 3000 1500 TLB\001 4 0 0 50 0 0 12 0.0000 4 180 915 2325 2625 memory bus\001 4 0 0 50 0 0 12 0.0000 4 90 150 1950 900 rn\001 4 0 0 50 0 0 12 0.0000 4 15 135 1575 900 ...\001 4 0 0 50 0 0 12 0.0000 4 135 150 1200 900 r2\001 4 0 0 50 0 0 12 0.0000 4 135 150 750 900 r1\001 4 0 0 50 0 0 12 0.0000 4 135 150 300 900 r0\001 4 0 0 50 0 0 12 0.0000 4 180 1245 975 675 integer registers\001 �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmbench3_signals.fig�������������������������������������������������������������0000664�0000764�0000764�00000006121�07415306241�017636� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#FIG 3.2 Landscape Center Inches Letter 100.00 Single -2 1200 2 6 225 1575 1050 2025 2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 225 1800 375 1800 375 1950 225 1950 225 1800 2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 225 1575 375 1575 375 1725 225 1725 225 1575 4 0 0 50 0 0 12 0.0000 4 180 600 450 1725 working\001 4 0 0 50 0 0 12 0.0000 4 180 465 450 1950 timing\001 -6 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2025 300 2025 1725 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 2250 300 2250 1575 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 750 525 2250 525 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 750 825 2250 825 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 1575 675 750 675 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 1800 975 750 975 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 750 1125 2250 1125 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1800 300 1800 1875 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 3000 600 2250 600 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 1575 675 3000 675 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 3000 900 2250 900 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 1800 1200 3000 1200 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 3000 1275 2250 1275 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 2025 1350 3000 1350 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1500 750 1650 600 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1950 1425 2100 1275 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 1500 75 2325 75 2325 300 1500 300 1500 75 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 2 1575 300 1575 2025 2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 1 1 1.00 60.00 120.00 2025 1350 750 1350 2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 150 525 750 525 750 975 150 975 150 525 2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 150 675 750 675 750 825 150 825 150 675 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 150 225 750 225 750 1350 150 1350 150 225 2 2 2 1 0 0 50 0 3 3.000 0 0 -1 0 0 5 3000 600 3600 600 3600 1200 3000 1200 3000 600 2 2 2 1 0 0 50 0 10 3.000 0 0 -1 0 0 5 3000 675 3600 675 3600 900 3000 900 3000 675 2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 3000 225 3600 225 3600 1425 3000 1425 3000 225 4 0 0 50 0 0 12 0.0000 4 150 480 1650 225 parent\001 4 0 0 50 0 1 12 0.0000 4 180 420 900 525 ready\001 4 0 0 50 0 1 12 0.0000 4 135 360 900 825 done\001 4 0 0 50 0 1 12 0.0000 4 135 495 1200 975 results\001 4 0 0 50 0 1 12 0.0000 4 180 1020 825 1125 timing results\001 4 0 0 50 0 0 12 0.0000 4 135 450 3075 375 child1\001 4 0 0 50 0 1 12 0.0000 4 135 495 2325 1200 results\001 4 0 0 50 0 1 12 0.0000 4 180 420 2550 600 ready\001 4 0 0 50 0 1 12 0.0000 4 135 360 2550 900 done\001 4 0 0 50 0 1 12 0.0000 4 135 165 1350 675 go\001 4 0 0 50 0 1 12 0.0000 4 135 300 1275 1350 exit\001 4 0 0 50 0 0 12 0.0000 4 105 360 1650 2025 start\001 4 0 0 50 0 0 12 0.0000 4 135 690 2325 1575 response\001 4 0 0 50 0 0 12 0.0000 4 135 285 2100 1725 exit\001 4 0 0 50 0 0 12 0.0000 4 135 435 1875 1875 result\001 4 0 0 50 0 0 12 0.0000 4 135 450 225 375 child0\001 
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/lmdd.8���������������������������������������������������������������������������0000664�0000764�0000764�00000006652�07045412511�014753� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH LMDD 8 "$Date$" "(c)1994 Larry McVoy" "LMBENCH" .SH NAME lmdd \- move io for performance and debugging tests .SH SYNOPSIS .B lmdd [ .IB option = value ] .\|.\|. .SH DESCRIPTION .B lmdd copies a specified input file to a specified output with possible conversions. This program is primarily useful for timing I/O since it prints out the timing statistics after completing. .SH OPTIONS .TP 15 .BI if= name Input file is taken from .IR name ; .I internal is the default. .I internal is a special file that acts like Sun's .IR /dev/zero , i.e., it provides a buffer of zeros without doing a system call to get them. .sp .5 The following file names are taken to mean the standard input: .IR - , .IR 0 , or .IR stdin . .TP .BI of= name Output file is taken from .IR name ; .I internal is the default. .I internal is a special file that acts like .IR /dev/null , without doing a system call to get rid of the data. .sp .5 The following file names are taken to mean the standard output: .IR - , .IR 1 , or .IR stdout . .sp .5 The following file names are taken to mean the standard error: .IR 2 , or .IR stderr . .TP .BI bs= n Input and output block size .I n bytes (default 8192). Note that this is different from dd(1), it has a 512 byte default. Also note that the block size can be followed by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), respectively. .TP .BI ipat= n If .B n is non zero, expect a known pattern in the file (see opat). Mismatches will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is a sequence of 4 byte integers with the first 0, second 1, and so on. The default is not to check for the pattern. .TP .BI opat= n If .B n is non zero, generate a known pattern on the output stream. Used for debugging file system correctness. The default is not to generate the pattern. .TP .BI mismatch= n If .B n is non zero, stop at the first mismatched value. Used with ipat. .TP .BI skip= n Skip .IR n "" input blocks before starting copy. .TP .BI fsync= n If .I n is non-zero, call fsync(2) on the output file before exiting or printing timing statistics. .TP .BI sync= n If .I n is non-zero, call sync(2) before exiting or printing timing statistics. .TP .BI rand= n This argument, by default off, turns on random behavior. The argument is not a flag, it is a size, that size is used as the upper bound for the seeks. Also note that the block size can be followed by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), .TP .BI flush= n If .I n is non-zero and mmap(2) is available, call msync(2) to invalidate the output file. 
This flushes the file to disk so that you don't have unmount/mount. It is not as good as mount/unmount because it just flushes file pages - it misses the indirect blocks which are still cached. Not supported on all systems, compile time option. .TP .BI rusage= n If .I n is non-zero, print rusage statistics as well as timing statistics. Not supported on all systems, compile time option. .TP .BI count= n Copy only .IR n "" input records. .SH EXAMPLES .LP This is the most common usage, the intent is to measure disk performance. The disk is a spare partition mounted on /spare. .sp .nf .in +4 # mount /spare # lmdd if=internal of=/spare/XXX count=1000 fsync=1 7.81 MB in 3.78 seconds (2.0676 MB/sec) : Flush cache # umount /spare # mount /spare # lmdd if=/spare/XXX of=internal 7.81 MB in 2.83 seconds (2.7611 MB/sec) .in .sp .fi .SH AUTHOR Larry McVoy, lm@sun.com ��������������������������������������������������������������������������������������lmbench-3.0-a9/doc/mem.pic��������������������������������������������������������������������������0000664�0000764�0000764�00000343266�07045412511�015222� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.PS .ps 8 .vs 11 .ft CB [ # Variables, tweak these. xtick = 2.000000 # width of an X tick xlower = 8.000000 # where the xtick start xupper = 24.000000 # upper range of graph xn = 8 # number of ticks to do ytick = 50.000000 # width of an Y tick ylower = 0.000000 # where the ytick start yupper = 500.000000 # upper range of graph yn = 10 # number of ticks to do xsize = 1.75 # width of the graph ysize = 1.75 # height of the graph yscale = ysize / (yupper - ylower) # scale data to paper xscale = xsize / (xupper - xlower) # scale data to paper tick = 0.10000000000000000555 # distance towards numbers gthk = .1 # thickness of grid lines thk = .75 # thickness of data lines qthk = 2.0 # thickness of quartile lines vs = .15 # works for 10 point fonts # Draw the graph borders and tick marks O: box thick 1.5 ht ysize wid xsize j = ylower t = tick * .5 for i = 0 to yn by 1 do { ys = j - ylower g = ys * yscale line thick 1.5 from O.sw + (-tick, g) to O.sw + (0, g) if (i < yn) then { y2 = (ys + (ytick / 2)) * yscale line thick .5 from O.sw + (-t, y2) to O.sw + (0, y2) } if (yupper - ylower > 999) then { sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) } else { if (yupper - ylower > 10) then { sprintf("%.0f", j) rjust at O.sw + (-.2, g - .02) } else { if (yupper - ylower > 1) then { sprintf("%.1f", j) rjust at O.sw + (-.2, g - .02) } else { sprintf("%.2f", j) rjust at O.sw + (-.2, g - .02) }}} j = j + ytick } j = xlower for i = 0 to xn by 1 do { xs = j - xlower g = xs * xscale line thick 1.5 from O.sw + (g, -tick) to O.sw + (g, 0) if (i < xn) then { x2 = (xs + (xtick / 2)) * xscale line thick .5 from O.sw + (x2, 0) to O.sw + (x2, -t) } if (xupper - xlower > 999) then { sprintf("%.0f", j) at O.sw + (g, -.25) } else { if (xupper - xlower > 10) then { sprintf("%.0f", j) at O.sw + (g, -.25) } else { if (xupper - xlower > 1) then { sprintf("%.1f", j) at O.sw + (g, -.25) } else { sprintf("%.2f", j) at O.sw + (g, -.25) }}} j = j + xtick } # DATASET: stride=8, MARK 0 [ "\(ci" ] at O.sw + \ (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) [ box invis ht .05 
wid .05 ] at O.sw + \ (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.322574277531574083 - xlower), yscale * (18 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.585116379985436197 - xlower), yscale * (23 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.807157053169248684 - xlower), yscale * (27 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.000461588562853166 - xlower), yscale * (29 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.170078880706594049 - xlower), yscale * (29 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.32183575944998033 - xlower), yscale * (29 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.459137803548600232 - xlower), yscale * (29 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.585116379985436197 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.700404205210695352 - xlower), yscale * (29 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.807157053169248684 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.907044474872799711 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.170078880706594049 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.32183575944998033 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.459473587337127398 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.585116379985436197 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.700404205210695352 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box 
invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.807420872323479699 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.906798260171138182 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.1698737047066885 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.322020424415466522 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.459473587337129175 - xlower), yscale * (30 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.584962500721157852 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.700404205210695352 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.807420872323479699 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.906921372774437629 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17 - xlower), yscale * (31 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.169976296354082734 - xlower), yscale * (32 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.32192809488736529 - xlower), yscale * (32 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.459473587337129175 - xlower), yscale * (33 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.584962500721157852 - xlower), yscale * (32 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.700475230197337595 - xlower), yscale * (33 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.807354922057605506 - xlower), yscale * (34 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.906921372774437629 - xlower), yscale * (34 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18 - xlower), yscale * (34 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.169925001442312151 - xlower), yscale * (35 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.32192809488736529 - xlower), yscale * (35 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.459431618637296424 - xlower), yscale * (36 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.584962500721157852 - xlower), yscale * (36 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at 
O.sw + \ (xscale * (18.70043971814109085 - xlower), yscale * (37 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.807354922057605506 - xlower), yscale * (46 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.90689059560851959 - xlower), yscale * (40 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (19 - xlower), yscale * (39 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20 - xlower), yscale * (89 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20.584962500721154299 - xlower), yscale * (91 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.321928094887361738 - xlower), yscale * (91 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.584962500721154299 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.807354922057605506 - xlower), yscale * (89 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.169925001442312151 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.321928094887361738 - xlower), yscale * (91 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.459431618637296424 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.584962500721157852 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.70043971814109085 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.807354922057605506 - xlower), yscale * (90 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.90689059560851959 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (23 - xlower), yscale * (92 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(ci" ] at O.sw + \ (xscale * (23 - xlower), yscale * (92 - ylower)) # DATASET: stride=16, MARK 1 [ "\(sq" ] at O.sw + \ (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.005069369678396995 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.997689839312798199 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
# [Machine-generated plot data elided for readability.  This span is the body of
#  a generated pic graph: each dataset is a polyline whose points are emitted as
#      [ box invis ht .05 wid .05 ] at O.sw + \
#              (xscale * (x - xlower), yscale * (y - ylower))
#      line thick thk from 2nd last [].c to last [].c
#  with the dataset's marker glyph drawn at its first and last points.  In this
#  span x runs from about 9 to 23 and y from about 10 to 345.  The datasets and
#  their final points are:
#
#  (continuation of the preceding dataset)  marker "\(sq"                       ends at (23, 167)
#  DATASET: stride=32,   MARK 2             marker "\(*D"                       ends at (23, 340)
#  DATASET: stride=64,   MARK 3             marker "\(mu"                       ends at (23, 327)
#  DATASET: stride=128,  MARK 4             marker "\s+4\(bu\s0"                ends at (23, 321)
#  DATASET: stride=512,  MARK 5             marker "box ht .07 wid .07 fill 1"  ends at (23, 322)
#  DATASET: stride=1024, MARK 6             marker "\s+2\(pl\s0"                ends at (23, 326)
#  DATASET: stride=2048, MARK 7             marker "\s+4\(**\s0"                ends at (23, 345)
#  DATASET: stride=4096, MARK 0             marker "\(ci"                       starts at (9.005, 10); its points continue below]
* (10.997689839312798199 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (11.585116379985436197 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.322574277531574083 - xlower), yscale * (39 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.585116379985436197 - xlower), yscale * (39 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.807157053169248684 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.170078880706594049 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.700404205210695352 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.807157053169248684 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.32183575944998033 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.585116379985436197 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw 
+ \ (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.322020424415466522 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.700404205210695352 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.169976296354082734 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.700475230197337595 - xlower), yscale * (56 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18 - xlower), yscale * (147 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.169925001442312151 - xlower), yscale * (146 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.32192809488736529 - xlower), yscale * (146 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.459431618637296424 - xlower), yscale * (145 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.584962500721157852 - xlower), yscale * (145 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.70043971814109085 - xlower), yscale * (145 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
(18.807354922057605506 - xlower), yscale * (157 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.90689059560851959 - xlower), yscale * (162 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (19 - xlower), yscale * (160 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20 - xlower), yscale * (379 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20.584962500721154299 - xlower), yscale * (380 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21 - xlower), yscale * (378 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.321928094887361738 - xlower), yscale * (380 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.584962500721154299 - xlower), yscale * (382 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.807354922057605506 - xlower), yscale * (381 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22 - xlower), yscale * (381 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.169925001442312151 - xlower), yscale * (381 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.321928094887361738 - xlower), yscale * (382 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.459431618637296424 - xlower), yscale * (382 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.584962500721157852 - xlower), yscale * (382 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.70043971814109085 - xlower), yscale * (382 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.807354922057605506 - xlower), yscale * (383 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.90689059560851959 - xlower), yscale * (383 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (23 - xlower), yscale * (385 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(ci" ] at O.sw + \ (xscale * (23 - xlower), yscale * (385 - ylower)) # DATASET: stride=8192, MARK 1 [ "\(sq" ] at O.sw + \ (xscale * (9.0050693696783969955 - xlower), yscale * (11 - ylower)) [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.001384322870542576 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
(12.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.999538263705169072 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.322574277531574083 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.585116379985436197 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.807157053169248684 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.000461588562853166 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.459137803548600232 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.585116379985436197 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.170078880706594049 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.459473587337127398 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.700404205210695352 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.807420872323479699 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.1698737047066885 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
(16.322020424415466522 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.459473587337129175 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.584962500721157852 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.700404205210695352 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.807420872323479699 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.584962500721157852 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.700475230197337595 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18 - xlower), yscale * (232 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.169925001442312151 - xlower), yscale * (231 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.32192809488736529 - xlower), yscale * (231 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.459431618637296424 - xlower), yscale * (232 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.584962500721157852 - xlower), yscale * (232 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.70043971814109085 - xlower), yscale * (230 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.807354922057605506 - xlower), yscale * (240 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.90689059560851959 - xlower), yscale * (246 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (19 - xlower), yscale * (246 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20 - xlower), 
yscale * (445 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20.584962500721154299 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21 - xlower), yscale * (441 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.321928094887361738 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.584962500721154299 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.807354922057605506 - xlower), yscale * (450 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22 - xlower), yscale * (451 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.169925001442312151 - xlower), yscale * (443 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.321928094887361738 - xlower), yscale * (441 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.459431618637296424 - xlower), yscale * (442 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.584962500721157852 - xlower), yscale * (446 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.70043971814109085 - xlower), yscale * (452 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.807354922057605506 - xlower), yscale * (452 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.90689059560851959 - xlower), yscale * (453 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (23 - xlower), yscale * (453 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(sq" ] at O.sw + \ (xscale * (23 - xlower), yscale * (453 - ylower)) # DATASET: stride=16384, MARK 2 [ "\(*D" ] at O.sw + \ (xscale * (9.0050693696783969955 - xlower), yscale * (10 - ylower)) [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.005069369678396995 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (10.997689839312798199 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (11.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.001384322870542576 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.585116379985436197 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (12.999538263705169072 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.322574277531574083 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
(13.585116379985436197 - xlower), yscale * (11 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (13.807157053169248684 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.000461588562853166 - xlower), yscale * (10 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.170078880706594049 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.32183575944998033 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.459137803548600232 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.585116379985436197 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.700404205210695352 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.807157053169248684 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (14.907044474872799711 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.170078880706594049 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.32183575944998033 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.459473587337127398 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.585116379985436197 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.700404205210695352 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.807420872323479699 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (15.906798260171138182 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.1698737047066885 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.322020424415466522 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.459473587337129175 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.584962500721157852 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * 
(16.700404205210695352 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.807420872323479699 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (16.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.169976296354082734 - xlower), yscale * (54 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.32192809488736529 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.459473587337129175 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.584962500721157852 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.700475230197337595 - xlower), yscale * (53 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.807354922057605506 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (17.906921372774437629 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.169925001442312151 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.32192809488736529 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.459431618637296424 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.584962500721157852 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.70043971814109085 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.807354922057605506 - xlower), yscale * (55 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (18.90689059560851959 - xlower), yscale * (72 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (19 - xlower), yscale * (243 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20 - xlower), yscale * (432 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (20.584962500721154299 - xlower), yscale * (445 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21 - xlower), yscale * (445 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.321928094887361738 - xlower), yscale * (447 - ylower)) 
line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.584962500721154299 - xlower), yscale * (448 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (21.807354922057605506 - xlower), yscale * (448 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.169925001442312151 - xlower), yscale * (450 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.321928094887361738 - xlower), yscale * (447 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.459431618637296424 - xlower), yscale * (450 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.584962500721157852 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.70043971814109085 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.807354922057605506 - xlower), yscale * (449 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (22.90689059560851959 - xlower), yscale * (452 - ylower)) line thick thk from 2nd last [].c to last [].c [ box invis ht .05 wid .05 ] at O.sw + \ (xscale * (23 - xlower), yscale * (452 - ylower)) line thick thk from 2nd last [].c to last [].c [ "\(*D" ] at O.sw + \ (xscale * (23 - xlower), yscale * (452 - ylower)) # DATASET: stride=16384, MARK 3 # DATASET: stride=16384, MARK 4 .ps 8 .vs 8 "8KB" "cache" at O.sw + .35,.32 arrow thick 2 wid .07 down .15 from O.sw + .35,.20 ".5MB" "cache" at O.sw + .85,.50 arrow thick 2 wid .07 down .15 from O.sw + .85,.38 "Main" "mem" at O.e - .25,.15 arrow thick 2 wid .07 up .15 from O.e - .25,0 .vs .ps # Xaxis title. "\s+2log2(Array size)\s0" rjust at O.se - (0, .6) # Yaxis title (Latency in nanoseconds) .ps +2 .vs -1 "L" "a" "t" "e" "n" "c" "y" " " "i" "n" at O.w - (.95, 0) "n" "a" "n" "o" "s" "e" "c" "o" "n" "d" "s" at O.w - (.75, 0) .ps .vs # Graph title. "\s+2DEC alpha@182mhz memory latencies\s0" at O.n + (-.5, .3) # Title. 
#[ "\(ci" ] at O.ne + (.25, - 0 * vs) #"stride=8" ljust at last [].e + (.1, 0) #[ "\(sq" ] at O.ne + (.25, - 1 * vs) #"stride=16" ljust at last [].e + (.1, 0) #[ "\(*D" ] at O.ne + (.25, - 2 * vs) #"stride=32" ljust at last [].e + (.1, 0) #[ "\(mu" ] at O.ne + (.25, - 3 * vs) #"stride=64" ljust at last [].e + (.1, 0) #[ "\s+4\(bu\s0" ] at O.ne + (.25, - 4 * vs) #"stride=128" ljust at last [].e + (.1, 0) #[ box ht .07 wid .07 fill 1 ] at O.ne + (.25, - 5 * vs) #"stride=512" ljust at last [].e + (.1, 0) #[ "\s+2\(pl\s0" ] at O.ne + (.25, - 6 * vs) #"stride=1024" ljust at last [].e + (.1, 0) #[ "\s+4\(**\s0" ] at O.ne + (.25, - 7 * vs) #"stride=2048" ljust at last [].e + (.1, 0) #[ "\(ci" ] at O.ne + (.25, - 8 * vs) #"stride=4096" ljust at last [].e + (.1, 0) #[ "\(sq" ] at O.ne + (.25, - 9 * vs) #"stride=8192" ljust at last [].e + (.1, 0) #[ "\(*D" ] at O.ne + (.25, - 10 * vs) #"stride=16384" ljust at last [].e + (.1, 0) ] .ft .ps .PE ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/memhier-color.d������������������������������������������������������������������0000664�0000764�0000764�00000004043�07477577460�016671� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������frame invis ht 1.5 wid 2.5 left solid bot solid label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 label bot "Size (MB)" label left "Latency (ns)" coord log log ticks bottom out at 0.000512 "512", 0.001024 "", 0.002048 "", 0.004096 "", 0.008192 "8K", 0.016384 "", 0.032768 "", 0.065536 "", 0.098304 "96K", 0.131072 "", 0.262144 "", 0.524288 "", 1.048576 "1M", 2.097152 "", 4.194304 "", 8.388608 "", 16.777216 "", 33.554432 "32M" draw dotted 0.000512 4.042 0.008192 4.046 0.010240 8.873 0.012288 12.085 0.016384 16.097 0.032768 16.103 0.065536 19.908 0.098304 20.622 0.114688 29.808 0.131072 37.724 0.196608 47.561 0.262144 52.134 0.524288 66.410 1.048576 74.897 1.310720 153.075 1.572864 198.678 2.097152 264.935 3.145728 333.862 4.194304 366.109 8.388608 370.522 33.554432 370.682 "Colored" ljust at 1.572864, 222.789 draw solid 0.000512 4.042 0.000640 4.043 0.000768 4.044 0.000896 4.043 0.001024 4.043 0.001280 4.044 0.001536 4.044 0.001792 4.044 0.002048 4.041 0.002560 4.044 0.003072 4.045 0.003584 4.044 0.004096 4.045 0.005120 4.046 0.006144 4.047 0.007168 4.048 0.008192 4.048 0.010240 8.872 0.012288 12.079 0.014336 14.379 0.016384 16.097 0.020480 16.104 0.024576 16.117 0.028672 16.114 0.032768 16.106 0.040960 16.110 0.049152 16.123 0.057344 18.062 0.065536 19.179 0.081920 97.039 0.098304 84.011 0.114688 81.764 0.131072 79.122 0.163840 82.634 0.196608 108.550 0.229376 104.530 0.262144 119.771 0.327680 111.317 0.393216 131.057 0.458752 143.902 0.524288 173.323 0.655360 197.268 0.786432 219.736 0.917504 224.743 1.048576 249.878 1.310720 287.157 1.572864 302.857 1.835008 315.170 2.097152 329.874 2.621440 347.418 3.145728 357.183 3.670016 362.297 4.194304 365.720 5.242880 369.345 33.554432 370.296 "Malloc'ed" rjust at 0.458752, 219.736 
���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/memhier-line.d�������������������������������������������������������������������0000664�0000764�0000764�00000001171�07477577460�016501� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������frame invis ht 1.5 wid 2.5 left solid bot solid label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 label bot "Line Size (Bytes)" label left "Latency (ns)" coord log log ticks bottom out from 8 to 512 by *4 ticks bottom out from 8 to 512 by *2 "" draw solid 8 7.247 16 10.909 32 16.788 64 17.083 128 16.272 256 16.721 512 16.129 "L1" rjust above at 512, 16.129 draw solid 8 22.853 16 41.496 32 78.712 64 141.658 128 139.119 256 138.446 512 137.902 "L2" rjust above at 512, 137.902 draw solid 8 51.529 16 98.915 32 193.614 64 372.230 128 371.689 256 371.486 512 371.486 "L3" rjust above at 512, 371.486 �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/memhier-tlb.d��������������������������������������������������������������������0000664�0000764�0000764�00000012125�07564162402�016313� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������frame invis ht 1.5 wid 2.5 left solid bot solid label top "\fBalpha Linux 2.2.16-3\fR" down 0.3 label bot "Pages" label left "Latency (ns)" coord log log draw dotted 1 4.042 2 4.047 3 4.043 4 4.044 5 4.043 6 4.043 7 4.045 8 4.044 9 4.044 10 4.044 11 4.044 12 4.044 13 4.044 14 4.044 15 4.045 16 4.046 17 4.047 18 4.046 19 4.046 20 4.047 21 4.048 22 4.046 23 4.047 24 4.047 25 4.048 26 4.048 27 4.048 28 4.048 29 4.048 30 4.049 31 4.048 32 4.049 33 4.049 34 4.049 35 4.049 36 4.049 37 4.049 38 4.049 39 4.071 40 4.070 41 4.070 42 4.070 43 4.070 44 4.070 45 4.070 46 4.069 47 4.070 48 4.070 49 4.071 50 4.070 51 4.070 52 4.069 53 4.048 54 4.049 55 4.069 56 4.049 57 4.049 58 4.070 59 4.048 60 4.050 61 4.070 62 4.050 63 4.048 64 4.066 65 4.048 66 4.050 67 4.069 68 4.048 69 4.049 70 4.069 71 4.049 72 4.049 73 4.069 74 4.071 75 4.071 76 4.069 77 4.071 78 4.071 79 4.069 80 4.069 81 4.069 82 4.069 83 4.069 84 4.070 85 4.070 86 4.069 87 4.070 88 4.070 89 4.071 90 4.071 91 4.070 92 4.070 93 4.072 94 4.070 95 4.049 96 4.049 97 4.070 98 4.049 99 4.050 100 4.071 101 
4.050 102 4.048 103 4.049 104 4.048 105 4.048 106 4.048 107 4.049 108 4.048 109 4.048 110 4.048 111 4.048 112 4.048 113 4.051 114 4.048 115 4.069 116 4.050 117 4.048 118 4.048 119 4.048 120 4.054 121 4.054 122 4.048 123 4.050 124 4.049 125 4.048 126 4.049 127 4.048 128 4.049 129 4.260 130 4.446 131 4.647 132 4.802 133 4.978 134 5.148 135 5.321 136 5.490 137 5.653 138 5.816 139 5.980 140 6.138 141 7.370 256 7.068 "Packed" rjust above at 246, 7.370 draw solid 1 4.042 2 4.042 3 4.042 4 4.042 5 4.042 6 4.043 7 4.042 8 4.042 9 4.042 10 4.042 11 4.042 12 4.042 13 4.043 14 4.042 15 4.041 16 4.042 17 4.043 18 4.042 19 4.042 20 4.043 21 4.043 22 4.046 23 4.044 24 4.043 25 4.043 26 4.044 27 4.042 28 4.041 29 4.044 30 4.043 31 4.044 32 4.044 33 4.044 34 4.044 35 4.044 36 4.045 37 4.044 38 4.044 39 4.044 40 4.042 41 4.043 42 4.042 43 4.044 44 4.044 45 4.044 46 4.045 47 4.044 48 4.051 49 4.044 50 4.044 51 4.043 52 4.042 53 4.045 54 4.044 55 4.042 56 4.044 57 4.049 58 4.046 59 4.045 60 4.045 61 4.045 62 4.047 63 4.045 64 39.263 65 39.209 66 39.163 67 39.488 68 39.473 69 39.752 70 39.710 71 39.651 72 39.605 73 39.606 74 39.522 75 47.264 76 39.490 77 40.007 78 39.945 79 39.900 80 39.891 81 47.525 82 39.819 83 40.051 84 39.993 85 40.556 86 40.487 87 40.470 88 40.396 89 40.623 90 40.565 91 40.497 92 41.640 93 53.333 94 40.866 95 40.823 96 46.649 97 40.723 98 40.739 99 40.896 100 40.826 101 41.257 102 41.462 103 41.192 104 41.150 105 41.309 106 41.267 107 41.471 108 46.722 109 41.819 110 41.742 111 46.823 112 41.691 113 41.592 114 41.554 115 41.736 116 41.712 117 46.795 118 43.811 119 41.940 120 52.439 121 42.053 122 42.025 123 43.049 124 42.302 125 42.431 126 42.403 127 42.346 128 42.496 129 43.304 130 42.394 131 42.591 132 43.344 133 46.852 134 43.398 135 47.048 136 43.622 137 46.991 138 42.750 139 42.892 140 43.915 141 47.368 142 52.607 143 46.635 144 43.154 145 43.198 146 43.866 147 43.205 148 47.229 149 44.179 150 47.845 151 44.228 152 45.044 153 47.489 154 44.559 155 52.694 156 44.713 157 48.325 158 43.963 159 47.580 160 53.114 161 48.816 162 48.765 163 46.131 164 49.539 165 51.761 166 48.149 167 49.600 168 44.871 169 49.938 170 47.790 171 47.698 172 48.453 173 45.148 174 55.011 175 45.250 176 45.917 177 51.219 178 48.819 179 45.335 180 48.083 181 58.405 182 48.727 183 46.855 184 46.712 185 54.348 186 46.814 187 48.785 188 49.653 189 51.982 190 51.728 191 46.027 192 52.139 193 53.446 194 46.605 195 52.417 196 52.008 197 47.167 198 50.892 199 54.935 200 46.870 201 48.752 202 46.438 203 50.100 204 48.546 205 49.406 206 48.250 207 48.192 208 49.371 209 50.398 210 52.615 211 49.973 212 58.927 213 51.122 214 47.716 215 51.216 216 53.270 217 49.865 218 50.324 219 49.916 220 49.336 221 56.814 222 50.417 223 50.910 224 55.038 225 61.760 226 53.135 227 53.262 228 50.561 229 48.315 230 49.193 231 53.704 232 53.386 233 61.107 234 49.641 235 49.387 236 51.842 237 52.700 238 49.340 239 52.748 240 57.290 241 49.655 242 50.643 243 52.568 244 52.457 245 54.264 246 59.484 247 52.176 248 52.697 249 63.909 250 56.820 251 52.252 252 62.305 253 51.512 254 54.730 255 51.264 256 52.391 "Word/Page" rjust at 80, 52.391 
�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/memhier.ms�����������������������������������������������������������������������0000664�0000764�0000764�00000153277�07564162402�015746� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" This document is GNU groff -mgs -t -p -R -s .\" It will not print with normal troffs, it uses groff features, in particular, .\" long names for registers & strings. .\" Deal with it and use groff - it makes things portable. .\" .\" $X$ xroff -mgs -t -p -R -s $file .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr .VARPS .\" Define a page top that looks cool .\" HELLO CARL! To turn this off, s/PT/oldPT/ .de PT .tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' .. .de lmPT .if \\n%>1 \{\ . sp -.1i . ps 14 . ft 3 . nr big 24 . nr space \\w'XXX' . nr titlewid \\w'\\*[title]' . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 . ce 1 \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] . ps . sp -.70 . ps 12 \\l'\\n[LL]u' . ft . ps .\} .. .\" Define a page bottom that looks cool .\" HELLO CARL! To turn this off, s/BT/oldBT/ .de BT .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' .. .de lmBT . ps 9 \v'-1'\\l'\\n(LLu' . sp -1 . tl '\(co 2001 \\*[author]'\\*(DY'%' . ps .. .de SP . if t .sp .5 . if n .sp 1 .. .de BU . SP . ne 2 \(bu\ . if \\n[.$] \fB\\$1\fP\\$2 .. .nr FIGURE 0 .nr TABLE 0 .nr SMALL .25i .de TSTART . KF . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 . ps -1 . vs -1 .. .de TEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr TABLE \\n[TABLE]+1 . ce 1 \fBTable \\n[TABLE].\ \ \\$1\fP . SP . KE .. .de FEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr FIGURE \\n[FIGURE]+1 . ce 1 \fBFigure \\n[FIGURE].\ \ \\$1\fP . SP . KE .. 
.\" Configuration .nr PI 3n .nr HM 1i .nr FM 1i .nr PO 1i .if t .po 1i .nr LL 6.5i .if n .nr PO 0i .if n .nr LL 7.5i .nr PS 10 .nr VS \n(PS+1 .ds title Micro-architecture analysis .ds author Carl Staelin .ds lmbench \f(CWlmbench\fP .ds lmbench1 \f(CWlmbench1\fP .ds lmbench2 \f(CWlmbench2\fP .ds lmbench3 \f(CWlmbench3\fP .ds bcopy \f(CWbcopy\fP .ds connect \f(CWconnect\fP .ds execlp \f(CWexeclp\fP .ds exit \f(CWexit\fP .ds fork \f(CWfork\fP .ds gcc \f(CWgcc\fP .ds getpid \f(CWgetpid\fP .ds getpid \f(CWgetpid\fP .ds gettimeofday \f(CWgettimeofday\fP .ds kill \f(CWkill\fP .ds lat_mem_rd \f(CWlat_mem_rd\fP .ds lat_ops \f(CWlat_ops\fP .ds lmdd \f(CWlmdd\fP .ds memmove \f(CWmemmove\fP .ds mmap \f(CWmmap\fP .ds par_mem \f(CWpar_mem\fP .ds par_ops \f(CWpar_ops\fP .ds popen \f(CWpopen\fP .ds read \f(CWread\fP .ds stream \f(CWstream\fP .ds system \f(CWsystem\fP .ds uiomove \f(CWuiomove\fP .ds write \f(CWwrite\fP .ds yield \f(CWyield\fP .\" References stuff .de RN \"Reference Name: .RN $1 -- prints the reference prettily .\" [\s-2\\$1\s+2]\\$2 [\s-1\\$1\s0]\\$2 .. .\" .R1 .\" sort A+DT .\" database references .\" label-in-text .\" label A.nD.y-2 .\" bracket-label \*([. \*(.] ", " .\" .R2 .EQ delim $$ .EN .TL \s(14Micro-architecture analysis\s0 .AU \s+2\fR\*[author]\fP\s0 .AI \fI\s+2Hewlett-Packard Laboratories Israel\s0\fP .SP .AB \*[lmbench] version 3 includes a number of new micro-benchmarks that analyze specific aspects of system micro-architecture, such as instruction level parallelism, the cache hierarchy and TLB. .LP There are new benchmarks to measure instruction level parallelism, such as the effectiveness of overlapped memory accesses or arithmetic operations. There are other new benchmarks to measure various aspects of the architecture, such as the cache line size(s), TLB size, and latency costs for basic arithmetic operations. \*[lmbench] can identify the number of caches, and the size, line size, and available parallelism for each cache. It can also measure the effective TLB size. .AE .if t .MC 3.05i .NH 1 Introduction .LP \*[lmbench] version 3 includes a variety of new benchmarks designed to measure and analyze various aspects of memory system design and performance. The most important aspect of memory subsystem performance is typically the memory hierarchy, the number and size of caches. Other important aspects include the cache line size, TLB, and memory parallelism. .LP There are any number of aspects of a computer's micro-architecture that can impact a program's performance, such as the design of the memory hierarchy and the basic performance of the various arithmetic units. .LP All of the new benchmarks were added to \*[lmbench] because the author needed them to help guide his design decisions in one or more projects over the last few years. For example, \*[lat_ops] was added because the author was trying to decide whether a particular image processing algorithm should be implemented using integer or floating point arithmetic. Floating point arithmetic was preferred for a variety of reasons, but it was feared that floating point arithmetic would be prohibitively expensive compared to integer operations. By quickly building \*[lat_ops] the author was able to verify that the floating point performance should be no worse than integer performance. .LP Memory speeds have not kept pace with the dizzying pace of processor performance improvements. The result has been a steady increase in the relative cost of memory accesses, when measured in terms of instructions or clock ticks. 
For example, a 2GHz processor with 200ns memory latency would wait roughly 400 clock cycles for a single memory access. .LP To alleviate memory bottlenecks, architects use cache memory to reduce the average memory latency. Typically there are between one and three caches in modern memory subsystems. A rule of thumb is that each step down the memory hierarchy results in at least a doubling of memory latency and at least a doubling of the cache size. .LP The details of the memory hierarchy design can have a significant impact on application performance .RN Whaley98 , but unfortunately developers frequently cannot predict the exact configuration of machines which will run their software. Additionally, many developers are even unaware of the architectural details of their own machines. .LP One hope is that by providing a portable ANSI-C tool, developers may be better informed about the architectural possibilities provided by their own machines, and they may develop more efficient software which can automatically utilize features of the particular hardware based on information provided by these utilities. .LP For example, .RN Staelin02c proposes variations on familiar data structures which take advantage of the increased memory parallelism afforded by modern processors to increase performance by as much as 50%. .LP Before explaining the various algorithms and experimental methods for determining various aspects of the memory hierarchy design, we first give a short tutorial on memory system design. Then we describe the basic techniques used in analyzing the memory hierarchy, and how they neutralize or measure various subsystems or features of the memory system. Finally, we describe in more detail the specific algorithms used to measure the various aspects of the memory subsystem. .NH 1 Computer Architecture Primer .LP A processor architecture is generally defined by its instruction set, but most computer architectures incorporate a large number of common building blocks and concepts, such as registers, arithmetic logic units, and caches. .LP Of necessity, this primer over-simplifies the many details and variations of specific computer designs and architectures. For more information, please see .RN Hennessy96 . .TSTART 1 .so lmbench3_arch.pic .FEND "Architecture diagram" 1 .LP Figure \n[FIGURE] contains a greatly simplified block diagram of a computer. Various important elements, such as the I/O bus and devices, have been left out. The core of the processor consists of the registers (r0, ..., rn and f0, ..., fn) and the arithmetic units (ALU and FPU). In general, the arithmetic units can access data in registers ''instantly''. Often data must be explicitly loaded from memory into a register before it can be manipulated by the arithmetic units. .LP The ALU handles integer arithmetic, such as bit operations (AND, OR, XOR, NOT, and SHIFT) as well as ADD, MUL, DIV, and MOD. Sometimes there is specialized hardware to handle one or more operations, such as a barrel shifter for SHIFT or a multiplier, and sometimes there is no hardware support for certain operations, such as MUL, DIV, and MOD. .LP The FPU handles floating point arithmetic. Sometimes there are separate FPUs for single and double precision floating point operations. .NH 2 Memory hierarchy .LP Nearly all modern, general purpose computers use virtual memory with physically addressed caches. As such, there are typically one or more caches between the physical memory and the processor, and virtual-to-physical address translation occurs between the processor and the top-level cache. Cache staging and replacement are done in \fIcache line\fR units, which are typically several words in length, and caches lower in the hierarchy sometimes have cache lines which are larger than those in the higher caches. .LP Modern processors usually incorporate at least an L1 cache on-chip, and some are starting to also incorporate the L2 cache on-chip. In addition, most include a translation look-aside buffer (TLB) on-chip for fast virtual-to-physical address translation. .LP One key element of any cache design is its replacement strategy. Most caches use either direct-mapped or set associative designs. In the first case any word in physical memory has exactly one cache line into which it may be staged, while set associative caches allow a given word to be cached into one of a set of lines. Direct-mapped caches have a very simple replacement policy: the current contents of the single line where the needed data must be placed are discarded. Set associative caches usually use LRU or some variant within each set, so the least recently used line in the set of possible cache lines is replaced. The control logic for direct-mapped caches is much cheaper to build, but they are generally only as effective as a set-associative cache half the size.\** .FS See .RN Hennessy96 page 396. .FE
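.LP
To make the mapping concrete, the following fragment shows how an
address is commonly decomposed into a byte offset within a cache line,
a set index, and a tag.
This is only an illustrative sketch with assumed cache parameters
(64-byte lines, a 32KB cache, 4-way set associativity); it is not
taken from \*[lmbench], which instead discovers such parameters
experimentally.
.DS
.ps -1
.ft CW
/* Illustrative sketch; the geometry below is assumed, not measured. */
#define LINE_SIZE  64            /* bytes per cache line */
#define CACHE_SIZE (32 * 1024)   /* total cache size in bytes */
#define ASSOC      4             /* lines per set; 1 = direct-mapped */
#define NSETS      (CACHE_SIZE / (LINE_SIZE * ASSOC))

unsigned long
cache_set(unsigned long addr)
{
    /* offset within line: addr % LINE_SIZE           */
    /* tag:                addr / (LINE_SIZE * NSETS) */
    return (addr / LINE_SIZE) % NSETS;   /* set index */
}
.ft
.ps
.DE
With ASSOC set to 1 this reduces to the direct-mapped case, where the
set index alone picks the single line whose contents must be replaced.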
.LP Another key element of memory hierarchy design is the management of dirty data; at what point are writes passed down the memory hierarchy to lower caches and main memory? The two basic policies are write-through and write-back. A write-through policy means that writes are immediately passed through the cache to the next level in the hierarchy, so the lower levels are updated at the same time as the cache. A write-back policy means that the cache line is marked as dirty in the cache, and only when the line is ejected from the cache is the data passed down the hierarchy. Write-through policies are often used in higher (smaller) caches because multiprocessor systems need to keep a coherent view of memory and the writes are often propagated to other processors by \fIsnoopy\fR caches. .LP One often overlooked aspect of cache performance is cache behavior during writes. Most cache lines contain several words, and most instructions only update the line a word at a time. This means that when the processor writes a word to a cache line that is not present, the cache will read the line from memory before completing the write operation. For \*[bcopy]-like operations this means that the overall memory bandwidth requirement is actually two reads and one write per copied word, rather than the expected read and write. .LP Most modern processors now include some form of prefetch in the memory hierarchy. For the most part these are simple systems that can recognize fixed strided accesses through memory, such as might be seen in many array operations. However, prefetching systems appear to be growing in complexity and capability. .LP Additionally, modern memory subsystems can usually support multiple outstanding requests; the level of parallelism is usually dependent on the level of the hierarchy being accessed. Top-level caches can sometimes support as many as six or eight outstanding requests, while main memory can usually support two outstanding requests. Other elements of the memory hierarchy, such as the TLB, often have additional limits on the level of achievable parallelism in practice.\** .FS For example, if the TLB serializes all TLB misses, and if each memory access causes a TLB miss, then the memory accesses will be serialized even if the data was in a cache supporting six outstanding requests. .FE
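.LP
The access pattern below illustrates how this parallelism can be
exposed; it is a simplified sketch in the spirit of the \*[par_mem]
benchmark rather than the actual \*[lmbench] code.
Within one pointer chain every load depends on the result of the
previous load, so a single chain sees the full memory latency, while
several independent chains give the memory system up to that many
misses to work on at once.
.DS
.ps -1
.ft CW
/* Sketch only, not the lmbench source: chain[j] is the head of an  */
/* independent pointer chain that has been threaded through memory. */
typedef struct node { struct node *next; } node;

void
chase(node *chain[], int k, long iterations)
{
    long i;
    int  j;

    for (i = 0; i < iterations; ++i) {
        for (j = 0; j < k; ++j) {
            /* loads for different j are independent */
            chain[j] = chain[j]->next;
        }
    }
}
.ft
.ps
.DE
When the measured time per load stops improving as the number of
chains grows, the knee gives an estimate of the parallelism available
at that level of the hierarchy.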
Other elements of the memory hierarchy, such as the TLB, often have additional limits on the level of achievable parallelism in practice.\** .FS For example, if the TLB serializes all TLB misses, and if each memory access causes a TLB miss, then the memory accesses will be serialized even if the data was in a cache supporting six outstanding requests. .FE .LP For more information and details on memory subsystem design, and computer architecture in general, please see .RN Hennessy96 which has an excellent description of these and many other issues. .NH 2 Some Recent Innovations .LP There are a number of modern extensions to computer architecture that attempt to increase the processor's ability to do several things at once. Nearly all of these enhancements are intended to be invisible to programmers using higher-level languages such as C or Java. .IP "\fBSuperscalar processors\fR" Superscalar processors have multiple processing units which can operate simultaneously. .IP "\fBDynamic instruction reordering\fR" Dynamic instruction reordering allows the processor to execute instructions whose operands are ready before instructions which are stalled waiting for memory or for other instructions to complete. .IP "\fBMemory parallelism\fR" By allowing multiple outstanding memory requests, processors allow the memory subsystem to service multiple (independent) requests in parallel. Since memory accesses are a common performance bottleneck, this can greatly improve performance. .IP "\fBVector processing\fR" Vector processing allows the processor to execute arithmetic operations on vector operands in parallel, and in modern commodity processors goes by names such as MMX, SSE, and 3DNow. .IP "\fBSimultaneous multi-threading (SMT)\fR" SMT allows superscalar processors to simultaneously execute instructions from several threads (contexts) .RN Tullsen96 . SMT may include extensions which allow for very lightweight inter-thread synchronization primitives that enable much finer-grained thread-level parallelism than traditional synchronization methods .RN Tullsen99 . .IP "\fBExplicitly parallel instruction computers (EPIC)\fR" EPIC allows the compiler to explicitly issue $N$ instructions in parallel at each instruction, which informs the hardware that these instructions are independent and may be executed in parallel .RN Schlansker00 . It moves much of the burden regarding dependency checking from the hardware to the compiler. .NH 1 Basic operation latency .LP \*[lmbench3] includes a new micro-benchmark which measures the latency for a variety of basic operations, such as addition, multiplication, and division of integer, float, and double operands. To measure the basic operation latency we construct a basic arithmetic statement containing the desired operands and operations. This statement is repeated one hundred times and these repetitions are then embedded in a loop. .TSTART .TS center box tab (&); c c c l & l & l . Operand&Operation&Statement _ int&$bit$&r^=i;s^=r;r|=s; &$add$&a+=b;b-=a; &$mul$&r=(r*i)^r; &$div$&r=(r/i)^r; &$mod$&r=(r%i)^r; _ float&$add$&f+=f; &$mul$&f*=f; &$div$&f=g/f; _ double&$add$&f+=f; &$mul$&f*=f; &$div$&f=g/f; .TE .TEND "lat_ops statements" .LP Table \n[TABLE] shows the data type and expressions used for each basic operation type. The variable $i$ indicates the integer loop variable and generally changes every ten or one hundred evaluations of the basic expression.
All other variables are of the basic type being measured, and aside from being modified by the relevant expressions are only initialized once at the beginning of the benchmark routine. .LP Each statement has been designed to ensure that the statement instances are \fIinterlocked\fR, namely that the processor cannot begin processing the next instance of the statement until it has completed processing the previous instance. This property is crucial to the correct measurement of operation latency. .LP One important consideration in the design of the statements was that they not be optimized out of the loop by intelligent compilers. Since the statements are repeated one hundred times, the compiler has the option of evaluating the sequence of one hundred repetitions of the same statement, and sometimes it can find optimizations that are not immediately apparent. For example, the integer statement $a=a+a;$ when repeated one hundred times in a loop can be replaced with the single statement $a=0;$ because the statement $a=a+a;$ is equivalent to $a< < =1;$, and one hundred repetitions of that statement are equivalent to $a< < =100;$, which for 32bit (or even 64bit) integers is equivalent to $a=0;$. .LP It is relatively easy to identify floating point statements that interlock, are not optimized away, and that only use the operation of interest. It is much harder to identify integer statements meeting the same criteria. All simple integer bitwise statements either can be optimized away, do not interlock, or use operations other than the one of interest. We chose to add operations other than the operation(s) of interest to the statements. .LP The integer $mul$, $div$, and $mod$ statements all include an added $xor$ operation which prevents (current) compilers from optimizing the statements away. Since the $xor$ operation is generally completed in a single clock tick, and since we can measure the $xor$ operation latency separately and subtract that overhead, we can still measure the latencies of the other operations of interest. .LP It is not possible to measure latency for 64bit operations on 32bit machines because most implementations allow operations on the upper and lower words to overlap. This means that on most 32bit machines, the measured latency would appear to be a non-integral multiple of the basic clock cycle. For example, in the $add$ statement, the system could first add the two lower words. Then, in parallel, it could both add the two upper words (along with the carry from the lower words) and compute the $xor$ of the lower word. Finally, it can overlap the $xor$ of the upper word with the addition of the two lower words from the next instantiation of the statement. .TSTART .TS center box tab (&); c c c c c c c c c c l & l & r & r & r . Operand&Op&HPPA2.0&PIII&AMD &&400MHz&667MHz&1.3GHz _ mhz&&2.50&1.50&0.75 int&$bit$&2.53&1.50&0.75 &$add$&2.50&1.51&0.75 &$mul$&14.52&6.07&3.03 &$div$&109.40&58.52&30.86 &$mod$&75.14&65.01&32.59 _ float&$add$&7.54&4.58&3.0 &$mul$&7.50&7.50&3.0 &$div$&45.00&35.26&13.21 _ double&$add$&7.52&4.53&3.01 &$mul$&7.52&7.71&3.01 &$div$&85.01&35.51&13.16 .TE .TEND "lat_ops results (ns)" .LP Table \n[TABLE] contains some sample results for three processors. It contains one result which is slightly surprising unless you are familiar with the PA-RISC architecture: floating point multiply and divide are faster than the corresponding integer operations!
This is because PA-RISC does not contain integer MUL, DIV, or MOD instructions and the optimizing compiler converts the integers into floating point, does the operations in the floating point unit, and then converts the result back to an integer. .NH 2 Basic operation parallelism .LP Instruction-level parallelism in commodity processors has become commonplace in the last ten years. Modern processors typically have more than one operational unit that can be active during a given clock cycle, such as an integer arithmetic unit and a floating point unit. In addition, processors may have more than a single instance of a given type of operational unit, all of which may be active at the same time. All this intra-processor parallelism is used to try to reduce the average number of clock cycles per executed instruction. .LP \*[lmbench3] incorporates a new benchmark \*[par_ops] which attempts to quantify the level of available instruction-level parallelism provided by the processor. This benchmark is very similar to \*[lat_ops], and in fact uses the same statement kernels, but it has been modified and extended. We create different versions of each benchmark; each version has $N$ sets of interleaved statements. Each set is identical to the equivalent \*[lat_ops] statements. In this way multiple independent sets can be executing the same operation(s) in parallel, if the hardware supports it. .LP For example, the float $mul$ benchmark to measure performance with two parallel streams of statements would look something like this: .DS L \f(CW#define TEN(a) a a a a a a a a a a void benchmark_1(iter_t iterations, void* cookie) { register iter_t i = iterations; struct _state* state = (struct _state*)cookie; register float f0 = state->float_data[0]; register float f1 = state->float_data[1]; while (i-- > 0) { TEN(f0*=f0; f1*=f1;) } use_int((int)f0); use_int((int)f1); }\fP .DE .LP If the processor had two floating point multiply units, then both the $f0$ and $f1$ multiplies could proceed in parallel. .LP However, there are some potential problems with the integer operations, namely the fact that the statements contain mixed operations. In general, processors have at least as many integer units that can do $xor$ as can do the other operations of interest ($mul$, $div$ and $mod$), so the inclusion of $xor$ in the statements shouldn't be a bottleneck. .LP However, since parallelism is measured by comparing the latency of the single stream with that of multiple interleaved streams, and since the single-stream latency includes the $xor$ latency, the apparent parallelism of $mul$, $div$, and $mod$ can be overstated. For example, if a processor has one unit that can do integer bit operations, such as $xor$, and another unit for integer $mul$ operations, then the average latency for $a0 = (i * a0) ^ a0$ in the single stream case would be: .EQ t bar = t sub xor + t sub mul .EN In the multi-stream case, the execution of the $xor$ operation of one stream can be overlapped with the $mul$ of another stream, so the average latency per stream would simply be $t bar = t sub mul$, assuming that $mul$ operations are not cheaper than $xor$ operations, which results in an apparent parallelism $p tilde$: .EQ p tilde = {t sub xor + t sub mul} over { t sub mul } .EN Assuming that $t sub xor < < t sub mul$, this still gives a reasonable approximation to the correct answer. Unfortunately, this is not always a reasonable assumption.
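.LP For comparison, the integer $mul$ kernel with two interleaved streams is built the same way from the integer $mul$ statement; the sketch below mirrors the float example above (the \f(CWbenchmark_2\fP name and the \f(CWint_data\fP field are illustrative placeholders, not the exact \*[lmbench3] source). .DS L \f(CW#define TEN(a) a a a a a a a a a a void benchmark_2(iter_t iterations, void* cookie) { register iter_t i = iterations; struct _state* state = (struct _state*)cookie; register int r0 = state->int_data[0]; /* int_data: hypothetical field */ register int r1 = state->int_data[1]; while (i-- > 0) { /* two independent streams; each interlocks only on itself */ TEN(r0 = (r0 * i) ^ r0; r1 = (r1 * i) ^ r1;) } use_int(r0); use_int(r1); }\fP .DE .LP Each stream still carries its own $xor$, so the single-stream baseline used for comparison includes $t sub xor$ as described above.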
.LP Of course, if it were known ahead of time that $xor$ and { $mul$, $div$, and $mod$ } used different execution units, then the benchmark could simply subtract $t sub xor$ from the baseline measurement. The difficulty lies in determining whether the units overlap or not. .TSTART .TS center box tab (&); c c c c c c c c c c l & l & r & r & r . Operand&Op&HPPA2.0&PIII&AMD &&400MHz&667MHz&1.3GHz _ int&$bit$&1.99&1.70&1.87 &$add$&1.99&1.61&1.90 &$mul$&6.64&3.81&2.00 &$div$&2.81&1.20&1.00 &$mod$&2.78&1.11&1.03 _ float&$add$&5.88&1.00&2.66 &$mul$&5.86&1.14&2.47 &$div$&2.12&1.03&1.14 _ double&$add$&5.68&1.08&2.49 &$mul$&5.58&1.00&2.53 &$div$&2.19&1.03&1.14 .TE .TEND "par_ops results" .LP .NH 1 Memory analysis .LP There are a variety of aspects of memory hierarchy design that are interesting to a software developer, such as the number of caches and their sizes. In addition, other aspects of cache design, such as the line size, associativity, and parallelism, can impact software performance and are of potential interest to software developers. .LP The problem is designing a portable ANSI-C program to infer the cache parameters. A number of operating systems have hooks to report at least certain aspects of cache and memory hierarchy design, but any program utilizing those hooks would not be fully portable across hardware and operating system platforms. .LP The key observation is that caches help reduce memory latency. In a perfect world, all possible data would fit in the cache, so a graph of average memory latency versus amount of memory utilized would look like a series of plateaus separated by cliffs. The cliff edges would be located at the cache boundaries and the plateau heights would be the average memory latencies. .LP The first problem is that one needs a mechanism for accurately measuring time in a portable fashion. \*[lmbench2] introduced a new timing harness that determines the minimum duration of a timing interval for \*[gettimeofday] to provide accurate measurements .RN Staelin98 . .LP \*[lmbench] includes a benchmark that measures average memory latency, \*[lat_mem_rd] .RN McVoy96 . It creates a pointer chain, and then measures the average time to dereference the pointers. \*[lat_mem_rd] creates the pointer chain by simply striding through memory at fixed intervals, e.g. every other word. .LP \*[lmbench2] extended \*[lat_mem_rd] so that each timing interval only accessed memory as many times as necessary to consume the interval. When accessing the cache this often means that the whole pointer chain will be accessed at least once during the timing interval, but when accessing main memory it often means that only a portion of the chain will be accessed during any given timing interval. .LP While this approach gives very useful insights into memory hierarchy performance, it is not quite sufficient to determine the various characteristics of the memory hierarchy. .LP The first problem is that unless the stride is exactly the same size as the cache line, there will either be multiple successive accesses to the same line, or some fraction of the data will be completely skipped. In the first case the observed latency is much lower than the true latency because it is the average of a single miss latency (slow) with one or more hit latencies (fast). In the second case, the amount of data actually loaded into the cache may be a small fraction of the expected amount, so the data may fit into a smaller (faster) cache.
The second problem is that this sequence is highly predictable, even by simple-minded prefetching policies, so accurate prefetching might be masking the true memory latencies. .LP This method does do a few things properly. First of all, accesses to a single page are clustered together so the TLB miss cost (if any) is amortized over as many accesses as possible. Secondly, assuming the pointer chain is laid out unpredictably, the memory subsystem must wait for the previous load to complete before it can initiate the next load, so we can measure the true latency. .NH 2 Prefetching .LP Some memory subsystems have been highly optimized to recognize and automatically prefetch memory when given "predictable" memory access streams, such as those produced by striding through an array. This means that the memory access stream generated by \*[lmbench] must be unpredictable by the standard prediction algorithms. .LP The original \*[lmbench] memory latency benchmark, lat_mem_rd, built a chain of pointers that would stride backwards through memory. This was able to defeat many simple prefetching algorithms of the time, but some systems came to incorporate prefetching algorithms that recognized strided accesses in both directions. .LP The obvious method for producing an unpredictable chain of line references is to use a random permutation of line indexes. .LP \*[lmbench] uses a deterministic algorithm to compute the reference chain which guarantees that references are as far away from previous accesses in both time and space as possible. Basically, the binary bits representing the line index are reversed, so that 1101 becomes 1011, or 001 becomes 100. This only works if the number of cache lines is a power of two, but since page sizes and line sizes are always powers of two, this assumption is valid.\** .FS At least this is the case in every modern system known to the author. .FE .LP Additionally, since higher-level caches can have smaller line sizes than lower-level caches, it is necessary to access every word in the relevant chunk of memory. However, accesses to words in the same line must be separated in time by accesses to the rest of the memory. This is achieved by identifying the line size for the largest cache, and then setting up the chain so that there is one pass through the memory for each word in the line, with the sequence of words being determined by the bit-reversal method described above. .LP For example, suppose a system has 4KB pages, the largest cache has a line size of 64bytes, and a word is 4bytes. Then each page would have 64 lines, and each line would have 16 words. The system would set up a pointer chain that visits each line on each page using the zeroth word; at the end of the chain it would then jump to the start of the pages and visit each line on each page using the eighth word, and so forth until each word had been visited. .NH 2 Dirty data .LP An additional issue that we need to take into account is the cache's policy for dirty data. Many caches use a copy-back policy, while others use a write-through policy. .LP Different caches on the same machine may use different policies. Also, cache performance can be affected by the presence of dirty data. For example, suppose both the L1 and L2 caches use a copy-back policy, and suppose that the access time for reading data located in L2 depends on whether the data being ejected from L1 is dirty and needs to be copied back from L1 to L2 before the read from L2 to L1.
In this case, a benchmark which writes a pointer chain that fits in L2 but is larger than L1, and then measures the time to follow the chain, will get a different average memory latency than a benchmark which writes the same chain and reads enough data to flush the L2 cache before measuring the time to follow the chain. In the first case, each application read will result in a write from L1 to L2 followed by a read from L2 to L1, while in the second case each application read will only result in a read from L2 to L1. .LP Since it is possible that average memory latencies for a read-only access stream may be increased if any of the data in the cache is dirty, we need to flush the cache after setting up the pointer chains and before we do any measurements. Otherwise, when we access a pointer chain that is larger than the L1 cache but smaller than the largest cache, dirty data can reside in the lowest (largest) cache, and as each line is staged from the largest cache to the L1 cache, it is marked as dirty in the L1 cache. Then when each dirty line is flushed from the L1 cache (to the L2 cache), the system has to write the data back to L2, which delays the load of the next (dirty) line from L2 to L1. .LP To flush the cache we read (and sum) a large amount of memory, which should be several times larger than the largest cache. In this way, all dirty data in the cache should be flushed from the cache without creating additional dirty data. .NH 2 Page mapping .LP Complicating the issue still further is the fact that caches do not use full LRU replacement policies. Nearly all caches use some form of set associativity, where pages are directed to a pool of cache lines based on the physical address. Replacement within the pool is typically LRU. Direct-mapped caches are a special case where the pool size is a single line. .LP Additionally, some systems use victim caches, which are typically small caches that hold recently discarded cache lines. Victim caches can be particularly effective for direct-mapped caches by reducing the cache miss rate caused by colliding hot spots. .LP However, page mapping and its attendant cache collisions are under the control of the kernel, and are in fact invisible to user-land programs. Some operating systems make an effort to minimize possible page collisions when giving memory to processes\**, while other operating systems appear to simply grab the first available pages, regardless of potential cache collision effects. .FS This is generally known as "page coloring", and is much more important on systems with direct-mapped caches than those with N-way set associative caches. .FE .LP Factoring out page placement effects on average memory latency is very difficult, but it is necessary to ensure that the correct cache size is identified. .NH 1 Cache line size .LP The first feature of the memory hierarchy we will try to analyze is the cache line size, since we can find the line size for the largest cache without any other knowledge of the system, and since determining nearly all other aspects of the memory subsystem either requires or is greatly simplified by knowing the cache line size. .LP The most obvious aspect of cache design is that replacement is done on a per-line basis, and cache lines often contain several words of data (32-128bytes per line is common). However, it is necessary to ensure that we don't generate "spurious" cache hits by referencing a word from a cache line that was recently accessed.
We must ensure that each line is only re-referenced after all other memory in the buffer has been referenced. .LP Unfortunately, we usually do not know the cache line size ahead of time. In addition, sometimes systems contain several caches, and each cache can use a different line size! Usually line sizes are powers of two, and usually the smaller (higher) caches have line sizes which are the same as or smaller than those of the larger (lower) caches. However, we still need to ensure that we access all cache lines for all caches without generating the spurious cache hits. .LP Determining the cache line size requires a series of experiments. The basic observation is that when the amount of memory being accessed is larger than the cache, and when the access chain is arranged properly, then each memory reference causes a cache miss. If, however, a word on a recently accessed line is requested, then that reference will be a cache hit. More completely, the average memory access time $t bar$ is: .EQ t bar = { t sub miss + ( n - 1 ) t sub hit } over n .EN expressed as a function of $n$, the number of accesses to the cache line, $t sub miss$, the cache miss latency, and $t sub hit$, the cache hit latency. .TSTART .G1 .so memhier-line.d .G2 .FEND "Line Size" .LP We can determine the cache line size by measuring the average memory access latency over a series of memory access patterns: accessing every word, every other word, every fourth word, every eighth word, ... While the system is accessing multiple words per cache line, the average memory latency will be smaller than the cache miss latency, and as the space between accesses increases, the average memory latency will grow. When the system accesses only one word per line, the average memory latency will remain level even as the spacing between accesses increases. .LP It is possible to utilize this behavior to identify the cache line size. The algorithm is to measure the average memory latency when each word is accessed. Then as you increase the space between accessed words (doubling the space each iteration), you look for a situation where the average latency increased dramatically, say greater than 30%, followed by a levelling off on the next iteration, say an increase less than 15%. The line size is the last point where the average latency jumped dramatically. .NH 1 TLB .LP Measuring the TLB-miss costs assumes that one can isolate those costs from the rest of the memory access costs. The key observation is that it is often possible to create a situation in which all data being accessed resides in the cache, and yet it requires a TLB-miss to be able to locate it. .LP This program identifies the effective TLB size, rather than the true TLB size. First of all, from a programmer's point of view, it is really the effective TLB size that impacts program performance. Secondly, there is no way for a user-land program to measure true TLB size because kernels sometimes pin some kernel page mappings into the TLB and because some hardware/OS combinations support "super-pages", or multi-page mappings. .LP We create two similar pointer chains of identical length which reference an identical amount of memory, with one key difference. In the first chain, the data is packed tightly into as few pages as possible, and references remain within a single page as long as possible. The second chain spreads the data over as many pages as possible and jumps between pages at each reference.
The two chains are arranged so that the same amount of data will fit into the cache, so the raw memory access time for each chain is identical, within experimental constraints. The sole difference between the average access costs should be the TLB-lookup times. .LP When the pages from the second chain fit into the TLB, the average access times for the two chains should be identical. However, as soon as the number of pages in the second chain exceeds the TLB size, the second chain will start to pay frequent TLB-miss costs. Depending on the TLB replacement policy, the fraction of requests generating TLB-misses in the second chain can vary dramatically\**. .FS Pure LRU would ensure that as soon as the chain was one page longer than the TLB size, every access would trigger a TLB-miss. However, other replacement algorithms might result in as few as $"number of pages" - "TLB size" + 1$ misses per iteration over the loop. .FE .TSTART .G1 .so memhier-tlb.d .G2 .FEND "TLB" .LP The system must search for the point at which the average memory latency of the second chain diverges from the average latency of the first chain. Since most systems have relatively small TLBs, and since checking TLB sizes smaller than the effective TLB size is faster than checking TLB sizes larger than it, the system starts with a guess of eight pages to establish a baseline. It then iteratively doubles the number of pages until either a maximum limit has been reached or the average TLB-miss cost is greater than 15% of the average memory latency. Once it discovers the upper bound on the possible TLB size, it uses a binary search between the last two TLB size guesses to find the point at which the average latency for the two streams diverges. .NH 1 Cache size .LP For the purpose of identifying the cache size, the ideal situation is that as long as the amount of memory is equal to or less than the cache size, all the data is in the cache and the average memory latency is the cache hit latency. As soon as the memory doesn't fit in the cache, none of it should be in the cache, so the average memory latency is the cache miss latency.\** When examining average memory latency versus memory size, this would give nice flat plateaus for each cache, with nice sharp transitions from one cache to the next, and from the largest cache to main memory. .FS Of course, for real programs, you want the average memory latency to be as low as possible, which means that you want as much of the data in cache as possible. .FE .LP However, the reality is that real data from real systems is corrupted in a variety of ways. First of all, even when the memory can fit into the cache, pages often collide in the cache, and the fraction of pages that have collisions often increases as the amount of memory nears the cache size. Secondly, even when the memory cannot fit into the cache, there can be pages that do not collide. Finally, there is simple experimental noise, which is usually limited to 1% or less. .LP The result of the first two problems is that on some systems, the average memory latency increases gradually as the memory size is increased. There are no flat plateaus and sharp cliffs which make it easy to identify the number, size, and performance of the caches. .NH 2 Page coloring .LP The first problem is to create a set of pages which do not collide in the cache. The solution is to allocate more memory than necessary, and to try different combinations of pages to find the page set with the fastest average memory latency.
Unfortunately, the obvious algorithm is exponential in the number of pages. .TSTART .G1 .so memhier-color.d .G2 .FEND "Page Coloring Effects" .LP One observation is that cache misses are usually much more expensive than cache hits. So, one possibility is to choose a random set of pages as the baseline and measure the average memory latency. Then iterate over the pages, removing each page in turn from the set and measuring the average memory latency of the reduced set. If the removed page collides with another page, then the average memory latency for the reduced set should be smaller than the average latency for the whole set. .LP Once a page that collides has been identified, the system can iterate through the available pages, adding each one to the reduced set in turn and measuring the average memory latency. If the page doesn't collide with any pages in the reduced set, then the average memory latency should drop still further. In this way, the system could identify all colliding pages and replace them with pages that don't collide (assuming the memory all fits in the cache). .LP There are a number of problems with this simple approach. First of all, it would take a very long time to run due to the large, but polynomial, number of experiments required. Secondly, as the memory size increases and the number of pages involved gets large, the effect of a single page on the average memory latency can reach the level of experimental noise. .LP This approach makes the assumption that physical page locations do not change once the memory has been allocated. In most systems, this assumption is valid unless the memory is paged to disk. However, at least IRIX includes an operating system configuration option to allow the operating system to dynamically relocate pages in memory. This capability is disabled by default, so its use is relatively uncommon. It is possible that page relocation will become more common in the future, in which case this design may need to be revisited. .LP Our algorithm uses this basic approach, but attempts to reduce the number of experiments required by removing chunks of pages at a time. It will remove up to 5% of the pages at a time and see if the average memory latency decreases significantly, in which case it examines the chunk a page at a time to find the page or pages which probably conflict. .LP An additional problem is that for large caches, the difference between two sets of pages that differ by just one page collision can be very hard to measure. For example, on a system with a 512Kbyte L2 cache and 4Kbyte pages, the cache can hold 128 pages. Assuming that a cache miss is 200ns, a cache hit is 50ns, and 123 pages have no collisions but 5 pages collide, then the average memory latency is .EQ t bar = { 123 times 50 + 5 times 200 } over 128 .EN or 55.86ns. Suppose we remove one page and replace it with another page which doesn't collide, so we now have 4 collisions and 124 pages without collisions; then the average memory latency is 54.69ns. The difference is generally significant even in the face of experimental noise, but for larger caches the differences may recede into the background noise. .LP As caches increase in size, the problems associated with detecting page collisions can only increase. For example, an 8MB cache on a system with 4KB pages would contain 2,048 pages.
Removing a single page collision, even when the resulting memory latency for that page is reduced by a factor of four, would simply result in an overall reduction in average memory latency of less than 0.2%, which is smaller than the average experimental measurement errors. .LP Additionally, as caches increase in size, effects such as cache consumption by the page table can begin to become important. .LP The single largest remaining problem in our system is that this algorithm does not guarantee that it finds a set of pages with no collisions in all cases where it \fImight\fR find such a set. It merely does so \fImost\fR of the time with (relatively) few measurements. .LP One possible means of dealing with this problem is to try removing sets of pages in the hope that enough pages from a set of colliding pages will be removed at once, so that the remaining pages from that collision set won't collide anymore. Suppose you have a 4-way set associative cache, and that you have six pages that collide. If you remove two of the pages, then the remaining four pages don't collide anymore either. This means that by removing two pages we have removed six collisions, which should be easier to detect. .LP XXX Look into randomizing the pages after each iteration of the top-level loop to make this sort of serendipitous event more likely. .NH 2 Measurement .LP In order to reduce the number of memory sizes that are measured by the system, we use a binary search on memory sizes to find "edges" in the memory latency. We make the simplifying assumption that cache sizes are either a power of two, or 1.5 times a power of two. In our experience, this assumption has been true. We also assume that no cache is smaller than 512 bytes. .LP We explore the memory space at intervals equivalent to the most recent power of two divided by four. So, starting at one megabyte we would (potentially) measure memory latency at 1MB, 1.25MB, 1.5MB, and 1.75MB. This allows us to detect cache sizes at the desired intervals, since the measurement at the exact cache size can often be corrupted by other system activity, so the next smaller measurement should still be valid. .LP XXX If the measurement size increment is several times larger than a page, then perhaps we should actually measure the system with a couple of pages less than the stated size? This would allow us some "slop" for collisions and might make it easier near cache boundaries to get accurate measurements. The "slop" should probably be some fraction of the measurement increment size, such as 10%, so it scales properly. .LP Since we start with a maximum size as a given and use 512 bytes as a minimum, we can compute the full set of possible measurements and initialize an array with the desired sizes (see the sketch at the end of this section). We can then use a modified binary search on this array to efficiently locate cache edges while still (potentially) leaving large, flat plateaus unexplored between the end points. .LP Finally, we assume that true memory latency is monotonically increasing with the amount of memory that you access. This means that if the measured latency ever decreases as you increase the amount of accessed memory, then the previous measurement must have been an error and the value is replaced by the smaller measurement.
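.LP To make the enumeration of candidate sizes concrete, a fragment along the following lines (an illustrative sketch, not the \*[lmbench3] source; \f(CWmax\fP, \f(CWsizes\fP, and \f(CWnsizes\fP are hypothetical names) generates the quarter-power-of-two sizes described above, from 512 bytes up to an assumed maximum; it is this array that the modified binary search walks. .DS L \f(CWsize_t max = 8 * 1024 * 1024; /* assumed upper bound on cache size */ size_t sizes[64], p; int i, nsizes = 0; for (p = 512; p <= max; p *= 2) { /* candidates p, 1.25p, 1.5p, and 1.75p, capped at max */ for (i = 0; i < 4 && p + i * (p / 4) <= max; ++i) sizes[nsizes++] = p + i * (p / 4); }\fP .DE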
.NH 2 Data analysis .LP Assuming the data collected by the system were noise-free and that the experimental system had managed to eliminate all artifacts such as page coloring effects, the next problem is to analyze the data to find the number and size of the caches. Basically this means examining the data to find plateaus and cliffs. Each plateau would represent a cache, and each cliff represents the edge (size) of a cache. .LP Of course, real data is never perfect, and there are any number of issues which can affect the experimental results, so the analysis methodology must be robust to noise. .LP XXX describe analysis methodology here .NH 1 Cache associativity .LP No modern caches are fully associative, meaning that no caches use true LRU replacement across the entire cache, because the performance overhead for full LRU is so severe. Most caches are either set associative or direct mapped, meaning that data from a given location can only go to one of a small number of cache lines, and in the case of a direct-mapped cache to a single cache line. .LP To determine the cache associativity we need to find a set of pages which have no page collisions and which (just) fit into the cache. We then need to locate a page which collides with these pages and append it to the set. Then we can iterate through the pages in the initial page set, removing a page at a time, and comparing the resulting average memory latency with that of the full set. When the average memory latency drops significantly, we know that this page conflicts with the full page set, and since the page set only has one conflict, we know it conflicts with the newly introduced page. The number of pages that conflict with this newly introduced page is the set associativity. .LP There is a potential bug in this algorithm for systems with victim caches! If the victim cache can hold at least a page of data, then this algorithm cannot properly determine the cache associativity because the victim cache will play the role of additional associative cache lines. .LP For smaller caches there is the additional problem that the cache associativity may not be smaller than the number of pages that the cache can hold, in which case this simple approach will never find pages that collide in the cache. The solution to this problem is to increase the line size and the number of pages so that only portions of each page are accessed, and there can be enough pages to create collisions. .NH 1 Memory parallelism .LP With the increasing memory bottleneck, most modern systems allow multiple outstanding memory references. On many systems, the effective parallelism depends on which part of the memory hierarchy is being accessed. For example, L1 caches can often service as many as six or eight outstanding requests, while main memory systems can usually support at most two outstanding requests. .LP To measure the available parallelism for a given chunk of memory, the system sets up a pointer chain running through the memory exactly the same as if it were to measure the average memory latency. It then uses fifteen different access routines, one for each possible level of parallelism.\** .FS The assumption here is that no memory subsystem supports more than sixteen accesses in parallel. .FE Each routine dereferences $N$ pointers in parallel.
For example, the inner loop of the routine where $N=2$ would look something like this: .DS L \f(CWwhile (iterations-- > 0) { p0 = (char**)*p0; p1 = (char**)*p1; }\fP .DE .LP The available parallelism is the maximum speedup over all $N$ compared to the sequential case. .LP Note that this value is often not integral because many factors, such as TLB contention, can limit the effective parallelism. .NH 1 DRAM pages .LP Within DRAM chips there are usually one or more lines of data which are "cached" in registers near the chip outputs. Accessing data contained in these lines is typically faster than accessing data from the body of the DRAM chip. The set of memory contained in a bank of DRAM chips for a single line (per DRAM chip) of memory is usually called a DRAM page. .LP Recently some systems have started taking advantage of this potential performance increase by keeping DRAM pages "open" (in the register bank) after an access in the hope that the next access will be to the same page. This means that main memory latency suddenly depends on the access history, and that dramatic differences in "open" versus "closed" DRAM page performance may impact software and data structure design. .LP To measure DRAM page latency, we need to compare performance for accesses to "open" versus "closed" DRAM pages. The standard pointer chain developed for measuring cache and memory latency maximizes "open" DRAM page accesses while minimizing other overheads, such as TLB misses. This means that we need to develop another pointer chain which maximizes "closed" DRAM accesses while still minimizing other overheads such as TLB misses. .LP This can be done by clustering pages into \fIgroups\fP whose size is smaller than the TLB size. Within each group the pointer chain switches pages on each access to maximize the probability of a "closed" DRAM page access. For all but the last page in the group, each access points to the same location on the next page. The last page points to the next location in the first page, using the same bit-reversal location selection logic used in the standard pointer chain. .NH 1 Conclusion .LP XXX Update conclusions \*[lmbench] is a useful, portable micro-benchmark suite designed to measure important aspects of system performance. We have found that a good memory subsystem is at least as important as the processor speed. As processors get faster and faster, more and more of the system design effort will need to move to the cache and memory subsystems. .NH 1 Acknowledgments .LP Many people have provided invaluable help and insight into both the benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers were especially helpful. We thank all of them and especially thank: Wayne Scott \s-1(BitMover)\s0, Larry McVoy \s-1(BitMover)\s0, Bruce Chapman \s-1(SUN)\s0, and John McCalpin \s-1(Univ. of Virginia)\s0. .LP We would also like to thank all of the people that have run the benchmark and contributed their results; none of this would have been possible without their assistance.
.NH 1 Obtaining the benchmarks .LP The benchmarks are available at: .QP \fIhttp://ftp.bitmover.com/lmbench\fP .ft .\" .R1 .\" bibliography references-memhier .\" .R2 .\"******************************************************************** .\" Redefine the IP paragraph format so it won't insert a useless line .\" break when the paragraph tag is longer than the indent distance .\" .de @IP .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) .par*start \\n[\\n[.ev]:ai] 0 .if !'\\$1'' \{\ . \" Divert the label so as to freeze any spaces. . di par*label . in 0 . nf \&\\$1 . di . in . fi . chop par*label . ti -\\n[\\n[.ev]:ai]u . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c . el \{\ \\*[par*label] .\". br . \} . rm par*label .\} .. .\"******************************************************************** .\" redefine the way the reference tag is printed so it is enclosed in .\" square brackets .\" .de ref*end-print .ie d [F .IP "[\\*([F]" 2 .el .XP \\*[ref*string] .. .\"******************************************************************** .\" Get journal number entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-N .ref*field N "" ( ) .. .\"******************************************************************** .\" Get journal volume entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-V .ref*field V , "" "" "" .. .\"******************************************************************** .\" Get the date entry right. Should not be enclosed in parentheses. .\" .de ref*add-D .ref*field D "," .. .R1 accumulate sort A+DT database references-memhier label-in-text label A.nD.y-2 bracket-label [ ] ", " bibliography references-memhier .R2 .\" .so bios lmbench-3.0-a9/doc/mhz.80000664000076400007640000000144007172615471014632 0ustar staelinstaelin .\" $Id$ .TH MHZ 8 "$Date$" "(c)1994-2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME mhz \- calculate processor clock rate .SH SYNOPSIS .B mhz .I [-c] .SH DESCRIPTION .B mhz calculates the processor clock rate in megahertz. It uses an unrolled, interlocked loop of adds or shifts. So far, superscalarness has been defeated on the tested processors (SuperSPARC, RIOS, Alpha). .SH OUTPUT Output format is either just the clock rate as a float (-c) or more verbose .sp .ft CB 39.80 Mhz, 25 nanosec clock .ft .LP .B mhz is described more completely in ``mhz: Anatomy of a microbenchmark'' in .I "Proceedings of 1998 USENIX Annual Technical Conference", June 1998. .SH "SEE ALSO" lmbench(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome.
lmbench-3.0-a9/doc/par_mem.80000664000076400007640000000524710425064544015457 0ustar staelinstaelin .\" $Id$ .TH PAR_MEM 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME par_mem \- memory parallelism benchmark .SH SYNOPSIS .B par_mem [ .I "-L <line size>" ] [ .I "-M <len>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B par_mem measures the available parallelism in the memory hierarchy, up to .I len bytes. Modern processors can often service multiple memory requests in parallel, while older processors typically blocked on LOAD instructions and had no available parallelism (other than that provided by cache prefetching). .B par_mem measures the available parallelism at a variety of points, since the available parallelism is often a function of the data location in the memory hierarchy. .LP In order to measure the available parallelism .B par_mem conducts a variety of experiments at each memory size; one for each level of parallelism. It builds a pointer chain of the desired length. It then creates an array of pointers which point to chain entries which are evenly spaced across the chain. Then it starts running the pointers forward through the chain in parallel. It can then measure the average memory latency for each level of parallelism, and the available parallelism is the average memory latency for parallelism 1 divided by the minimum average memory latency across all levels of parallelism. .LP For example, the inner loop which measures parallelism 2 would look something like: .sp .ft CB for (i = 0; i < N; ++i) { p0 = (char **)*p0; p1 = (char **)*p1; } .ft .sp in a .I for loop (the overhead of the .I for loop is not significant; the loop is an unrolled loop 100 loads long). In this case, if the hardware can process two LOAD operations in parallel, then the overall latency of the loop should be equivalent to that of a single pointer chain, so the measured parallelism would be roughly two. If, however, the hardware can only process a single LOAD operation at once, or if there is (significant) resource contention between the two LOAD operations, then the loop will be much slower than a loop with a single pointer chain, so the measured parallelism will be less than two, and probably no smaller than one. .SH OUTPUT Output format is intended as input to \fBxgraph\fP or some similar program (we use a perl script that produces pic input). There is a set of data produced for each stride. The data set title is the stride size and the data points are the array size in megabytes (floating point value) and the load latency over all points in that array. .SH "SEE ALSO" lmbench(8), line(8), cache(8), tlb(8), par_ops(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome.
lmbench-3.0-a9/doc/par_ops.80000664000076400007640000000214510425064603015470 0ustar staelinstaelin .\" $Id$ .TH PAR_OPS 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME par_ops \- basic CPU operation parallelism .SH SYNOPSIS .B par_ops [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B par_ops measures the available parallelism for basic CPU operations, such as integer ADD. Results are reported as the average operation latency divided by the minimum average operation latency across all levels of parallelism. .TP integer bit, add, mul, div, mod operations; maximum parallelism for integer XOR, ADD, MUL, DIV, MOD operations. .TP uint64 bit, add, mul, div, mod operations; maximum parallelism for uint64 XOR, ADD, MUL, DIV, MOD operations. .TP float add, mul, div operations; maximum parallelism for float ADD, MUL, DIV operations. .TP double add, mul, div operations; maximum parallelism for double ADD, MUL, DIV operations. .SH BUGS This benchmark is highly experimental and may sometimes (frequently?) give erroneous results. .SH "SEE ALSO" lmbench(8), lat_ops(8), par_mem(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. lmbench-3.0-a9/doc/parallel.ms0000664000076400007640000002525707414075675016105 0ustar staelinstaelin .\" This document is GNU groff -mgs -t -p -R -s .\" It will not print with normal troffs, it uses groff features, in particular, .\" long names for registers & strings. .\" Deal with it and use groff - it makes things portable. .\" .\" $X$ xroff -mgs -t -p -R -s $file .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr .VARPS .\" Define a page top that looks cool .\" HELLO CARL! To turn this off, s/PT/oldPT/ .de draftPT .\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP' .. .de lmPT .if \\n%>1 \{\ . sp -.1i . ps 14 . ft 3 . nr big 24 . nr space \\w'XXX' . nr titlewid \\w'\\*[title]' .
nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 . ce 1 \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] . ps . sp -.70 . ps 12 \\l'\\n[LL]u' . ft . ps .\} .. .\" Define a page bottom that looks cool .\" HELLO CARL! To turn this off, s/BT/oldBT/ .de draftBT .\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP' .. .de lmBT . ps 9 \v'-1'\\l'\\n(LLu' . sp -1 . tl '\(co 2001 \\*[author]'\\*(DY'%' . ps .. .de SP . if t .sp .5 . if n .sp 1 .. .de BU . SP . ne 2 \(bu\ . if \\n[.$] \fB\\$1\fP\\$2 .. .nr FIGURE 0 .nr TABLE 0 .nr SMALL .25i .de TSTART . KF . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 . ps -1 . vs -1 .. .de TEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr TABLE \\n[TABLE]+1 . ce 1 \fBTable \\n[TABLE].\ \ \\$1\fP . SP . KE .. .de FEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr FIGURE \\n[FIGURE]+1 . ce 1 \fBFigure \\n[FIGURE].\ \ \\$1\fP . SP . KE .. .\" Configuration .nr PI 3n .nr HM 1i .nr FM 1i .nr PO 1i .if t .po 1i .nr LL 6.5i .if n .nr PO 0i .if n .nr LL 7.5i .nr PS 10 .nr VS \n(PS+1 .ds title Utilizing instruction-level parallelism .ds author Carl Staelin .ds lmbench \f(CWlmbench\fP .ds lmbench3 \f(CWlmbench3\fP .ds lmdd \f(CWlmdd\fP .ds bcopy \f(CWbcopy\fP .ds connect \f(CWconnect\fP .ds execlp \f(CWexeclp\fP .ds exit \f(CWexit\fP .ds fork \f(CWfork\fP .ds gcc \f(CWgcc\fP .ds getpid \f(CWgetpid\fP .ds getpid \f(CWgetpid\fP .ds gettimeofday \f(CWgettimeofday\fP .ds kill \f(CWkill\fP .ds memmove \f(CWmemmove\fP .ds mmap \f(CWmmap\fP .ds popen \f(CWpopen\fP .ds read \f(CWread\fP .ds stream \f(CWstream\fP .ds system \f(CWsystem\fP .ds uiomove \f(CWuiomove\fP .ds write \f(CWwrite\fP .ds yield \f(CWyield\fP .ds select \f(CWselect\fP .ds lat_ops \f(CWlat_ops\fP .ds benchmp \f(CWbenchmp\fP .ds lat_connect \f(CWlat_connect\fP .\" References stuff .de RN \"Reference Name: .RN $1 -- prints the reference prettily .\" [\s-2\\$1\s+2]\\$2 [\s-1\\$1\s0]\\$2 .. .\" .R1 .\" sort A+DT .\" database references .\" label-in-text .\" label A.nD.y-2 .\" bracket-label \*([. \*(.] ", " .\" .R2 .EQ delim $$ .EN .TL \s(14Utilizing instruction-level parallelism\s0 .AU \s+2\fR\*[author]\fP\s0 .AI \fI\s+2Hewlett-Packard Laboratories Israel\s0\fP .SP .AB Modern processors and systems provide a great deal of parallelism, even for traditional single-threaded software. Often this parallelism is hidden, but the potential performance benefits of restructuring software to allow the hardware to utilize this parallelism can be striking. For example, modern memory systems can usually support at least two outstanding requests to main memory, and as many as six or eight outstanding requests to cache memory. Since memory latencies can account for a significant fraction of many programs' runtime, restructuring data structures and algorithms so that strictly sequential memory accesses can be parallelized can greatly improve performance. .AE .if t .MC 3.05i .NH 1 Introduction .LP Computer scientists are generally taught some basic computer architecture and a set of standard data structures and algorithms, such as lists, hash tables, and binary search. These data structures and algorithms are commonly used, and in many programs their handling can consume a significant fraction of the overall runtime.
However, these data structures and algorithms were designed over thirty years ago, when most processors had no parallelism. .LP There has been a great deal of work by compiler writers and computer architects on automatically discovering and utilizing instruction-level parallelism in existing software, but relatively little work has been done on examining data structures and algorithms that can enable increased instruction-level parallelism. .LP There has been a great deal of work focussing on developing parallel algorithms for multi-processor machines, with explicit synchronization primitives such as semaphores and barriers. At this level of parallelism, the overheads are generally so high that the parallelism must be fairly coarse-grained, or else the overhead costs consume any benefits provided by the parallelism. .LP However, instruction-level parallelism is "free"; it is managed by the hardware and incurs no additional runtime costs. The main question is how to structure software algorithms and data structures to maximize the available parallelism. .NH 1 Prior work .LP Over the last few years, there has been some work on improving the performance of critical software in an architecture-sensitive manner. .LP .RN Agarwal96 describes the design and implementation of a fast sorting algorithm for superscalar RISC machines. .LP The Automatically Tuned Linear Algebra Software (ATLAS) .RN Whaley98 contains a number of parametrized code generators for matrix multiply operations, as well as a pluggable architecture to allow developers to add hardware-specific modules. ATLAS then explores the parameter space to find the optimal parameter settings for the particular system. .LP FFTW .RN Frigo98 is another project which uses architecture-aware optimizations. .NH 1 Computer architecture primer .LP A processor architecture is generally defined by its instruction set, but most computer architectures incorporate a large number of common building blocks and concepts, such as registers, arithmetic logic units, and caches. .NH 2 Traditional architecture .LP One view of a traditional architecture might be the MIX system defined by Knuth in his classic work on algorithms and data structures .RN Knuth73 . While the MIX instruction set and architecture does not forbid parallelism, there is no explicit parallelism mentioned in the description. Consequently, none of the algorithms assumes any instruction-level parallelism, nor is any structured to explicitly utilize such parallelism had it existed. .LP The MIX system has a single arithmetic logic unit, and no floating point unit, so there is no explicit instruction-level parallelism specified in the architecture. .NH 2 Modern Extensions .LP There are a number of modern extensions to computer architecture that attempt to increase the processor's ability to do several things at once. Nearly all of these enhancements, with the notable exception of the EPIC work, are intended to be invisible to the average programmer. Most notably, they do not require changing the instruction set. .IP "Superscalar processors" Superscalar processors have multiple processing units which can operate simultaneously. .IP "Dynamic instruction reordering" Dynamic instruction reordering allows the processor to execute instructions whose operands are ready before instructions which are stalled waiting for memory or for other instructions to complete.
.IP "Memory parallelism"
By allowing multiple outstanding memory requests, processors allow
the memory subsystem to service multiple (independent) requests in
parallel.
Since memory accesses are a common performance bottleneck, this can
greatly improve performance (a short C sketch illustrating this
appears at the end of this paper).
.IP "Vector processing"
Vector processing allows the processor to execute arithmetic
operations on vector operands in parallel, and in modern commodity
processors goes by names such as MMX, SSE, and 3DNow.
.IP "Simultaneous multi-threading (SMT)"
SMT allows superscalar processors to simultaneously execute
instructions from several threads (contexts)
.RN Tullsen96 .
SMT may include extensions which allow for very lightweight
inter-thread synchronization primitives that enable much
finer-grained thread-level parallelism than traditional
synchronization methods
.RN Tullsen99 .
.IP "Explicitly parallel instruction computers (EPIC)"
EPIC allows the compiler to explicitly issue $N$ instructions in
parallel at each step, which informs the hardware that these
instructions are independent and may be executed in parallel
.RN Schlansker00 .
It moves much of the burden of dependency checking from the hardware
to the compiler.
.NH 1
Conclusion
.LP
With the increasing proliferation of both explicit and hidden
parallelism in processor and memory system designs, it is becoming
important to revisit many data structures and algorithms to adapt
them to the new hardware environment.
.NH 1
Acknowledgments
.LP
Many people have provided invaluable help and insight into both the
benchmarks themselves and the paper.
We thank all of them and especially thank Larry McVoy
\s-1(BitMover)\s0 for the lively conversations and discussions
regarding benchmarking and experimental design.
.\" .R1
.\" bibliography references-parallel
.\" .R2
.\"********************************************************************
.\" Redefine the IP paragraph format so it won't insert a useless line
.\" break when the paragraph tag is longer than the indent distance
.\"
.de @IP
.if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2)
.par*start \\n[\\n[.ev]:ai] 0
.if !'\\$1'' \{\
.	\" Divert the label so as to freeze any spaces.
.	di par*label
.	in 0
.	nf
\&\\$1
.	di
.	in
.	fi
.	chop par*label
.	ti -\\n[\\n[.ev]:ai]u
.	ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c
.	el \{\
\\*[par*label]
.\".	br
.	\}
.	rm par*label
.\}
..
.\"********************************************************************
.\" redefine the way the reference tag is printed so it is enclosed in
.\" square brackets
.\"
.de ref*end-print
.ie d [F .IP "[\\*([F]" 2
.el .XP \\*[ref*string]
..
.\"********************************************************************
.\" Get journal number entries right. Now will print as V(N) rather
.\" than the awful V, N.
.\"
.de ref*add-N
.ref*field N "" ( )
..
.\"********************************************************************
.\" Get journal volume entries right. Now will print as V(N) rather
.\" than the awful V, N.
.\"
.de ref*add-V
.ref*field V , "" "" ""
..
.\"********************************************************************
.\" Get the date entry right. Should not be enclosed in parentheses.
.\"
.de ref*add-D
.ref*field D ","
..
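.LP
To make the memory-parallelism point concrete, the following sketch
(a hypothetical fragment, not code taken from \*[lmbench] itself)
sums the values in linked-list nodes in two ways: by chasing a single
pointer chain, where every load depends on the previous one, and by
walking two independent chains at once, which lets the memory system
overlap the cache misses.
On hardware that supports two outstanding misses, the second form can
approach twice the throughput of the first, assuming the data can be
split into two independent lists.
.DS
.ft CW
.ps -2
struct node { struct node *next; int val; };

/* one dependent load chain */
int
sum1(struct node *p)
{
	int s = 0;
	for (; p; p = p->next)
		s += p->val;
	return (s);
}

/*
 * two independent load chains; "a" and "b"
 * are assumed to hold disjoint halves of
 * the data
 */
int
sum2(struct node *a, struct node *b)
{
	int s = 0;
	while (a && b) {
		s += a->val + b->val;
		a = a->next;
		b = b->next;
	}
	for (; a; a = a->next) s += a->val;
	for (; b; b = b->next) s += b->val;
	return (s);
}
.ps
.ft
.DE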
.R1
accumulate
sort A+DT
database references-parallel
label-in-text
label A.nD.y-2
bracket-label [ ] ", "
bibliography references-parallel
.R2
.\" .so bios
lmbench-3.0-a9/doc/pgraph.1 0ustar staelinstaelin
.\" $Id$
.de DS
.	sp .5
.	nf
.	in +4
.	ft CW
.	vs -1
..
.de DE
.	sp .5
.	fi
.	in
.	ft
.	vs
..
.TH PGRAPH 1 "Nov, 1995" "lm@sgi.com" "Documentation tools"
.SH NAME
pgraph \- compile graphs into pic input
.SH SYNOPSIS
.B pgraph
[ options ] [
.I filename
\&.\|.\|. ]
.SH DESCRIPTION
.LP
.B pgraph
is a perl script which takes sets of X Y data and generates a
(human readable) pic program that will produce the graphed data.
The output is designed such that you can save it in a file and tweak
it to fit your document.
Try one and look at the output; it is actually commented.
.LP
The graph is autosized and auto-ticked.
.LP
The input data format is similar to that of xgraph(1), i.e.,
.DS
"sloped across
1 1
2 2
3 3
"straight across
1 4
2 4
3 4
.DE
.SH "CONTROL OPTIONS"
.LP
You may set the graph title, the X title, and the Y title with the
following control sequences in the data stream:
.DS
%T	Graph title in +4 point font
%X	X axis title and/or units in +2 point font
%Y	Y axis title and/or units in +2 point font
%fakemax-X <value>	force graph to be that big
%fakemax-Y <value>	force graph to be that big
%fakemin-X <value>	force graph to be that small
%fakemin-Y <value>	force graph to be that small
.DE
.SH OPTIONS
.IP -rev 12
reverse X/Y data sense (and titles).
Note this is done after processing any fudging of the input data
stream(s) (see -xk, -yk, -logx, etc. below).
.IP -below
put data set titles below the graph rather than to the right.
.IP -close
no extra space around the data's endpoints.
.IP -qline
connect the quartile center points.
.IP -grid
dotted line grid marks.
.IP -nobox
no box around whole graph.
.IP -big
make the graph take the whole page, and be about 8 inches tall by 7
inches wide, with the title +8 points.
.IP -slide
make the graph be 4.25 inches square to fit in slides, in a helvetica
bold 10 point font.
.IP -small
make the graph be small, 1.75 inches square, and use an 8 point bold
font.
.IP -grapheach
draw each data set in its own graph.
.IP -nolabels
no X/Y/Title labels.
.IP -notitle
no Title label.
.IP -nodatal
no data set labels.
.IP -nomarks
do not mark each data point with distinct markers (endpoints are
still marked).
.IP -k
print values larger than 1000 as value/1000.
.IP -xk
multiply X input by 1024 (blech).
.IP -yk
multiply Y input by 1024 (blech).
.IP -xm
multiply X input by 1024*1024 (blech).
.IP -ym
multiply Y input by 1024*1024 (blech).
.IP -logx
convert X input into log base 2 of X input.
.IP -logy
convert Y input into log base 2 of Y input.
.SH EXAMPLE
Workstation price performance from a Digital ad.
Process with
.DS
.ps -2
graph -rev workstations | groff -TX75
%T Workstation Price / Performance, 6/93
%X SPECINT 92 Performance
%Y Price in $1000's
"Dec AXP line
35 5
65 10
78 15
110 70
"Sun SPARC line
25 4
25 8
38 16
48 21
52 23
64 27
.DE
.ps
.SH "QUARTILE FORMAT"
Data points are \f(CBx y1 y2 y3 y4 y5\fP.
You get two lines from the first two y values, a mark at the third,
and another line from the last two.
.SH "SEE ALSO"
.BR gtroff (1),
.BR gpic (1),
.BR perl (1).
.SH BUGS
-grapheach assumes the set of N graphs will fit on one page.
.LP
Since it is just a simple perl script, I tend to be constantly adding
one more feature on the fly.
Consult the script for the latest set of options.
Development is typically done by using the closest set of options to
generate the graph, massaging the graph to do what you want, and then
adding that set of changes as a new option.
.LP
This isn't done as much as I would like.
It isn't integrated with the groff preprocessor yet.
It doesn't know about .GS/.GE things.
I use it to manually generate a pic file and then include that.
.LP
I need to include some example data sets with pgraph.
lmbench-3.0-a9/doc/rccs.1 0ustar staelinstaelin
.\" $Id$
.de DS
.	sp .5
.	nf
.	in +4
.	ft CW
.	vs -1
..
.de DE
.	sp .5
.	fi
.	in
.	ft
.	vs
..
.TH RCCS 1 "Nov, 1995" "lm@sgi.com" "Programmer's tools"
.SH NAME
rccs \- apply RCS commands to sets of files
.SH SYNOPSIS
.B rccs
command [ options ] [
.I filename
and/or
.I directory
\&.\|.\|. ]
.SH DESCRIPTION
.LP
.B rccs
is a perl script that tries to emulate the Berkeley \fBSCCS\fP
program for \fBRCS\fP.
If your fingers know how to type commands to \fBSCCS\fP, just do the
same thing to \fBrccs\fP.
.LP
A subset of the \fBSCCS\fP commands is implemented, the ones that I
use.
Some new commands have been added.
It is easy to add more commands; see the \fIExample\fP routine at the
bottom of \fBrccs\fP to see how.
.LP
This interface does not require a list of files/directories for most
commands; the implied list is *,v and/or RCS/*,v.
Destructive commands, such as clean -f, unedit, and unget, do
\fBnot\fP have an implied list.
In other words, \f(CBrccs diffs\fP is the same as
\f(CBrccs diffs RCS\fP but \f(CBrccs unedit\fP is not the same as
\f(CBrccs unedit RCS\fP.
.SH COMMANDS
.IP options 8
Note that RCS options are typically passed through to RCS.
The options that made sense to SCCS commands are translated to RCS
options.
.IP "ci" 10
Alias for delta.
Checks in files.
.IP "clean [-e] [-f] [-d|y'message'] [files]"
Without any arguments, this command removes all files that are read
only and have an associated RCS file.
With the -e argument, clean removes files that have been checked out
writable but have not been modified.
The -d|y|m option may be combined with -e to check in the set of
files that have been modified.
With the -f option, clean removes all working files, \fBincluding\fP
files that have been modified since the check out.
Be careful.
.IP co
Alias for get.
Checks out files.
.IP "create [-y|d'message'] [-g] files"
Initial check in of files to the RCS system.
The files are then checked out readonly unless the -g option is
present.
The -y or -d options may be used to set the descriptive text message.
Differs from SCCS in that the original files are not preserved.
.IP deledit
Alias for delta followed by a get -e.
.IP delget
Alias for delta followed by a get.
.IP "delta [-y|d'message'] [-q] [files]"
Check in a delta of the file.
-q is changed to RCS' -s and means to be quiet about what is
happening.
-y'message' or -d'message' or -m'message' all get sent through to RCS
as the check in message.
No other arguments are translated.
.IP "diffs [-C|c] [-r<rev>] [-sdiff] [files]"
Shows changes between the working files and the RCS file.
Note that the files do not need to be checked out, only writable.
-C or -c means do a context diff.
-sdiff means do a side-by-side diff.
The sdiff option will figure out your screen width if it knows how
\- see the source to make this work on your system.
.IP edit
Alias for get -e.
.IP enter
Alias for create -g.
.IP fix
Useful if you just checked in the file and then realized you forgot
something.
The fix command will remove the top delta from the history and leave
you with an editable working file with the top delta as the contents.
.IP "get [-e] [-p] [-k] [-s] [files]"
Get, or check out, the file.
Without any options, get just gets the latest revision of the RCS
file in the working file.
With -e, check out the file writable.
With -p, send the file to stdout.
With -k, suppress expansion of keywords.
With -s, be quiet about what is happening.
.IP help
Get a brief help screen of information.
.IP "history [files]"
Print the RCS history (my format) of the specified files.
.IP "info [files]"
Print the list of files being edited.
.IP print
Alias for a loop that prints the history of each file followed by the
contents of the file.
.IP prs
Alias for history.
.IP prt
Alias for history.
.IP unedit
Alias for clean -f.
.IP unget
Alias for clean -f.
.SH GLOBAL OPTIONS
.IP -debug 10
Turn on debugging.
Used when debugging \fBrccs\fP itself.
.IP -verbose
Be more verbose about what is happening.
.SH EXAMPLES
To start off, add a bunch of files to RCS:
.DS
rccs create -y'my program name' myprog.c myprog.h
.DE
Now let's edit them all:
.DS
rccs get -e
.DE
If we didn't change anything, the following gives us a clean
directory:
.DS
rccs clean -e
.DE
If we changed myprog.h, the following gives us a clean directory
after checking in myprog.h:
.DS
rccs clean -e -d'some message'
.DE
If we want to see what we changed:
.DS
rccs diffs
.DE
.SH "SEE ALSO"
.BR "RCS commands" ,
.BR "SCCS commands" ,
.BR sdiff (1),
.BR perl (1).
.SH TODO
It would be nice to implement a \fB-i\fP option that prompted before
each action, especially the destructive ones.
lmbench-3.0-a9/doc/refdbms.keys 0ustar staelinstaelin
Chen93d Chen94a Fenwick95 Howard88 Jain91 McCalpin95 Ousterhout90 Park90 Smith82b Smith85 Wolman89 Wong88 Agarwal95 Bailey93 Bitton83 Chen91b Dietrich92 Leutenegger93 Nelson89 TPPC92
lmbench-3.0-a9/doc/references 0ustar staelinstaelin
By doing so, the benchmark automatically %x scales across current and future systems. The evaluation aids in %x understanding system performance by reporting how performance varies %x according to each of five workload parameters. Second, we propose %x predicted performance, a technique for using the results from the %x self-scaling evaluation to estimate quickly the performance for %x workloads that have not been measured. We show that this technique %x yields reasonably accurate performance estimates and argue that this %x method gives a far more accurate comparative performance evaluation %x than traditional single-point benchmarks. We apply our new %x evaluation technique by measuring a SPARCstation 1+ with one SCSI %x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running %x the Sprite LFS operating system with a three-disk disk array, a %x Convex C240 minisupercomputer with a four-disk disk array, and a %x Solbourne 5E/905 fileserver with a two-disk disk array. %s toc@hpl.hp.com (Mon Mar 13 10:57:38 1995) %s wilkes%hplajw@hpl.hp.com (Sun Mar 19 12:38:01 PST 1995) %s wilkes%cello@hpl.hp.com (Sun Mar 19 12:38:53 PST 1995) %z InProceedings %K Ousterhout90 %s wilkes%cello@hplabs.hp.com (Fri Jun 29 20:46:08 PDT 1990) %A John K. Ousterhout %T Why aren't operating systems getting faster as fast as hardware? %C Proceedings USENIX Summer Conference %c Anaheim, CA %D June 1990 %P 247-256 %x This paper evaluates several hardware pplatforms and operating systems using %x a set of benchmarks that stress kernel entry/exit, file systems, and %x other things related to operating systems. The overall conclusion is that %x operating system performance is not improving at the same rate as the base speed of the %x underlying hardware. The most obvious ways to remedy this situation %x are to improve memory bandwidth and reduce operating systems' %x tendency to wait for disk operations to complete. %o Typical performance of 10-20 MIPS cpus is only 0.4 times what %o their raw hardware performance would suggest. HP-UX is %o particularly bad on the HP 9000/835, at about 0.2x. (Although %o this measurement discounted a highly-tuned getpid call.) %k OS performance, RISC machines, HP9000 Series 835 system calls %z InProceedings %K McVoy91 %A L. W. McVoy %A S. R. Kleiman %T Extent-like Performance from a Unix File System %C Proceedings USENIX Winter Conference %c Dallas, TX %D January 1991 %P 33-43 %z Article %K Chen93d %A Peter M. Chen %A David Patterson %T Storage performance \- metrics and benchmarks %J Proceedings of the IEEE %V 81 %N 8 %D August 1993 %P 1151-1165 %x Discusses metrics and benchmarks used in storage performance evaluation. %x Describes, reviews, and runs popular I/O benchmarks on three systems. Also %x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling %x benchmark with predicted performance. %k I/O, storage, benchmark, workload, self-scaling benchmark, %k predicted performance, disk, performance evaluation %s staelin%cello@hpl.hp.com (Wed Sep 27 16:21:11 PDT 1995) %z Article %K Park90a %A Arvin Park %A J. C. Becker %T IOStone: a synthetic file system benchmark %J Computer Architecture News %V 18 %N 2 %D June 1990 %P 45-52 %o this benchmark is useless for all modern systems; it fits %o completely inside the file system buffer cache. Soon it may even %o fit inside the processor cache! %k IOStone, I/O, benchmarks %s staelin%cello@hpl.hp.com (Wed Sep 27 16:37:26 PDT 1995) %z Article %K Fenwick95 %A David M. Fenwick %A Denis J. Foley %A William B. 
Gist %A Stephen R. VanDoren %A Danial Wissell %T The AlphaServer 8000 series: high-end server platform development %J Digital Technical Journal %V 7 %N 1 %D August 1995 %P 43-65 %x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end %x server products. Both servers are based on the 300Mhz Alpha 21164 %x microprocessor and on the AlphaServer 8000-series platform architecture. %x The AlphaServer 8000 platform development team set aggressive system data %x bandwidth and memory read latency targets in order to achieve high-performance %x goals. The low-latency criterion was factored into design decisions made at %x each of the seven layers of platform development. The combination of %x industry-leading microprocessor technology and a system platform focused %x on low latency has resulted in a 12-processor server implementation --- %x the AlphaServer 8400 --- capable of supercomputer levels of performance. %k DEC Alpha server, performance, memory latency %s staelin%cello@hpl.hp.com (Wed Sep 27 17:27:23 PDT 1995) %z Book %K Toshiba94 %A Toshiba %T DRAM Components and Modules %I Toshiba America Electronic Components, Inc. %P A59-A77,C37-C42 %D 1994 %z Article %K McCalpin95 %A John D. McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %V to appear %D December 1995 %z Article %K FSF89 %A Richard Stallman %Q Free Software Foundation %T General Public License %D 1989 %O Included with \*[lmbench] ������lmbench-3.0-a9/doc/references-����������������������������������������������������������������������0000664�0000764�0000764�00000014450�07045412511�016056� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K Wolman89 %A Barry L. Wolman %A Thomas M. Olson %T IOBENCH: a system independent IO benchmark %J Computer Architecture News %V 17 %N 5 %D September 1989 %P 55-70 %x IOBENCH is an operating system and processor independent synthetic %x input/output (IO) benchmark designed to put a configurable IO and %x processor (CP) load on the system under test. This paper discusses %x the UNIX versions. %k IOBENCH, synthetic I/O benchmark, UNIX workload %s vinton%cello@hplabs.hp.com (Fri Sep 20 12:55:58 PDT 1991) %z Book %K Hennessy96 %A John L. Hennessy %A David A. Patterson %T Computer Architecture A Quantitative Approach, 2nd Edition %I Morgan Kaufman %D 1996 %z Article %K Chen94a %A P. M. Chen %A D. A. Patterson %T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance %D November 1994 %J Transactions on Computer Systems %V 12 %N 4 %P 308-339 %x Current I/O benchmarks suffer from several chronic problems: they %x quickly become obsolete; they do not stress the I/O system; and they %x do not help much in undelsi;anding I/O system performance. We %x propose a new approach to I/O performance analysis. First, we %x propose a self-scaling benchmark that dynamically adjusts aspects of %x its workload according to the performance characteristic of the %x system being measured. By doing so, the benchmark automatically %x scales across current and future systems. 
The evaluation aids in %x understanding system performance by reporting how performance varies %x according to each of five workload parameters. Second, we propose %x predicted performance, a technique for using the results from the %x self-scaling evaluation to estimate quickly the performance for %x workloads that have not been measured. We show that this technique %x yields reasonably accurate performance estimates and argue that this %x method gives a far more accurate comparative performance evaluation %x than traditional single-point benchmarks. We apply our new %x evaluation technique by measuring a SPARCstation 1+ with one SCSI %x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running %x the Sprite LFS operating system with a three-disk disk array, a %x Convex C240 minisupercomputer with a four-disk disk array, and a %x Solbourne 5E/905 fileserver with a two-disk disk array. %s toc@hpl.hp.com (Mon Mar 13 10:57:38 1995) %s wilkes%hplajw@hpl.hp.com (Sun Mar 19 12:38:01 PST 1995) %s wilkes%cello@hpl.hp.com (Sun Mar 19 12:38:53 PST 1995) %z InProceedings %K Ousterhout90 %s wilkes%cello@hplabs.hp.com (Fri Jun 29 20:46:08 PDT 1990) %A John K. Ousterhout %T Why aren't operating systems getting faster as fast as hardware? %C Proceedings USENIX Summer Conference %c Anaheim, CA %D June 1990 %P 247-256 %x This paper evaluates several hardware pplatforms and operating systems using %x a set of benchmarks that stress kernel entry/exit, file systems, and %x other things related to operating systems. The overall conclusion is that %x operating system performance is not improving at the same rate as the base speed of the %x underlying hardware. The most obvious ways to remedy this situation %x are to improve memory bandwidth and reduce operating systems' %x tendency to wait for disk operations to complete. %o Typical performance of 10-20 MIPS cpus is only 0.4 times what %o their raw hardware performance would suggest. HP-UX is %o particularly bad on the HP 9000/835, at about 0.2x. (Although %o this measurement discounted a highly-tuned getpid call.) %k OS performance, RISC machines, HP9000 Series 835 system calls %z InProceedings %K McVoy91 %A L. W. McVoy %A S. R. Kleiman %T Extent-like Performance from a Unix File System %C Proceedings USENIX Winter Conference %c Dallas, TX %D January 1991 %P 33-43 %z Article %K Chen93d %A Peter M. Chen %A David Patterson %T Storage performance \- metrics and benchmarks %J Proceedings of the IEEE %V 81 %N 8 %D August 1993 %P 1151-1165 %x Discusses metrics and benchmarks used in storage performance evaluation. %x Describes, reviews, and runs popular I/O benchmarks on three systems. Also %x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling %x benchmark with predicted performance. %k I/O, storage, benchmark, workload, self-scaling benchmark, %k predicted performance, disk, performance evaluation %s staelin%cello@hpl.hp.com (Wed Sep 27 16:21:11 PDT 1995) %z Article %K Park90a %A Arvin Park %A J. C. Becker %T IOStone: a synthetic file system benchmark %J Computer Architecture News %V 18 %N 2 %D June 1990 %P 45-52 %o this benchmark is useless for all modern systems; it fits %o completely inside the file system buffer cache. Soon it may even %o fit inside the processor cache! %k IOStone, I/O, benchmarks %s staelin%cello@hpl.hp.com (Wed Sep 27 16:37:26 PDT 1995) %z Article %K Fenwick95 %A David M. Fenwick %A Denis J. Foley %A William B. Gist %A Stephen R. 
VanDoren %A Danial Wissell %T The AlphaServer 8000 series: high-end server platform development %J Digital Technical Journal %V 7 %N 1 %D August 1995 %P 43-65 %x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end %x server products. Both servers are based on the 300Mhz Alpha 21164 %x microprocessor and on the AlphaServer 8000-series platform architecture. %x The AlphaServer 8000 platform development team set aggressive system data %x bandwidth and memory read latency targets in order to achieve high-performance %x goals. The low-latency criterion was factored into design decisions made at %x each of the seven layers of platform development. The combination of %x industry-leading microprocessor technology and a system platform focused %x on low latency has resulted in a 12-processor server implementation --- %x the AlphaServer 8400 --- capable of supercomputer levels of performance. %k DEC Alpha server, performance, memory latency %s staelin%cello@hpl.hp.com (Wed Sep 27 17:27:23 PDT 1995) %z Book %K Toshiba94 %A Toshiba %T DRAM Components and Modules %I Toshiba America Electronic Components, Inc. %P A59-A77,C37-C42 %D 1994 %z Article %K McCalpin95 %A John D. McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %V to appear %D December 1995 %z Article %K FSF89 %A Richard Stallman %Q Free Software Foundation %T General Public License %D 1989 %O Included with \*[lmbench] ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/references-lmbench3��������������������������������������������������������������0000664�0000764�0000764�00000027671�07633046421�017511� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K Staelin98 %A Carl Staelin %A Larry McVoy %T mhz: Anatomy of a microbenchmark %B Proceedings USENIX Annual Technical Conference %C New Orleans, LA %D June 1998 %P 155-166 %z Article %K McVoy96 %A Larry McVoy %A Carl Staelin %T lmbench: Portable tools for performance analysis %B Proceedings USENIX Winter Conference %C San Diego, CA %D January 1996 %P 279-284 %K Bray90 %A Tim Bray %T Bonnie benchmark %D 1990 %o http://www.textuality.com/bonnie/ %z Article %K Brown97 %A Aaron Brown %A Margo Seltzer %T Operating system benchmarking in the wake of lmbench: a case study of the performance of NetBSD on the Intel x86 architecture %B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems %C Seattle, WA %D June 1997 %P 214-224 %o http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html %z Article %A Cristina Hristea %A Danial Lenoski %A John Keen %T Measuring memory hierarchy performance of cache-coherent multiprocessors using microbenchmarks %B Proceedings of Supercomputing '97 %D November 1997 %C San Jose, CA %o http://www.supercomp.org/sc97/proceedings/TECH/HRISTEA/ %z Thesis %K Prestor01 %A Uros Prestor %T Evaluating the memory performance of a ccNUMA system %I Department of Computer Science, University of Utah %D May 2001 %z Thesis 
%K Saavedra92 %A Rafael H. Saavedra-Barrera %T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking %I Department of Computer Science, University of California at Berkeley %D 1992 %z Article %K Saavedra95 %A R.H. Saavedra %A A.J. Smith %T Measuring cache and TLB performance and their effect on benchmark runtimes %J IEEE Transactions on Computers %V 44 %N 10 %D October 1995 %P 1223-1235 %z Article %K Wolman89 %A Barry L. Wolman %A Thomas M. Olson %T IOBENCH: a system independent IO benchmark %J Computer Architecture News %V 17 %N 5 %D September 1989 %P 55-70 %x IOBENCH is an operating system and processor independent synthetic %x input/output (IO) benchmark designed to put a configurable IO and %x processor (CP) load on the system under test. This paper discusses %x the UNIX versions. %k IOBENCH, synthetic I/O benchmark, UNIX workload %s vinton%cello@hplabs.hp.com (Fri Sep 20 12:55:58 PDT 1991) %z Book %K Hennessy96 %A John L. Hennessy %A David A. Patterson %T Computer Architecture A Quantitative Approach, 2nd Edition %I Morgan Kaufman %D 1996 %z Book %K Jain91 %A Raj Jain %T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling %I Wiley-Interscience %C New York, NY %D April 1991 %z Article %K Chen94a %A P. M. Chen %A D. A. Patterson %T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance %D November 1994 %J Transactions on Computer Systems %V 12 %N 4 %P 308-339 %x Current I/O benchmarks suffer from several chronic problems: they %x quickly become obsolete; they do not stress the I/O system; and they %x do not help much in undelsi;anding I/O system performance. We %x propose a new approach to I/O performance analysis. First, we %x propose a self-scaling benchmark that dynamically adjusts aspects of %x its workload according to the performance characteristic of the %x system being measured. By doing so, the benchmark automatically %x scales across current and future systems. The evaluation aids in %x understanding system performance by reporting how performance varies %x according to each of five workload parameters. Second, we propose %x predicted performance, a technique for using the results from the %x self-scaling evaluation to estimate quickly the performance for %x workloads that have not been measured. We show that this technique %x yields reasonably accurate performance estimates and argue that this %x method gives a far more accurate comparative performance evaluation %x than traditional single-point benchmarks. We apply our new %x evaluation technique by measuring a SPARCstation 1+ with one SCSI %x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running %x the Sprite LFS operating system with a three-disk disk array, a %x Convex C240 minisupercomputer with a four-disk disk array, and a %x Solbourne 5E/905 fileserver with a two-disk disk array. %s toc@hpl.hp.com (Mon Mar 13 10:57:38 1995) %s wilkes%hplajw@hpl.hp.com (Sun Mar 19 12:38:01 PST 1995) %s wilkes%cello@hpl.hp.com (Sun Mar 19 12:38:53 PST 1995) %z InProceedings %K Ousterhout90 %s wilkes%cello@hplabs.hp.com (Fri Jun 29 20:46:08 PDT 1990) %A John K. Ousterhout %T Why aren't operating systems getting faster as fast as hardware? 
%B Proceedings USENIX Summer Conference %C Anaheim, CA %D June 1990 %P 247-256 %x This paper evaluates several hardware pplatforms and operating systems using %x a set of benchmarks that stress kernel entry/exit, file systems, and %x other things related to operating systems. The overall conclusion is that %x operating system performance is not improving at the same rate as the base speed of the %x underlying hardware. The most obvious ways to remedy this situation %x are to improve memory bandwidth and reduce operating systems' %x tendency to wait for disk operations to complete. %o Typical performance of 10-20 MIPS cpus is only 0.4 times what %o their raw hardware performance would suggest. HP-UX is %o particularly bad on the HP 9000/835, at about 0.2x. (Although %o this measurement discounted a highly-tuned getpid call.) %k OS performance, RISC machines, HP9000 Series 835 system calls %z InProceedings %K McVoy91 %A L. W. McVoy %A S. R. Kleiman %T Extent-like Performance from a Unix File System %B Proceedings USENIX Winter Conference %C Dallas, TX %D January 1991 %P 33-43 %z Article %K Chen93d %A Peter M. Chen %A David Patterson %T Storage performance \- metrics and benchmarks %J Proceedings of the IEEE %V 81 %N 8 %D August 1993 %P 1151-1165 %x Discusses metrics and benchmarks used in storage performance evaluation. %x Describes, reviews, and runs popular I/O benchmarks on three systems. Also %x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling %x benchmark with predicted performance. %k I/O, storage, benchmark, workload, self-scaling benchmark, %k predicted performance, disk, performance evaluation %s staelin%cello@hpl.hp.com (Wed Sep 27 16:21:11 PDT 1995) %z Article %K Park90a %A Arvin Park %A J. C. Becker %T IOStone: a synthetic file system benchmark %J Computer Architecture News %V 18 %N 2 %D June 1990 %P 45-52 %o this benchmark is useless for all modern systems; it fits %o completely inside the file system buffer cache. Soon it may even %o fit inside the processor cache! %k IOStone, I/O, benchmarks %s staelin%cello@hpl.hp.com (Wed Sep 27 16:37:26 PDT 1995) %z Article %K Fenwick95 %A David M. Fenwick %A Denis J. Foley %A William B. Gist %A Stephen R. VanDoren %A Danial Wissell %T The AlphaServer 8000 series: high-end server platform development %J Digital Technical Journal %V 7 %N 1 %D August 1995 %P 43-65 %x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end %x server products. Both servers are based on the 300Mhz Alpha 21164 %x microprocessor and on the AlphaServer 8000-series platform architecture. %x The AlphaServer 8000 platform development team set aggressive system data %x bandwidth and memory read latency targets in order to achieve high-performance %x goals. The low-latency criterion was factored into design decisions made at %x each of the seven layers of platform development. The combination of %x industry-leading microprocessor technology and a system platform focused %x on low latency has resulted in a 12-processor server implementation --- %x the AlphaServer 8400 --- capable of supercomputer levels of performance. %k DEC Alpha server, performance, memory latency %s staelin%cello@hpl.hp.com (Wed Sep 27 17:27:23 PDT 1995) %z Book %K Toshiba94 %Q Toshiba %T DRAM Components and Modules %I Toshiba America Electronic Components, Inc. %P A59-A77,C37-C42 %D 1994 %z Article %K McCalpin95 %A John D. 
McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %D December 1995 %z Article %K McCalpin02 %A John D. McCalpin %T The STREAM2 home page %o http://www.cs.virginia.edu/stream/stream2/ %D 2002 %z Article %K FSF89 %A Richard Stallman %Q Free Software Foundation %T General Public License %D 1989 %O Included with \*[lmbench] %z Article %K Shein89 %A Barry Shein %A Mike Callahan %A Paul Woodbury %T NFSSTONE: A network file server performance benchmark %B Proceedings USENIX Summer Conference %C Baltimore, MD %D June 1989 %P 269-275 %z Article %K Weicker84 %A R.P. Weicker %T Dhrystone: A synthetic systems programming benchmark %J Communications of the ACM %V 27 %N 10 %P 1013--1030 %D 1984 %z Article %K Howard88 %A J. Howard %A M. Kazar %A S. Menees %A S. Nichols %A M. Satyanrayanan %A R. Sidebotham %A M. West %T Scale and performance in a distributed system %J ACM Transactions on Computer Systems %V 6 %N 1 %D February 1988 %P 51--81 %k Andrew benchmark %z Article %K Banga97 %A Guarav Banga %A Peter Druschel %T Measuring the capacity of a web server %B Proceedings USENIX Symposium on Internet Technologies and Systems %C Monterey, CA %D December 1997 %P 61--71 %z Article %K Banga98 %A Guarav Banga %A Jeffrey C. Mogul %T Scalable kernel performance for internet servers under realistic loads %B Proceedings of the 1998 USENIX Annual Technical Conference %C New Orleans, LA %D June 1998 %P 69--83 %z Article %K Mogul99 %A Jeffrey C. Mogul %T Brittle metrics in operating systems research %B Proceedings 7th IEEE Workshop on Hot Topics in Operating Systems (HotOS-VII) %C Rio Rico, AZ %P 90--95 %D March 1999 %z Article %K Regehr2002 %A John Regehr %T Inferring scheduling behavior with Hourglass %B Proceedings of the USENIX Annual Technical Conference FREENIX track %C Monterey, CA %D June 2002 %P 143--156 %z Article %K Seltzer99 %A Margo Seltzer %A David Krinsky %A Keith Smith %A Xiolan Zhang %T The case for application-specific benchmarking %B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems %C Rico, AZ %D 1999 %P 102--107 %z Article %K Smith97 %A Keith A. Smith %A Margo L. Seltzer %T File system aging --- Increasing the relevance of file system benchmarks %B Proceedings of the 1997 SIGMETRICS Conference %D June 1997 %C Seattle, WA %P 203-213 %z Article %K Tullsen96 %A Dean Tullsen %A Susan Eggers %A Joel Emer %A Henry Levy %A Jack Lo %A Rebecca Stamm %T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor %C Proceedings of the 23rd Annual International Symposium on Computer Architecture %D May 1996 %P 191-202 %O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps %z Article %K Tullsen99 %A Dean Tullsen %A Jack Lo %A Susan Eggers %A Henry Levy %T Supporting fine-grain synchronization on a simultaneous multithreaded processor %B Proceedings of the 5th International Symposium on High Performance Computer Architecture %D January 1999 %P 54-58 %O http://www.cs.washington.edu/research/smt/papers/hpca.ps %z Report %K Whaley97 %A R. 
Clint Whaley %A Jack Dongarra %T Automatically tuned linear algebra software %I Department of Computer Science, University of Tennessee %C Knoxville, TN %R UT-CS-97-366 %D 1997 %o http://math-atlas.sourceforge.net/ %z Article %K SPEChpc96 %Q Standard Performance Evaluation Corporation %T SPEC HPC96 benchmark %D 1996 %O http://www.specbench.org/hpg/hpc96/ %z Article %K Parkbench %Q PARallel Kernels and BENCHmarks committee %T PARKBENCH %D 2002 %O http://www.netlib.org/parkbench/ %z Article %K NAS %Q NASA Advanced Supercomputing Division, NASA Ames Research Center %T NAS parallel benchmarks %O http://www.nas.nasa.gov/NAS/NPB %z Article %K Glendinning94 %A Ian Glendinning %T GENESIS distributed memory benchmark suite %O http://wotug.ukc.ac.uk/parallel/performance/benchmarks/genesis %D 1994 %z Article %K Intel99 %Q Intel %T Profusion --- An 8-way symmetric multiprocessing chipset %O http://netserver.hp.com/docs/download.asp?file=tp_profusion(r).pdf %D July 1999 �����������������������������������������������������������������������lmbench-3.0-a9/doc/references-memhier���������������������������������������������������������������0000664�0000764�0000764�00000017470�07564162402�017441� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K Staelin02b %A Carl Staelin %T lmbench3: Measuring scalability %D November 2002 %I Hewlett-Packard Laboratories %C Palo Alto, CA %z Article %K Staelin02c %A Carl Staelin %T Utilizing intra-processor parallelism %D December 2002 %I Hewlett-Packard Laboratories %C Palo Alto, CA %z Article %K Whaley98 %A R. Clint Whaley %A Jack Dongarra %T Automatically tuned linear algebra software %C Proceedings of the 1998 ACM/IEEE SC98 Conference %D 1998 %O http://sourceforge.net/projects/math-atlas %z Article %K Staelin98 %A Carl Staelin %A Larry McVoy %T mhz: Anatomy of a microbenchmark %C Proceedings USENIX Annual Technical Conference %c New Orleans, LA %D June 1998 %P 155-166 %z Article %K McVoy96 %A Larry McVoy %A Carl Staelin %T lmbench: Portable tools for performance analysis %C Proceedings USENIX Winter Conference %c San Diego, CA %D January 1996 %P 279-284 %a Thesis %K Prestor01 %A Uros Prestor %T Evaluating the memory performance of a ccNUMA system %R Masters Thesis %I School of Computing, University of Utah %c Salt Lake City, Utah %D May 2001 %O http://www.cs.utah.edu/~uros/thesis/thesis.pdf %z Article %K Saavedra95 %A R.H. Saavedra %A A.J. Smith %T Measuring cache and TLB performance and their effect on benchmark runtimes %J IEEE Transactions on Computers %V 44 %N 10 %D October 1995 %P 1223-1235 %z Article %K Wolman89 %A Barry L. Wolman %A Thomas M. Olson %T IOBENCH: a system independent IO benchmark %J Computer Architecture News %V 17 %N 5 %D September 1989 %P 55-70 %x IOBENCH is an operating system and processor independent synthetic %x input/output (IO) benchmark designed to put a configurable IO and %x processor (CP) load on the system under test. This paper discusses %x the UNIX versions. %k IOBENCH, synthetic I/O benchmark, UNIX workload %s vinton%cello@hplabs.hp.com (Fri Sep 20 12:55:58 PDT 1991) %z Book %K Hennessy96 %A John L. Hennessy %A David A. 
Patterson %T Computer Architecture A Quantitative Approach, 2nd Edition %I Morgan Kaufman %D 1996 %z Article %K Chen94a %A P. M. Chen %A D. A. Patterson %T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance %D November 1994 %J Transactions on Computer Systems %V 12 %N 4 %P 308-339 %x Current I/O benchmarks suffer from several chronic problems: they %x quickly become obsolete; they do not stress the I/O system; and they %x do not help much in undelsi;anding I/O system performance. We %x propose a new approach to I/O performance analysis. First, we %x propose a self-scaling benchmark that dynamically adjusts aspects of %x its workload according to the performance characteristic of the %x system being measured. By doing so, the benchmark automatically %x scales across current and future systems. The evaluation aids in %x understanding system performance by reporting how performance varies %x according to each of five workload parameters. Second, we propose %x predicted performance, a technique for using the results from the %x self-scaling evaluation to estimate quickly the performance for %x workloads that have not been measured. We show that this technique %x yields reasonably accurate performance estimates and argue that this %x method gives a far more accurate comparative performance evaluation %x than traditional single-point benchmarks. We apply our new %x evaluation technique by measuring a SPARCstation 1+ with one SCSI %x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running %x the Sprite LFS operating system with a three-disk disk array, a %x Convex C240 minisupercomputer with a four-disk disk array, and a %x Solbourne 5E/905 fileserver with a two-disk disk array. %s toc@hpl.hp.com (Mon Mar 13 10:57:38 1995) %s wilkes%hplajw@hpl.hp.com (Sun Mar 19 12:38:01 PST 1995) %s wilkes%cello@hpl.hp.com (Sun Mar 19 12:38:53 PST 1995) %z InProceedings %K Ousterhout90 %s wilkes%cello@hplabs.hp.com (Fri Jun 29 20:46:08 PDT 1990) %A John K. Ousterhout %T Why aren't operating systems getting faster as fast as hardware? %C Proceedings USENIX Summer Conference %c Anaheim, CA %D June 1990 %P 247-256 %x This paper evaluates several hardware pplatforms and operating systems using %x a set of benchmarks that stress kernel entry/exit, file systems, and %x other things related to operating systems. The overall conclusion is that %x operating system performance is not improving at the same rate as the base speed of the %x underlying hardware. The most obvious ways to remedy this situation %x are to improve memory bandwidth and reduce operating systems' %x tendency to wait for disk operations to complete. %o Typical performance of 10-20 MIPS cpus is only 0.4 times what %o their raw hardware performance would suggest. HP-UX is %o particularly bad on the HP 9000/835, at about 0.2x. (Although %o this measurement discounted a highly-tuned getpid call.) %k OS performance, RISC machines, HP9000 Series 835 system calls %z InProceedings %K McVoy91 %A L. W. McVoy %A S. R. Kleiman %T Extent-like Performance from a Unix File System %C Proceedings USENIX Winter Conference %c Dallas, TX %D January 1991 %P 33-43 %z Article %K Chen93d %A Peter M. Chen %A David Patterson %T Storage performance \- metrics and benchmarks %J Proceedings of the IEEE %V 81 %N 8 %D August 1993 %P 1151-1165 %x Discusses metrics and benchmarks used in storage performance evaluation. %x Describes, reviews, and runs popular I/O benchmarks on three systems. 
Also %x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling %x benchmark with predicted performance. %k I/O, storage, benchmark, workload, self-scaling benchmark, %k predicted performance, disk, performance evaluation %s staelin%cello@hpl.hp.com (Wed Sep 27 16:21:11 PDT 1995) %z Article %K Park90a %A Arvin Park %A J. C. Becker %T IOStone: a synthetic file system benchmark %J Computer Architecture News %V 18 %N 2 %D June 1990 %P 45-52 %o this benchmark is useless for all modern systems; it fits %o completely inside the file system buffer cache. Soon it may even %o fit inside the processor cache! %k IOStone, I/O, benchmarks %s staelin%cello@hpl.hp.com (Wed Sep 27 16:37:26 PDT 1995) %z Article %K Fenwick95 %A David M. Fenwick %A Denis J. Foley %A William B. Gist %A Stephen R. VanDoren %A Danial Wissell %T The AlphaServer 8000 series: high-end server platform development %J Digital Technical Journal %V 7 %N 1 %D August 1995 %P 43-65 %x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end %x server products. Both servers are based on the 300Mhz Alpha 21164 %x microprocessor and on the AlphaServer 8000-series platform architecture. %x The AlphaServer 8000 platform development team set aggressive system data %x bandwidth and memory read latency targets in order to achieve high-performance %x goals. The low-latency criterion was factored into design decisions made at %x each of the seven layers of platform development. The combination of %x industry-leading microprocessor technology and a system platform focused %x on low latency has resulted in a 12-processor server implementation --- %x the AlphaServer 8400 --- capable of supercomputer levels of performance. %k DEC Alpha server, performance, memory latency %s staelin%cello@hpl.hp.com (Wed Sep 27 17:27:23 PDT 1995) %z Book %K Toshiba94 %A Toshiba %T DRAM Components and Modules %I Toshiba America Electronic Components, Inc. %P A59-A77,C37-C42 %D 1994 %z Article %K McCalpin95 %A John D. 
McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %D December 1995 %z Article %K FSF89 %A Richard Stallman %Q Free Software Foundation %T General Public License %D 1989 %O Included with \*[lmbench] %z Article %K Min01 %A Rui Min %A Yiming Hu %T Improving performance of large physically indexed caches by decoupling memory addresses from cache addresses %J IEEE Transactions on Computers %V 50 %N 11 %D November 2001 %P 1191-1201 ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/references-parallel��������������������������������������������������������������0000664�0000764�0000764�00000007022�07414075675�017610� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K Tullsen96 %A Dean Tullsen %A Susan Eggers %A Joel Emer %A Henry Levy %A Jack Lo %A Rebecca Stamm %T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor %C Proceedings of the 23rd Annual International Symposium on Computer Architecture %D May 1996 %P 191-202 %O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps %z Article %K Tullsen99 %A Dean Tullsen %A Jack Lo %A Susan Eggers %A Henry Levy %T Suppoerting fine-grain synchronization on a simultaneous multithreaded processor %C Proceedings of the 5th International Symposium on High Performance Computer Architecture %D January 1999 %P 54-58 %O http://www.cs.washington.edu/research/smt/papers/hpca.ps %z Article %K Kumar97 %A A. Kumar %T The HP PA-8000 RISC CPU %J IEEE Micro %V 17 %N 2 %D March-April 1997 %P 27-32 %z Article %K Schlansker00 %A M.S. Schlansker %A B.R. Rau %T EPIC: Explicitly parallel instruction computing %J IEEE Computer %V 33 %N 2 %D Feb. 2000 %P 37-45 %z Article %K Smith95 %A James E. Smith %A Gurindar S. Sohi %T The microarchitecture of superscalar processors %J Proceedings of the IEEE %V 83 %D October 1995 %P 1609-1624 %z Thesis %K Munoz97 %A Raul E. Silvera Munoz %T Static instruction scheduling for dynamic issue processors %I ACAPS Laboratory, School of Computer Science, McGill University %D 1997 %z Article %K Agarwal96 %A Ramesh K. Agarwal %T A super scalar sort algorithm for RISC processors %C Processings 1996 ACM SIGMOD International Conference on Management of Data %D 1996 %P 240-246 %O http://citeseer.nj.nec.com/agarwal96super.html %z Article %K Staelin01a %A Carl Staelin %T Analyzing the memory hierarchy %D October 2001 %I Hewlett-Packard Laboratories %C Palo Alto, CA %z Article %K Staelin01b %A Carl Staelin %T lmbench3: Measuring scalability %D October 2001 %I Hewlett-Packard Laboratories %C Palo Alto, CA %z Article %K Frigo98 %A M. Frigo %A S.G. Johnson %T FFTW: An adaptive software architecture for the FFT %C Proceedings 1998 ICASSP %V 3 %P 1381-1384 %O http://www.fftw.org/fftw-paper-icassp.pdf %z Article %K Whaley98 %A R. 
Clint Whaley %A Jack Dongarra %T Automatically tuned linear algebra software %C Proceedings of the 1998 ACM/IEEE SC98 Conference %D 1998 %O http://sourceforge.net/projects/math-atlas %z Article %K Staelin98 %A Carl Staelin %A Larry McVoy %T mhz: Anatomy of a microbenchmark %C Proceedings USENIX Annual Technical Conference %c New Orleans, LA %D June 1998 %P 155-166 %z Article %K McVoy96 %A Larry McVoy %A Carl Staelin %T lmbench: Portable tools for performance analysis %C Proceedings USENIX Winter Conference %c San Diego, CA %D January 1996 %P 279-284 %z Thesis %K Prestor01 %A Uros Prestor %T Evaluating the memory performance of a ccNUMA system %R Masters Thesis %I School of Computing, University of Utah %C Salt Lake City, Utah %D May 2001 %O http://www.cs.utah.edu/~uros/thesis/thesis.pdf %z Article %K Saavedra95 %A R.H. Saavedra %A A.J. Smith %T Measuring cache and TLB performance and their effect on benchmark runtimes %J IEEE Transactions on Computers %V 44 %N 10 %D October 1995 %P 1223-1235 %z Book %K Knuth73 %A Donald E. Knuth %T The Art of computer programming, 2nd Edition %I Addison-Wesley %D 1973 %z Book %K Hennessy96 %A John L. Hennessy %A David A. Patterson %T Computer Architecture A Quantitative Approach, 2nd Edition %I Morgan Kaufman %D 1996 %z Article %K McCalpin95 %A John D. McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %D December 1995 ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/references-userguide�������������������������������������������������������������0000664�0000764�0000764�00000023570�07564162402�020005� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K Banga97 %A Guarav Banga %A Peter Druschel %T Measuring the capacity of a web server %B Proceedings USENIX Symposium on Internet Technologies and Systems %C Monterey, CA %D December 1997 %z Article %K Banga98 %A Guarav Banga %A Jeffrey C. Mogul %T Scalable kernel performance for internet servers under realistic loads %B Proceedings of the 1998 USENIX Annual Technical Conference %C New Orleans, LA %D June 1998 %K Bray90 %A Tim Bray %T Bonnie benchmark %D 1990 %O http://www.textuality.com/bonnie/ %z Article %K Brown97 %A Aaron Brown %A Margo Seltzer %T Operating system benchmarking in the wake of lmbench: A case study of the performance of NetBSD on the Intel x86 architecture %B Proceedings of the 1997 ACM SIGMETRICS Conference on Measurement and Modeling of Computer Systems %C Seattle, WA %D June 1997 %P 214-224 %O http://www.eecs.harvard.edu/~vino/perf/hbench/sigmetrics/hbench.html %z Article %K Chen93d %A Peter M. 
Chen %A David Patterson %T Storage performance \- metrics and benchmarks %J Proceedings of the IEEE %V 81 %N 8 %D August 1993 %P 1151-1165 %x Discusses metrics and benchmarks used in storage performance evaluation. %x Describes, reviews, and runs popular I/O benchmarks on three systems. Also %x describes two new approaches to storage benchmarks: LADDIS and a Self-Scaling %x benchmark with predicted performance. %k I/O, storage, benchmark, workload, self-scaling benchmark, %k predicted performance, disk, performance evaluation %s staelin%cello@hpl.hp.com (Wed Sep 27 16:21:11 PDT 1995) %z Article %K Chen94a %A P. M. Chen %A D. A. Patterson %T A new approach to I/O performance evaluation \- self-scaling I/O benchmarks, predicted I/O performance %D November 1994 %J Transactions on Computer Systems %V 12 %N 4 %P 308-339 %x Current I/O benchmarks suffer from several chronic problems: they %x quickly become obsolete; they do not stress the I/O system; and they %x do not help much in undelsi;anding I/O system performance. We %x propose a new approach to I/O performance analysis. First, we %x propose a self-scaling benchmark that dynamically adjusts aspects of %x its workload according to the performance characteristic of the %x system being measured. By doing so, the benchmark automatically %x scales across current and future systems. The evaluation aids in %x understanding system performance by reporting how performance varies %x according to each of five workload parameters. Second, we propose %x predicted performance, a technique for using the results from the %x self-scaling evaluation to estimate quickly the performance for %x workloads that have not been measured. We show that this technique %x yields reasonably accurate performance estimates and argue that this %x method gives a far more accurate comparative performance evaluation %x than traditional single-point benchmarks. We apply our new %x evaluation technique by measuring a SPARCstation 1+ with one SCSI %x disk, an HP 730 with one SCSI-II disk, a DECstation 5000/200 running %x the Sprite LFS operating system with a three-disk disk array, a %x Convex C240 minisupercomputer with a four-disk disk array, and a %x Solbourne 5E/905 fileserver with a two-disk disk array. %s toc@hpl.hp.com (Mon Mar 13 10:57:38 1995) %s wilkes%hplajw@hpl.hp.com (Sun Mar 19 12:38:01 PST 1995) %s wilkes%cello@hpl.hp.com (Sun Mar 19 12:38:53 PST 1995) %z Article %K Fenwick95 %A David M. Fenwick %A Denis J. Foley %A William B. Gist %A Stephen R. VanDoren %A Danial Wissell %T The AlphaServer 8000 series: high-end server platform development %J Digital Technical Journal %V 7 %N 1 %D August 1995 %P 43-65 %x The AlphaServer 8400 and the AlphaServer 8200 are Digital's newest high-end %x server products. Both servers are based on the 300Mhz Alpha 21164 %x microprocessor and on the AlphaServer 8000-series platform architecture. %x The AlphaServer 8000 platform development team set aggressive system data %x bandwidth and memory read latency targets in order to achieve high-performance %x goals. The low-latency criterion was factored into design decisions made at %x each of the seven layers of platform development. The combination of %x industry-leading microprocessor technology and a system platform focused %x on low latency has resulted in a 12-processor server implementation --- %x the AlphaServer 8400 --- capable of supercomputer levels of performance. 
%k DEC Alpha server, performance, memory latency %s staelin%cello@hpl.hp.com (Wed Sep 27 17:27:23 PDT 1995) %z Book %K Hennessy96 %A John L. Hennessy %A David A. Patterson %T Computer Architecture A Quantitative Approach, 2nd Edition %I Morgan Kaufman %D 1996 %z Article %K Howard88 %A J. Howard %A M. Kazar %A S. Menees %A S. Nichols %A M. Satyanrayanan %A R. Sidebotham %A M. West %T Scale and performance in a distributed system %J ACM Transactions on Computer Systems %V 6 %N 1 %D February 1988 %P 51-81 %k Andrew benchmark %z Book %K Jain91 %A Raj Jain %T The Art of Computer Systems Performance Analysis: Techniques for Experimental Design, Measurement, Simulation, and Modeling %I Wiley-Interscience %C New York, NY %D April 1991 %z Article %K McCalpin95 %A John D. McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %D December 1995 %z InProceedings %K McVoy91 %A L. W. McVoy %A S. R. Kleiman %T Extent-like Performance from a Unix File System %B Proceedings USENIX Winter Conference %C Dallas, TX %D January 1991 %P 33-43 %z Article %K McVoy96 %A Larry McVoy %A Carl Staelin %T lmbench: Portable tools for performance analysis %B Proceedings USENIX Winter Conference %C San Diego, CA %D January 1996 %P 279-284 %z InProceedings %K Ousterhout90 %s wilkes%cello@hplabs.hp.com (Fri Jun 29 20:46:08 PDT 1990) %A John K. Ousterhout %T Why aren't operating systems getting faster as fast as hardware? %B Proceedings USENIX Summer Conference %C Anaheim, CA %D June 1990 %P 247-256 %x This paper evaluates several hardware pplatforms and operating systems using %x a set of benchmarks that stress kernel entry/exit, file systems, and %x other things related to operating systems. The overall conclusion is that %x operating system performance is not improving at the same rate as the base speed of the %x underlying hardware. The most obvious ways to remedy this situation %x are to improve memory bandwidth and reduce operating systems' %x tendency to wait for disk operations to complete. %o Typical performance of 10-20 MIPS cpus is only 0.4 times what %o their raw hardware performance would suggest. HP-UX is %o particularly bad on the HP 9000/835, at about 0.2x. (Although %o this measurement discounted a highly-tuned getpid call.) %k OS performance, RISC machines, HP9000 Series 835 system calls %z Article %K Park90a %A Arvin Park %A J. C. Becker %T IOStone: a synthetic file system benchmark %J Computer Architecture News %V 18 %N 2 %D June 1990 %P 45-52 %o this benchmark is useless for all modern systems; it fits %o completely inside the file system buffer cache. Soon it may even %o fit inside the processor cache! %k IOStone, I/O, benchmarks %s staelin%cello@hpl.hp.com (Wed Sep 27 16:37:26 PDT 1995) %z Thesis %K Prestor01 %A Uros Prestor %T Evaluating the memory performance of a ccNUMA system %I Department of Computer Science, University of Utah %D May 2001 %z Thesis %K Saavedra92 %A Rafael H. Saavedra-Barrera %T CPU Performance evaluation and execution time prediction using narrow spectrum benchmarking %I Department of Computer Science, University of California at Berkeley %D 1992 %z Article %K Saavedra95 %A R.H. Saavedra %A A.J. 
Smith %T Measuring cache and TLB performance and their effect on benchmark runtimes %J IEEE Transactions on Computers %V 44 %N 10 %D October 1995 %P 1223-1235 %z Article %k Seltzer99 %A Margo Seltzer %A David Krinsky %A Keith Smith %A Xiolan Zhang %T The case for application-specific benchmarking %B Proceedings of the 1999 Workshop on Hot Topics in Operating Systems %C Rico, AZ %D 1999 %z Article %K Shein89 %A Barry Shein %A Mike Callahan %A Paul Woodbury %T NFSSTONE: A network file server performance benchmark %B Proceedings USENIX Summer Conference %C Baltimore, MD %D June 1989 %P 269-275 %z Article %K Staelin98 %A Carl Staelin %A Larry McVoy %T mhz: Anatomy of a microbenchmark %B Proceedings USENIX Annual Technical Conference %C New Orleans, LA %D June 1998 %P 155-166 %z Article %K FSF89 %A Richard Stallman %Q Free Software Foundation %T General Public License %D 1989 %O Included with \*[lmbench] %z Book %K Toshiba94 %A Toshiba %T DRAM Components and Modules %I Toshiba America Electronic Components, Inc. %P A59-A77,C37-C42 %D 1994 %z Article %K Tullsen96 %A Dean Tullsen %A Susan Eggers %A Joel Emer %A Henry Levy %A Jack Lo %A Rebecca Stamm %T Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor %C Proceedings of the 23rd Annual International Symposium on Computer Architecture %D May 1996 %P 191-202 %O http://www.cs.washington.edu/research/smt/papers/ISCA96.ps %z Article %K Tullsen99 %A Dean Tullsen %A Jack Lo %A Susan Eggers %A Henry Levy %T Supporting fine-grain synchronization on a simultaneous multithreaded processor %B Proceedings of the 5th International Symposium on High Performance Computer Architecture %D January 1999 %P 54-58 %O http://www.cs.washington.edu/research/smt/papers/hpca.ps %z Article %K Weicker84 %A R.P. Weicker %T Dhrystone: A synthetic systems programming benchmark %J CACM %V 27 %N 10 %P 1013-1030 %D 1984 %z Article %K Wolman89 %A Barry L. Wolman %A Thomas M. Olson %T IOBENCH: a system independent IO benchmark %J Computer Architecture News %V 17 %N 5 %D September 1989 %P 55-70 %x IOBENCH is an operating system and processor independent synthetic %x input/output (IO) benchmark designed to put a configurable IO and %x processor (CP) load on the system under test. This paper discusses %x the UNIX versions. %k IOBENCH, synthetic I/O benchmark, UNIX workload %s vinton%cello@hplabs.hp.com (Fri Sep 20 12:55:58 PDT 1991) ����������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/references.private���������������������������������������������������������������0000664�0000764�0000764�00000000326�07045412511�017447� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������%z Article %K McCalpin95 %A John D. McCalpin %T Memory bandwidth and machine balance in current high performance computers %J IEEE Technical Committee on Computer Architecture newsletter %V to appear %D Dec. 
1995
lmbench-3.0-a9/doc/reporting.3
.\"
.\" @(#)lmbench.man 2.0 98/04/24
.\"
.\" lmbench - benchmarking toolbox
.\"
.\" Copyright (C) 1998 Carl Staelin and Larry McVoy
.\" E-mail: staelin@hpl.hp.com
.\"
.TH "lmbench reporting" 3 "$Date:" "(c)1998-2000 Larry McVoy and Carl Staelin" "LMBENCH"
.SH "NAME"
milli, micro, nano, mb, kb \- the lmbench reporting subsystem
.SH "SYNOPSIS"
.B "#include ``lmbench.h''"
.LP
.B "void milli(char *s, uint64 n)"
.LP
.B "void micro(char *s, uint64 n)"
.LP
.B "void nano(char *s, uint64 n)"
.LP
.B "void mb(uint64 bytes)"
.LP
.B "void kb(uint64 bytes)"
.SH "DESCRIPTION"
Creating benchmarks using the
.I lmbench
timing harness is easy.
Since it is so easy to measure performance using
.IR lmbench ,
it is possible to quickly answer questions that arise during system
design, development, or tuning.
Image processing applications, for example, raise many such questions.
.LP
There are two attributes that are critical for performance, latency
and bandwidth, and
.IR lmbench 's
timing harness makes it easy to measure and report results for both.
The measurement interface,
.BR benchmp ,
is the same, but the reporting functions are different.
Latency is usually important for frequently executed operations, and
bandwidth is usually important when moving large chunks of data.
.TP
.B "void milli(char *s, uint64 n)"
print out the time per operation in milli-seconds.
.I n
is the number of operations during the timing interval, which is
passed as a parameter because each
.I loop_body
can contain several operations.
.TP
.B "void micro(char *s, uint64 n)"
print the time per operation in micro-seconds.
.TP
.B "void nano(char *s, uint64 n)"
print the time per operation in nano-seconds.
.TP
.B "void mb(uint64 bytes)"
print the bandwidth in megabytes per second.
.TP
.B "void kb(uint64 bytes)"
print the bandwidth in kilobytes per second.
.SH "FUTURES"
Development of
.I lmbench
is continuing.
.SH "SEE ALSO"
lmbench(8), lmbench(3), timing(3), results(3)
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
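.SH "EXAMPLE"
A minimal sketch of hand-rolled timing and reporting, assuming the
declarations above are available from
.IR lmbench.h ;
the buffer size and repetition count are arbitrary choices made for
this example:
.sp
.nf
.ft CW
#include <stdlib.h>
#include <string.h>
#include "lmbench.h"

#define REPS    100
#define SIZE    (1024 * 1024)

int
main(int ac, char **av)
{
        char    *src = malloc(SIZE);
        char    *dst = malloc(SIZE);
        int     i;

        /* touch the buffers so page faults are not timed */
        memset(src, 1, SIZE);
        memset(dst, 1, SIZE);

        start(0);
        for (i = 0; i < REPS; i++)
                memcpy(dst, src, SIZE);
        settime(stop(0, 0));    /* elapsed usecs become the timing interval */

        mb((uint64)REPS * SIZE);        /* report bandwidth in MB per second */
        micro("1MB memcpy", REPS);      /* or report latency per operation */
        return (0);
}
.ft
.fi
.LP
Benchmarks built on
.B benchmp
(see timing(3)) do not call
.B start
or
.B stop
themselves; they simply call one of the reporting routines after
.B benchmp
returns.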
lmbench-3.0-a9/doc/results.3
.\"
.\" @(#)results.man 2.0 98/04/24
.\"
.\" results - lmbench results subsystem
.\"
.\" Copyright (C) 1998 Carl Staelin and Larry McVoy
.\" E-mail: staelin@hpl.hp.com
.\"
.TH "lmbench result management" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH"
.SH "NAME"
insertinit, insertsort, get_results, set_results, save_median, save_minimum \- the lmbench results subsystem
.SH "SYNOPSIS"
.B "#include ``lmbench.h''"
.LP
.B "#define TRIES 11"
.LP
.B "typedef struct { uint64 u, n; } value_t;"
.LP
.B "typedef struct { int N; value_t v[TRIES]; } result_t;"
.LP
.B "int sizeof_result(int N)"
.LP
.B "void insertinit(result_t *r)"
.LP
.B "void insertsort(uint64 u, uint64 n, result_t *r)"
.LP
.B "result_t* get_results()"
.LP
.B "void set_results(result_t *r)"
.LP
.B "void save_median()"
.LP
.B "void save_minimum()"
.SH "DESCRIPTION"
These routines provide some simple data management functionality.
In most cases, you will not need these routines.
.LP
The current timing results can be accessed using the routines in
timing(3).
The current timing results may be modified using
.B save_median
and
.BR save_minimum .
.TP
.B "int sizeof_result(int N)"
returns the number of bytes to allocate for a result_t which contains
.I N
results.
.TP
.B "void insertinit(result_t *r)"
initializes the results array.
.TP
.B "void insertsort(uint64 u, uint64 n, result_t *r)"
insert
.I u
and
.I n
into
.IR r .
Results are sorted in decreasing order by
.IR u/n .
.TP
.B "result_t* get_results()"
get a pointer to the current results.
.TP
.B "void set_results(result_t *r)"
save a copy of
.I r
as the current results.
.TP
.B "void save_median()"
sets the timing results to the median of the current results.
.TP
.B "void save_minimum()"
sets the timing results to the minimum of the current results.
.LP
Results are sorted in decreasing order, so the minimum value is at
.B TRIES-1
and the maximum value is at
.BR 0 .
.SH "FUTURES"
Development of \fIlmbench\fR is continuing.
.SH "SEE ALSO"
lmbench(8), lmbench(3), reporting(3), timing(3)
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
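.SH "EXAMPLE"
A minimal sketch of explicit result management, assuming the
declarations above are available from
.IR lmbench.h ;
the timed operation and iteration count are placeholders chosen for
this example, and
.BR start ,
.BR stop ,
and
.B micro
are described in timing(3) and reporting(3):
.sp
.nf
.ft CW
#include <unistd.h>
#include "lmbench.h"

#define ITERS   10000

int
main(int ac, char **av)
{
        result_t        r;
        int             i, j;

        insertinit(&r);
        for (i = 0; i < TRIES; i++) {
                start(0);
                for (j = 0; j < ITERS; j++)
                        getppid();
                insertsort(stop(0, 0), ITERS, &r);
        }
        set_results(&r);        /* make r the current result set */
        save_median();          /* timing interval = median of the runs */
        micro("getppid", get_n());
        return (0);
}
.ft
.fi
.LP
.B benchmp
(see timing(3)) performs this bookkeeping automatically; explicit use
of these routines is only needed for unusual benchmarks.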
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/stream.8�������������������������������������������������������������������������0000664�0000764�0000764�00000001123�07172615471�015325� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" $Id$ .TH stream 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH" .SH NAME stream \- John McCalpin's STREAM benchmark .SH SYNOPSIS .B stream [ .I "-M <len>" ] [ .I "-W <warmups>" ] [ .I "-N <repetitions>" ] .SH DESCRIPTION .B stream mimics John McCalpin's STREAM benchmark. It measures memory bandwidth. .SH BUGS .B stream is an experimental benchmark, but it seems to work well on most systems. .SH "SEE ALSO" lmbench(8), bw_mem(8), line(8), tlb(8), cache(8), par_mem(8). .SH "AUTHOR" Carl Staelin and Larry McVoy .PP Comments, suggestions, and bug reports are always welcome. ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/timing.3�������������������������������������������������������������������������0000664�0000764�0000764�00000011334�10425064445�015313� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" .\" @(#)timing.man 2.0 98/04/24 .\" .\" timing - lmbench timing subsystem .\" .\" Copyright (C) 1998 Carl Staelin and Larry McVoy .\" E-mail: staelin@hpl.hp.com .\" .TH "lmbench timing" 3 "$Date:$" "(c)1998 Larry McVoy" "LMBENCH" .SH "NAME" benchmp, benchmp_getstate, benchmp_interval, start, stop, get_n, set_n, gettime, settime, get_enough, t_overhead, l_overhead \- the lmbench timing subsystem .SH "SYNOPSIS" .B "#include \"lmbench.h\"" .LP .B "typedef u_long iter_t;" .LP .B "typedef (*bench_f)(iter_t iterations, void* cookie);" .LP .B "typedef (*support_f)(iter_t iterations, void* cookie);" .LP .B "void benchmp(support_f initialize, bench_f benchmark, support_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie);" .LP .B "void* benchmp_getstate();" .LP .B "iter_t benchmp_interval(void* state);" .LP .B "void start(struct timeval *begin);" .LP .B "uint64 stop(struct timeval *begin, struct timeval *end);" .LP .B "uint64 get_n();" .LP .B "void set_n(uint64 n);" .LP .B "uint64 gettime();" .LP .B "void settime(uint64 u);" .LP .B "uint64 get_enough(uint64 
enough);" .LP .B "uint64 t_overhead();" .LP .B "double l_overhead();" .SH "DESCRIPTION" The single most important element of a good benchmarking system is the quality and reliability of its measurement system. .IR lmbench 's timing subsystem manages the experimental timing process to produce accurate results in the least possible time. .I lmbench includes methods for measuring and eliminating several factors that influence the accuracy of timing measurements, such as the resolution of the system clock. .LP .I lmbench gets accurate results by considering clock resolution, auto-sizing the duration of each benchmark, and conducting multiple experiments. .TP .B "void benchmp(initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie)" measures the performance of .I benchmark repeatedly and reports the median result. .I benchmp creates .I parallel sub-processes which run .I benchmark in parallel. This allows lmbench to measure the system's ability to scale as the number of client processes increases. Each sub-process executes .I initialize before starting the benchmarking cycle. It will call .I benchmark several times in order to collect .I repetitions results. After all the benchmark results have been collected, .I cleanup is called to cleanup any resources which may have been allocated by .I initialize or .I benchmark . .I cookie is a void pointer to a hunk of memory that can be used to store any parameters or state that is needed by the benchmark. .TP .B "void benchmp_getstate()" returns a void pointer to the lmbench-internal state used during benchmarking. The state is not to be used or accessed directly by clients, but rather would be passed into .I benchmp_interval. .TP .B "iter_t benchmp_interval(void* state)" returns the number of times the benchmark should execute its benchmark loop during this timing interval. This is used only for weird benchmarks which cannot implement the benchmark body in a function which can return, such as the page fault handler. Please see .I lat_sig.c for sample usage. .TP .B "void start(struct timeval *begin)" starts a timing interval. If .I begin is non-null, save the start time in .I begin . .TP .B "uint64 stop(struct timeval *begin, struct timeval *end)" stops a timing interval, returning the number of elapsed micro-seconds. .TP .B "uint64 get_n()" returns the number of times .I loop_body was executed during the timing interval. .TP .B "void set_n(uint64 n)" sets the number of times .I loop_body was executed during the timing interval. .TP .B "uint64 gettime()" returns the number of micro-seconds in the timing interval. .TP .B "void settime(uint64 u)" sets the number of micro-seconds in the timing interval. .TP .B "uint64 get_enough(uint64 enough)" return the time in micro-seconds needed to accurately measure a timing interval. .TP .B "uint64 t_overhead()" return the time in micro-seconds needed to measure time. .TP .B "double l_overhead()" return the time in micro-seconds needed to do a simple loop. .SH "VARIABLES" There are three environment variables that can be used to modify the .I lmbench timing subsystem: ENOUGH, TIMING_O, and LOOP_O. The environment variables can be used to directly set the results of .B get_enough , .B t_overhead , and .B l_overhead . When running a large number of benchmarks, or repeating the same benchmark many times, this can save time by eliminating the necessity of recalculating these values for each run. .SH "FUTURES" Development of .I lmbench is continuing. 
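.SH "EXAMPLE"
The following is a minimal sketch of the usual shape of an
.I lmbench
micro-benchmark built on
.BR benchmp .
It assumes the declarations above are available from
.IR lmbench.h ,
that
.B bench_f
routines return void (as in the
.I lmbench
sources), and that the timed operation (here
.BR getppid )
is only a placeholder.
The reporting routine
.B micro
is described in reporting(3).
.sp
.nf
.ft CW
#include <unistd.h>
#include "lmbench.h"

void
benchmark_getppid(iter_t iterations, void *cookie)
{
        while (iterations-- > 0)
                getppid();
}

int
main(int ac, char **av)
{
        /* no init/cleanup, one child, 0 = default timing interval, no warmup */
        benchmp(NULL, benchmark_getppid, NULL, 0, 1, 0, TRIES, NULL);
        micro("Simple syscall", get_n());
        return (0);
}
.ft
.fi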
.SH "SEE ALSO"
lmbench(8), lmbench(3), reporting(3), results(3).
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
lmbench-3.0-a9/doc/tlb.8
.\" $Id$
.TH TLB 8 "$Date$" "(c)2000 Carl Staelin and Larry McVoy" "LMBENCH"
.SH NAME
tlb \- TLB size and latency benchmark
.SH SYNOPSIS
.B tlb
[
.I "-L <line size>"
]
[
.I "-M <len>"
]
[
.I "-W <warmups>"
]
[
.I "-N <repetitions>"
]
.SH DESCRIPTION
.B tlb
tries to determine the size, in pages, of the TLB.
The largest amount of memory it will examine is
.I len
bytes.
.LP
.B tlb
compares the memory latency for two different pointer chains.
The two chains occupy the same amount of cache space, but they stress
the memory subsystem differently.
The first chain accesses one word per page, while the second chain
randomly jumps through all the lines on a page before jumping to the
next page.
When all of the pointers reside in the cache (which is the usual
case), and all of the pages for the first chain reside in the TLB,
then the average memory latencies should be identical.
Assuming there is a fixed size TLB, at some point the number of pages
accessed by the first chain will be larger than the TLB can map.
At this point the average latency for each memory access in the first
chain will be a cache hit plus some fraction of a TLB miss.
.LP
Once the TLB boundary is located,
.B tlb
reports the TLB miss latency, measured with twice as many pages as
the TLB can hold.
.SH BUGS
.B tlb
is an experimental benchmark, but it seems to work well on most
systems.
However, if a processor has a TLB hierarchy,
.B tlb
only finds the top-level TLB.
.SH "SEE ALSO"
lmbench(8), line(8), cache(8), par_mem(8).
.SH "AUTHOR"
Carl Staelin and Larry McVoy
.PP
Comments, suggestions, and bug reports are always welcome.
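.SH "EXAMPLE"
The sketch below illustrates the first pointer chain described above,
one word per page; it is not the
.B tlb
implementation, and the page size, page count, and iteration count are
arbitrary choices made for this example:
.sp
.nf
.ft CW
#include <stdlib.h>

#define PAGESIZE        4096    /* assumed page size for this sketch */

int
main(int ac, char **av)
{
        int     npages = (ac > 1) ? atoi(av[1]) : 128;
        char    *mem = malloc((size_t)npages * PAGESIZE);
        char    **p;
        int     i;

        /* build the chain: page i points to page i+1, the last page wraps */
        for (i = 0; i < npages; i++) {
                p = (char **)(mem + (size_t)i * PAGESIZE);
                *p = (i == npages - 1) ? mem : mem + (size_t)(i + 1) * PAGESIZE;
        }

        /* chase the chain; timing this loop as npages grows exposes the
           point where the TLB no longer covers every page in the chain */
        p = (char **)mem;
        for (i = 0; i < 1000000; i++)
                p = (char **)*p;

        return (p == (char **)mem);     /* use p so the loop is not removed */
}
.ft
.fi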
�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/tmac.usenix����������������������������������������������������������������������0000664�0000764�0000764�00000102557�07045412511�016124� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.ig Copyright (C) 1990, 1991 Free Software Foundation, Inc. Written by James Clark (jjc@jclark.uucp) This file is part of groff. groff is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version. groff is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with groff; see the file LICENSE. If not, write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. .. .if !\n(.g .ab These ms macros require groff. .if \n(.C \ . ab The groff ms macros do not work in compatibility mode. .\" Enable warnings. You can delete this if you want. .warn .\" See if already loaded. .if r GS .nx /dev/null .nr GS 1 .de @error .tm \\n(.F:\\n(.c: macro error: \\$* .. .de @warning .tm \\n(.F:\\n(.c: macro warning: \\$* .. .de @fatal .ab \\n(.F:\\n(.c: fatal macro error: \\$* .. .de @not-implemented .@error sorry, \\$0 not implemented .als \\$0 @nop .. .als TM @not-implemented .als CT @not-implemented .de @nop .. .de @init .\" a non-empty environment .ev ne \c .ev .ev nf 'nf .ev .. .ds REFERENCES References .ds ABSTRACT ABSTRACT .ds TOC Table of Contents .ds MONTH1 January .ds MONTH2 February .ds MONTH3 March .ds MONTH4 April .ds MONTH5 May .ds MONTH6 June .ds MONTH7 July .ds MONTH8 August .ds MONTH9 September .ds MONTH10 October .ds MONTH11 November .ds MONTH12 December .ds MO \\*[MONTH\n[mo]] .nr *year \n[yr]+1900 .ds DY \n[dy] \*[MO] \n[*year] .de ND .if \\n[.$] .ds DY "\\$* .. .de DA .if \\n[.$] .ds DY "\\$* .ds CF \\*[DY] .. .\" indexing .de IX .tm \\$1\t\\$2\t\\$3\t\\$4 ... \\n[PN] .. .\" print an error message and then try to recover .de @error-recover .@error \\$@ (recovering) .nr *pop-count 0 .while !'\\n(.z'' \{\ . \"@warning automatically terminating diversion \\n(.z . ie d @div-end!\\n(.z .@div-end!\\n(.z . el .*div-end-default . nr *pop-count +1 . \" ensure that we don't loop forever . if \\n[*pop-count]>20 .@fatal recovery failed .\} .while !'\\n[.ev]'0' .ev .par@reset-env .par@reset .. .de *div-end-default .ds *last-div \\n(.z .br .di .ev nf .\\*[*last-div] .ev .. .\" **************************** .\" ******** module cov ******** .\" **************************** .\" Cover sheet and first page. .de cov*err-not-after-first-page .@error \\$0 is not allowed after the first page has started .. 
.de cov*err-not-before-tl .@error \\$0 is not allowed before TL .. .de cov*err-not-again .@error \\$0 is not allowed more than once .. .de cov*err-not-after-ab .@error \\$0 is not allowed after first AB, LP, PP, IP, SH or NH .. .als AU cov*err-not-before-tl .als AI cov*err-not-before-tl .als AB cov*err-not-before-tl .de cov*first-page-init .rm cov*first-page-init .par@init .als RP cov*err-not-after-first-page .@init .ie \\n[cov*rp-format] \{\ . pg@cs-top . als FS cov*FS . als FE cov*FE .\} .el \{\ . pg@top . als FS @FS . als FE @FE .\} .wh 0 pg@top .. .wh 0 cov*first-page-init .\" This handles the case where FS occurs before TL or LP. .de FS .br \\*[FS]\\ .. .nr cov*rp-format 0 .nr cov*rp-no 0 .\" released paper format .de RP .nr cov*rp-format 1 .if \\n[.$] .if '\\$1'no' .nr cov*rp-no 1 .pn 0 .. .de TL .br .als TL cov*err-not-again .rn @AB AB .rn @AU AU .rn @AI AI .di cov*tl-div .par@reset .ft 3 .ie \\n[VARPS] \{\ . ps 14 . vs 16 .\} .el \{\ . ps +2 . vs +3p .\} .ll (u;\\n[LL]*5/6) .nr cov*n-au 0 .. .de @AU .par@reset .if !'\\n(.z'' \{\ . br . di .\} .nr cov*n-au +1 .di cov*au-div!\\n[cov*n-au] .nf .ft 2 .ps \\n[PS] .. .de @AI .par@reset .if !'\\n(.z'' \{\ . br . di .\} .ie !\\n[cov*n-au] .@error AI before AU .el \{\ . di cov*ai-div!\\n[cov*n-au] . nf . ft 1 . ps \\n[PS] .\} .. .de LP .if !'\\n[.z]'' \{\ . br . di .\} .br .cov*ab-init .cov*print \\*[\\$0]\\ .. .als IP LP .als PP LP .als XP LP .als NH LP .als SH LP .als MC LP .als RT LP .de cov*ab-init .als cov*ab-init @nop .als LP @LP .als IP @IP .als PP @PP .als XP @XP .als RT @RT .als SH @SH .als NH @NH .als QP @QP .als RS @RS .als RE @RE .als QS @QS .als QE @QE .als MC @MC .als EQ @EQ .als EN @EN .als AB cov*err-not-after-ab .als AU par@AU .als AI par@AI .als TL par@TL .. .de @AB .if !'\\n(.z'' \{\ . br . di .\} .cov*ab-init .di cov*ab-div .par@ab-indent .par@reset .if !'\\$1'no' \{\ . ft 2 . ce 1 \\*[ABSTRACT] . sp . ft 1 .\} .ns .@PP .. .de AE .ie '\\n(.z'cov*ab-div' \{\ . als AE cov*err-not-again . br . di .\" nr cov*ab-height \\n[dn] . par@reset-env . par@reset . cov*print .\} .el .@error AE without AB .. .de @div-end!cov*ab-div .AE .. .de cov*print .als cov*print @nop .ie d cov*tl-div \{\ . ie \\n[cov*rp-format] .cov*rp-print . el .cov*draft-print .\} .el \{\ . if \\n[cov*rp-format] \{\ . @warning RP format but no TL . bp 1 . als FS @FS . als FE @FE . \} . br .\} .. .de cov*rp-print .nr cov*page-length \\n[.p] .pl 1000i .cov*tl-au-print .sp 3 .if d cov*ab-div \{\ . nf . cov*ab-div .\} .sp 3 .par@reset \\*[DY] .br .if \\n[cov*fn-height] \{\ . sp |(u;\\n[cov*page-length]-\\n[FM]\ -\\n[cov*fn-height]-\\n[fn@sep-dist]>?\\n[nl]) . fn@print-sep . ev nf . cov*fn-div . ev . ie \\n[cov*rp-no] .rm cov*fn-div . el \{\ . rn cov*fn-div fn@overflow-div . nr fn@have-overflow 1 . \} .\} .als FS @FS .als FE @FE .\" If anything was printed below where the footer line is normally printed, .\" then that's an overflow. .if -\\n[FM]/2+1v+\\n[cov*page-length]<\\n[nl] .@error cover sheet overflow .pl \\n[cov*page-length]u .bp 1 .if !\\n[cov*rp-no] .cov*tl-au-print .rs .sp 1 .. .de cov*draft-print .cov*tl-au-print .if d cov*ab-div \{\ . nf . sp 2 . cov*ab-div .\} .sp 1 .. .de cov*tl-au-print .par@reset .nf .rs .sp 3 .ce 9999 .cov*tl-div .nr cov*i 1 .nr cov*sp 1v .while \\n[cov*i]<=\\n[cov*n-au] \{\ . sp \\n[cov*sp]u . cov*au-div!\\n[cov*i] . ie d cov*ai-div!\\n[cov*i] \{\ . sp .5v . cov*ai-div!\\n[cov*i] . nr cov*sp 1v . \} . el .nr cov*sp .5v . nr cov*i +1 .\} .ce 0 .. 
.nr cov*fn-height 0 .nr cov*in-fn 0 .\" start of footnote on cover .de cov*FS .if \\n[cov*in-fn] \{\ . @error nested FS . FE .\} .nr cov*in-fn 1 .ev fn .par@reset-env .da cov*fn-div .if !\\n[cov*fn-height] .ns .ie \\n[.$] .FP "\\$1" no .el .@LP .. .de @div-end!cov*fn-div .cov*FE .. .\" end of footnote on cover .de cov*FE .ie '\\n(.z'cov*fn-div' \{\ . br . ev . di . nr cov*in-fn 0 . nr cov*fn-height +\\n[dn] .\} .el .@error FE without matching FS .. .\" *************************** .\" ******** module pg ******** .\" *************************** .\" Page-level formatting. .\" > 0 if we have a footnote on the current page .nr pg@fn-flag 0 .nr pg@colw 0 .nr pg@fn-colw 0 .nr HM 1i .nr FM 1i .nr PO 1.25i .ds LF .ds CF .ds RF .ds LH .ds CH -\\n[PN]- .ds RH .ds pg*OH '\\*[LH]'\\*[CH]'\\*[RH]' .ds pg*EH '\\*[LH]'\\*[CH]'\\*[RH]' .ds pg*OF '\\*[LF]'\\*[CF]'\\*[RF]' .ds pg*EF '\\*[LF]'\\*[CF]'\\*[RF]' .de OH .ds pg*\\$0 "\\$* .. .als EH OH .als OF OH .als EF OH .de PT .ie \\n%=1 .if \\n[pg*P1] .tl \\*[pg*OH] .el \{\ . ie o .tl \\*[pg*OH] . el .tl \\*[pg*EH] .\} .. .de BT .ie o .tl \\*[pg*OF] .el .tl \\*[pg*EF] .. .nr pg*P1 0 .de P1 .nr pg*P1 1 .. .wh -\n[FM]u pg@bottom .wh -\n[FM]u/2u pg*footer .nr MINGW 2n .nr pg@ncols 1 .de @MC .if !'\\n(.z'' .error-recover MC while diversion open .br .ie \\n[pg@ncols]>1 .pg@super-eject .el \{\ . \" flush out any floating keeps . while \\n[kp@tail]>\\n[kp@head] \{\ . rs . bp . \} .\} .ie !\\n(.$ \{\ . nr pg@colw \\n[LL]*7/15 . nr pg*gutw \\n[LL]-(2*\\n[pg@colw]) . nr pg@ncols 2 .\} .el \{\ . nr pg@colw (n;\\$1)<?\\n[LL] . ie \\n[.$]<2 .nr pg*gutw \\n[MINGW] . el .nr pg*gutw (n;\\$2) . nr pg@ncols \\n[LL]-\\n[pg@colw]/(\\n[pg@colw]+\\n[pg*gutw])+1 . ie \\n[pg@ncols]>1 \ . nr pg*gutw \\n[LL]-(\\n[pg@ncols]*\\n[pg@colw])/(\\n[pg@ncols]-1) . el .nr pg*gutw 0 .\} .mk pg*col-top .ns .nr pg*col-num 0 .nr pg@fn-colw \\n[pg@colw]*5/6 .par@reset .. .de 2C .MC .. .de 1C .MC \\n[LL]u .. .\" top of page macro .de pg@top .ch pg*footer -\\n[FM]u/2u .nr PN \\n% .nr pg*col-num 0 .nr pg@fn-bottom-margin 0 .nr pg*saved-po \\n[PO] .po \\n[PO]u .ev h .par@reset .sp (u;\\n[HM]/2) .PT .sp |\\n[HM]u .if d HD .HD .mk pg@header-bottom .ev .mk pg*col-top .pg*start-col .. .de pg*start-col .\" Handle footnote overflow before floating keeps, because the keep .\" might contain an embedded footnote. .fn@top-hook .kp@top-hook .tbl@top-hook .ns .. .de pg@cs-top .sp \\n[HM]u .\" move pg@bottom and pg*footer out of the way .ch pg@bottom \\n[.p]u*2u .ch pg*footer \\n[.p]u*2u .ns .. .de pg@bottom .tbl@bottom-hook .if \\n[pg@fn-flag] .fn@bottom-hook .nr pg*col-num +1 .ie \\n[pg*col-num]<\\n[pg@ncols] .pg*end-col .el .pg*end-page .. .de pg*end-col 'sp |\\n[pg*col-top]u .po (u;\\n[pg*saved-po]+(\\n[pg@colw]+\\n[pg*gutw]*\\n[pg*col-num])) .\"po +(u;\\n[pg@colw]+\\n[pg*gutw]) .pg*start-col .. .de pg*end-page .po \\n[pg*saved-po]u .\" Make sure we don't exit if there are still floats or footnotes left-over. .ie \\n[kp@head]<\\n[kp@tail]:\\n[fn@have-overflow] \{\ . \" Switching environments ensures that we don't get an unnecessary . \" blank line at the top of the page. . ev ne ' bp . ev .\} .el \{\ . if r pg*next-number \{\ . pn \\n[pg*next-number] . rr pg*next-number . if d pg*next-format \{\ . af PN \\*[pg*next-format] . rm pg*next-format . \} . \} ' bp .\} .. .\" pg@begin number format .de pg@begin .ie \\n[.$]>0 \{\ . nr pg*next-number (;\\$1) . ie \\n[.$]>1 .ds pg*next-format \\$2 . el .rm pg*next-format .\} .el .rr pg*next-number .pg@super-eject .. 
.\" print the footer line .de pg*footer .ev h .par@reset .BT .ev .. .\" flush out any keeps or footnotes .de pg@super-eject .br .if !'\\n(.z'' .@error-recover diversion open while ejecting page .\" Make sure we stay in the end macro while there is still footnote overflow .\" left, or floating keeps. .while \\n[kp@tail]>\\n[kp@head]:\\n[pg@fn-flag] \{\ . rs . bp .\} .bp .. .em pg@super-eject .\" *************************** .\" ******** module fn ******** .\" *************************** .\" Footnotes. .nr fn@sep-dist 8p .ev fn .\" Round it vertically .vs \n[fn@sep-dist]u .nr fn@sep-dist \n[.v] .ev .nr fn*text-num 0 1 .nr fn*note-num 0 1 .ds * \\*[par@sup-start]\En+[fn*text-num]\\*[par@sup-end] .nr fn*open 0 .\" normal FS .de @FS .ie \\n[.$] .fn*do-FS "\\$1" no .el \{\ . ie \\n[fn*text-num]>\\n[fn*note-num] .fn*do-FS \\n+[fn*note-num] . el .fn*do-FS .\} .. .\" Second argument of `no' means don't embellish the first argument. .de fn*do-FS .if \\n[fn*open] .@error-recover nested FS .nr fn*open 1 .if \\n[.u] \{\ . \" Ensure that the first line of the footnote is on the same page . \" as the reference. I think this is minimal. . ev fn . nr fn*need 1v . ev . ie \\n[pg@fn-flag] .nr fn*need +\\n[fn:PD] . el .nr fn*need +\\n[fn@sep-dist] . ne \\n[fn*need]u+\\n[.V]u>?0 .\} .ev fn .par@reset-env .fn*start-div .par@reset .ie \\n[.$] .FP \\$@ .el .@LP .. .de @FE .ie !\\n[fn*open] .@error FE without FS .el \{\ . nr fn*open 0 . br . ev . fn*end-div .\} .. .nr fn@have-overflow 0 .\" called at the top of each column .de fn@top-hook .nr fn*max-width 0 .nr fn*page-bottom-pos 0-\\n[FM]-\\n[pg@fn-bottom-margin] .ch pg@bottom \\n[fn*page-bottom-pos]u .if \\n[fn@have-overflow] \{\ . nr fn@have-overflow 0 . fn*start-div . ev nf . fn@overflow-div . ev . fn*end-div .\} .. .\" This is called at the bottom of the column if pg@fn-flag is set. .de fn@bottom-hook .nr pg@fn-flag 0 .nr fn@have-overflow 0 .nr fn@bottom-pos \\n[.p]-\\n[FM]-\\n[pg@fn-bottom-margin]+\\n[.v] .ev fn .nr fn@bottom-pos -\\n[.v] .ev .ie \\n[nl]+\\n[fn@sep-dist]+\n[.V]>\\n[fn@bottom-pos] \{\ . rn fn@div fn@overflow-div . nr fn@have-overflow 1 .\} .el \{\ . if \\n[pg@ncols]>1 \ . if \\n[fn*max-width]>\\n[pg@fn-colw] \ . nr pg@fn-bottom-margin \\n[.p]-\\n[FM]-\\n[nl]+1v . wh \\n[fn@bottom-pos]u fn*catch-overflow . fn@print-sep . ev nf . fn@div . rm fn@div . ev . if '\\n(.z'fn@overflow-div' \{\ . di . nr fn@have-overflow \\n[dn]>0 . \} . ch fn*catch-overflow .\} .. .de fn*catch-overflow .di fn@overflow-div .. .nr fn*embed-count 0 .de @div-end!fn@div .br .if '\\n[.ev]'fn' .ev .fn*end-div .nr fn*open 0 .. .als @div-end!fn*embed-div @div-end!fn@div .de fn*start-div .ie '\\n(.z'' \{\ . da fn@div . if !\\n[pg@fn-flag] .ns .\} .el .di fn*embed-div .. .de fn*end-div .ie '\\n(.z'fn@div' \{\ . di . nr fn*page-bottom-pos -\\n[dn] . nr fn*max-width \\n[fn*max-width]>?\\n[dl] . if !\\n[pg@fn-flag] .nr fn*page-bottom-pos -\\n[fn@sep-dist] . nr pg@fn-flag 1 . nr fn*page-bottom-pos \\n[nl]-\\n[.p]+\n[.V]>?\\n[fn*page-bottom-pos] . ch pg@bottom \\n[fn*page-bottom-pos]u .\} .el \{\ . ie '\\n(.z'fn*embed-div' \{\ . di . rn fn*embed-div fn*embed-div!\\n[fn*embed-count] \!. fn*embed-start \\n[fn*embed-count] . rs ' sp (u;\\n[dn]+\\n[fn@sep-dist]+\\n[.V]) \!. fn*embed-end . nr fn*embed-count +1 . \} . el \{\ . ev fn . @error-recover unclosed diversion within footnote . \} .\} .. .de fn*embed-start .ie '\\n(.z'' \{\ . fn*start-div . ev nf . fn*embed-div!\\$1 . rm fn*embed-div!\\$1 . ev . fn*end-div . di fn*null .\} .el \{\ \!. fn*embed-start \\$1 . rs .\} .. 
.de fn*embed-end .ie '\\n(.z'fn*null' \{\ . di . rm fn*null .\} .el \!.fn*embed-end .. .\" It's important that fn@print-sep use up exactly fn@sep-dist vertical space. .de fn@print-sep .ev fn .in 0 .vs \\n[fn@sep-dist]u \D'l 1i 0' .br .ev .. .\" *************************** .\" ******** module kp ******** .\" *************************** .\" Keeps. .de KS .br .di kp*div .. .de KF .if !'\\n(.z'' .@error-recover KF while open diversion .di kp*fdiv .ev k .par@reset-env .par@reset .. .de KE .ie '\\n(.z'kp*div' .kp*end .el \{\ . ie '\\n(.z'kp*fdiv' .kp*fend . el .@error KE without KS or KF .\} .. .de @div-end!kp*div .kp*end .. .de @div-end!kp*fdiv .kp*fend .. .de kp*need .ie '\\n(.z'' .ds@need \\$1 .el \!.kp*need \\$1 .. .\" end non-floating keep .de kp*end .br .di .kp*need \\n[dn] .ev nf .kp*div .ev .rm kp*div .. .\" Floating keeps. .nr kp@head 0 .nr kp@tail 0 .\" end floating keep .de kp*fend .br .ev .di .ie \\n[.t]-(\\n[.k]>0*1v)>\\n[dn] \{\ . br . ev nf . kp*fdiv . rm kp*fdiv . ev .\} .el \{\ . rn kp*fdiv kp*div!\\n[kp@tail] . nr kp*ht!\\n[kp@tail] 0\\n[dn] . nr kp@tail +1 .\} .. .\" top of page processing for KF .nr kp*doing-top 0 .de kp@top-hook .if !\\n[kp*doing-top] \{\ . nr kp*doing-top 1 . kp*do-top . nr kp*doing-top 0 .\} .. .de kp*do-top .\" If the first keep won't fit, only force it out if we haven't had a footnote .\" and we're at the top of the page. .nr kp*force \\n[pg@fn-flag]=0&(\\n[nl]<=\\n[pg@header-bottom]) .nr kp*fits 1 .while \\n[kp@tail]>\\n[kp@head]&\\n[kp*fits] \{\ . ie \\n[.t]>\\n[kp*ht!\\n[kp@head]]:\\n[kp*force] \{\ . nr kp*force 0 . \" It's important to advance kp@head before bringing . \" back the keep, so that if the last line of the . \" last keep springs the bottom of page trap, a new . \" page will not be started unnecessarily. . rn kp*div!\\n[kp@head] kp*temp . nr kp@head +1 . ev nf . kp*temp . ev . rm kp*temp . \} . el .nr kp*fits 0 .\} .. .\" *************************** .\" ******** module ds ******** .\" *************************** .\" Displays and non-floating keeps. .de DE .ds*end!\\n[\\n[.ev]:ds-type] .nr \\n[.ev]:ds-type 0 .. .de ds@auto-end .if \\n[\\n[.ev]:ds-type] \{\ . @error automatically terminating display . DE .\} .. .de @div-end!ds*div .ie \\n[\\n[.ev]:ds-type] .DE .el .ds*end!2 .. .de ds*end!0 .@error DE without DS, ID, CD, LD or BD .. .de LD .br .nr \\n[.ev]:ds-type 1 .par@reset .nf .sp \\n[DD]u .. .de ID .LD .ie \\n[.$] .in +(n;\\$1) .el .in +\\n[DI]u .. .de CD .LD .ce 9999 .. .de RD .LD .rj 9999 .. .de ds*common-end .par@reset .sp \\n[DD]u .. .als ds*end!1 ds*common-end .de BD .LD .nr \\n[.ev]:ds-type 2 .di ds*div .. .de ds*end!2 .br .ie '\\n(.z'ds*div' \{\ . di . nf . in (u;\\n[.l]-\\n[dl]/2) . ds*div . rm ds*div . ds*common-end .\} .el .@error-recover mismatched DE .. .de DS .br .di ds*div .ie '\\$1'B' \{\ . LD . nr \\n[.ev]:ds-type 4 .\} .el \{\ . ie '\\$1'L' .LD . el \{\ . ie '\\$1'C' .CD . el \{\ . ie '\\$1'R' .RD . el \{\ . ie '\\$1'I' .ID \\$2 . el .ID \\$1 . \} . \} . \} . nr \\n[.ev]:ds-type 3 .\} .. .de ds@need .if '\\n(.z'' \{\ . while \\n[.t]<=(\\$1)&(\\n[nl]>\\n[pg@header-bottom]) \{\ . rs ' sp \\n[.t]u . \} .\} .. .de ds*end!3 .br .ie '\\n(.z'ds*div' \{\ . di . ds@need \\n[dn] . ev nf . ds*div . ev . rm ds*div . ds*common-end .\} .el .@error-recover mismatched DE .. .de ds*end!4 .ie '\\n(.z'ds*div' \{\ . br . di . nf . in (u;\\n[.l]-\\n[dl]/2) . ds@need \\n[dn] . ds*div . rm ds*div . ds*common-end .\} .el .@error-recover mismatched DE .. 
.\" **************************** .\" ******** module par ******** .\" **************************** .\" Paragraph-level formatting. .nr VARPS 0 .nr PS 10 .nr LL 6i .de par*vs .\" If it's too big to be in points, treat it as units. .ie (p;\\$1)>=40p .vs (u;\\$1) .el .vs (p;\\$1) .. .de par@ab-indent .nr 0:li (u;\\n[LL]/12) .nr 0:ri \\n[0:li] .. .de par*env-init .aln \\n[.ev]:PS PS .aln \\n[.ev]:VS VS .aln \\n[.ev]:LL LL .aln \\n[.ev]:MCLL LL .aln \\n[.ev]:LT LT .aln \\n[.ev]:MCLT LT .aln \\n[.ev]:PI PI .aln \\n[.ev]:PD PD .par@reset-env .. .\" happens when the first page begins .de par@init .if !rLT .nr LT \\n[LL] .if !rFL .nr FL \\n[LL]*5/6 .if !rVS .nr VS \\n[PS]+2 .ps \\n[PS] .if !rDI .nr DI .5i .if !rQI .nr QI 5n .if !rPI .nr PI 5n .par*vs \\n[VS] .if !rPD .nr PD .3v .if !rDD .nr DD .5v .if !dFAM .ds FAM \\n[.fam] .nr par*adj \\n[.j] .par*env-init .ev h .par*env-init .ev .ev fn .par*env-init .ev .ev k .par*env-init .ev .aln 0:MCLL pg@colw .aln 0:MCLT pg@colw .aln k:MCLL pg@colw .aln k:MCLT pg@colw .if !rFPS .nr FPS \\n[PS]-2 .if !rFVS .nr FVS (p;\\n[FPS]+2) .if !rFI .nr FI 2n .if !rFPD .nr FPD \\n[PD]/2 .aln fn:PS FPS .aln fn:VS FVS .aln fn:LL FL .aln fn:LT FL .aln fn:PI FI .aln fn:PD FPD .aln fn:MCLL pg@fn-colw .aln fn:MCLT pg@fn-colw .. .de par@reset-env .nr \\n[.ev]:il 0 .nr \\n[.ev]:li 0 .nr \\n[.ev]:ri 0 .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .nr \\n[.ev]:pli 0 .nr \\n[.ev]:pri 0 .nr \\n[.ev]:ds-type 0 .. .\" par@reset .de par@reset .br .ce 0 .rj 0 .ul 0 .fi .ad \\n[par*adj] .ie \\n[pg@ncols]>1 \{\ . ll (u;\\n[\\n[.ev]:MCLL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) . lt \\n[\\n[.ev]:MCLT]u .\} .el \{\ . ll (u;\\n[\\n[.ev]:LL]-\\n[\\n[.ev]:ri]-\\n[\\n[.ev]:pri]) . lt \\n[\\n[.ev]:LT]u .\} .in (u;\\n[\\n[.ev]:li]+\\n[\\n[.ev]:pli]) .ft 1 .fam \\*[FAM] .ps \\n[\\n[.ev]:PS] .par*vs \\n[\\n[.ev]:VS] .ls 1 .TA .hy 14 .. .als @RT par@reset .\" This can be redefined by the user. .de TA .ta T 5n .. .de par*start .ds@auto-end .nr \\n[.ev]:pli \\$1 .nr \\n[.ev]:pri \\$2 .par@reset .sp \\n[\\n[.ev]:PD]u .ne 1v+\\n(.Vu .. .de par@finish .nr \\n[.ev]:pli 0 .nr \\n[.ev]:pri 0 .par@reset .. .\" normal LP .de @LP .par*start 0 0 .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .. .de @PP .par*start 0 0 .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .ti +\\n[\\n[.ev]:ai]u .. .de @QP .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .par*start \\n[QI] \\n[QI] .. .de @XP .par*start \\n[\\n[.ev]:PI] 0 .ti -\\n[\\n[.ev]:PI]u .. .de @IP .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) .par*start \\n[\\n[.ev]:ai] 0 .if !'\\$1'' \{\ . \" Divert the label so as to freeze any spaces. . di par*label . in 0 . nf \&\\$1 . di . in . fi . chop par*label . ti -\\n[\\n[.ev]:ai]u . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c . el \{\ \\*[par*label] . br . \} . rm par*label .\} .. .de @RS .br .nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] .nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] .nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] .nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] .nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] .nr \\n[.ev]:il +1 .nr \\n[.ev]:li +\\n[\\n[.ev]:ai] .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .par@reset .. .de @RE .br .ie \\n[\\n[.ev]:il] \{\ . nr \\n[.ev]:il -1 . nr \\n[.ev]:ai \\n[\\n[.ev]:ai!\\n[\\n[.ev]:il]] . nr \\n[.ev]:li \\n[\\n[.ev]:li!\\n[\\n[.ev]:il]] . nr \\n[.ev]:ri \\n[\\n[.ev]:ri!\\n[\\n[.ev]:il]] . nr \\n[.ev]:pli \\n[\\n[.ev]:pli!\\n[\\n[.ev]:il]] . nr \\n[.ev]:pri \\n[\\n[.ev]:pri!\\n[\\n[.ev]:il]] .\} .el .@error unbalanced \\$0 .par@reset .. 
.\" --------------------------------------------------------------------------- .de LINE . br . ps 32 \l'\\n[.l]u-\\n[\\n[.ev]:ri]u-\\n[\\n[.ev]:pri]u' . ps .. .\" --------------------------------------------------------------------------- .de QSTART . nr SaveQI \\n[QI] . if \\n[.$] .nr QI \\$1 . QS . LINE . ft 3 .. .\" --------------------------------------------------------------------------- .de QEND . ft P . sp -.5 . LINE . QE . nr QI \\n[SaveQI] . if \\n[.$] .sp \\$1 .. .de @QS .br .nr \\n[.ev]:li!\\n[\\n[.ev]:il] \\n[\\n[.ev]:li] .nr \\n[.ev]:ri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ri] .nr \\n[.ev]:ai!\\n[\\n[.ev]:il] \\n[\\n[.ev]:ai] .nr \\n[.ev]:pli!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pli] .nr \\n[.ev]:pri!\\n[\\n[.ev]:il] \\n[\\n[.ev]:pri] .nr \\n[.ev]:il +1 .nr \\n[.ev]:li +\\n[QI] .nr \\n[.ev]:ri +\\n[QI] .nr \\n[.ev]:ai \\n[\\n[.ev]:PI] .par@reset .. .als @QE @RE .\" start boxed text .de B1 .br .di par*box-div .nr \\n[.ev]:li +1n .nr \\n[.ev]:ri +1n .par@reset .. .de @div-end!par*box-div .B2 .. .\" end boxed text .\" Postpone the drawing of the box until we're in the top-level diversion, .\" in case there's a footnote inside the box. .de B2 .ie '\\n(.z'par*box-div' \{\ . br . di . ds@need \\n[dn] . par*box-mark-top . ev nf . par*box-div . ev . nr \\n[.ev]:ri -1n . nr \\n[.ev]:li -1n . par@finish . par*box-draw \\n[.i]u \\n[.l]u .\} .el .@error B2 without B1 .. .de par*box-mark-top .ie '\\n[.z]'' .mk par*box-top .el \!.par*box-mark-top .. .de par*box-draw .ie '\\n[.z]'' \{\ . nr par*box-in \\n[.i] . nr par*box-ll \\n[.l] . nr par*box-vpt \\n[.vpt] . vpt 0 . in \\$1 . ll \\$2 \v'-1v+.25m'\ \D'l (u;\\n[.l]-\\n[.i]) 0'\ \D'l 0 |\\n[par*box-top]u'\ \D'l -(u;\\n[.l]-\\n[.i]) 0'\ \D'l 0 -|\\n[par*box-top]u' . br . sp -1 . in \\n[par*box-in]u . ll \\n[par*box-ll]u . vpt \\n[par*box-vpt] .\} .el \!.par*box-draw \\$1 \\$2 .. .de @SH .par@finish .\" Keep together the heading and the first two lines of the next paragraph. .\" XXX - fix for variable PS. .ne 3v+\\n[\\n[.ev]:PD]u+\\n(.Vu .sp 1 .ft 3 .if \\n[VARPS] .ps \\n[PS]+2 .. .\" TL, AU, and AI are aliased to these in cov*ab-init. .de par@TL .par@finish .sp 1 .ft 3 .ps +2 .vs +3p .ce 9999 .. .de par@AU .par@finish .sp 1 .ft I .ce 9999 .. .de par@AI .par@finish .sp .5 .ce 9999 .. .\" In paragraph macros. .de NL .ps \\n[\\n[.ev]:PS] .. .de SM .ps -2 .. .de LG .ps +2 .. .de R .ft R .. .de par*set-font .ie \\n[.$] \{\ . nr par*prev-font \\n[.f] \&\\$3\f[\\*[par*font-name!\\$0]]\\$1\f[\\n[par*prev-font]]\\$2 .\} .el .ft \\*[par*font-name!\\$0] .. .ds par*font-name!B 3 .ds par*font-name!I 2 .ds par*font-name!BI BI .ds par*font-name!CW CR .als B par*set-font .als I par*set-font .als BI par*set-font .als CW par*set-font .\" underline a word .de UL \Z'\\$1'\v'.25m'\D'l \w'\\$1'u 0'\v'-.25m'\\$2 .. .\" box a word .de BX .nr par*bxw \w'\\$1'+.4m \Z'\v'.25m'\D'l 0 -1m'\D'l \\n[par*bxw]u 0'\D'l 0 1m'\D'l -\\n[par*bxw]u 0''\ \Z'\h'.2m'\\$1'\ \h'\\n[par*bxw]u' .. .\" The first time UX is used, put a registered mark after it. .ds par*ux-rg \(rg .de UX \s[\\n[.s]*8u/10u]UNIX\s0\\$1\\*[par*ux-rg] .ds par*ux-rg .. .ds par@sup-start \v'-.9m\s'\En[.s]*7u/10u'+.7m' .als { par@sup-start .ds par@sup-end \v'-.7m\s0+.9m' .als } par@sup-end .\" footnote paragraphs .\" FF is the footnote format .nr FF 0 .\" This can be redefined. It gets a second argument of `no' if the first .\" argument was supplied by the user, rather than automatically. .de FP .br .if !d par*fp!\\n[FF] \{\ . @error unknown footnote format `\\n[FF]' . 
nr FF 0 .\} .ie '\\$2'no' .par*fp!\\n[FF]-no "\\$1" .el .par*fp!\\n[FF] "\\$1" .. .de par*fp!0 .@PP \&\\*[par@sup-start]\\$1\\*[par@sup-end]\ \c .. .de par*fp!0-no .@PP \&\\$1\ \c .. .de par*fp!1 .@PP \&\\$1.\ \c .. .de par*fp!1-no .@PP \&\\$1\ \c .. .de par*fp!2 .@LP \&\\$1.\ \c .. .de par*fp!2-no .@LP \&\\$1\ \c .. .de par*fp!3 .@IP "\\$1." (u;\\n[\\n[.ev]:PI]*2) .. .de par*fp!3-no .@IP "\\$1" (u;\\n[\\n[.ev]:PI]*2) .. .\" *************************** .\" ******** module nh ******** .\" *************************** .\" Numbered headings. .\" nh*hl is the level of the last heading .nr nh*hl 0 .\" numbered heading .de @NH .ie '\\$1'S' \{\ . shift . nr nh*hl 0 . while \\n[.$] \{\ . nr nh*hl +1 . nr H\\n[nh*hl] 0\\$1 . shift . \} . if !\\n[nh*hl] \{\ . nr H1 1 . nr nh*hl 1 . @error missing arguments to .NH S . \} .\} .el \{\ . nr nh*ohl \\n[nh*hl] . ie \\n[.$] \{\ . nr nh*hl 0\\$1 . ie \\n[nh*hl]<=0 \{\ . nr nh*ohl 0 . nr nh*hl 1 . \} . el \{\ . if \\n[nh*hl]-\\n[nh*ohl]>1 \ . @warning .NH \\n[nh*ohl] followed by .NH \\n[nh*hl] . \} . \} . el .nr nh*hl 1 . while \\n[nh*hl]>\\n[nh*ohl] \{\ . nr nh*ohl +1 . nr H\\n[nh*ohl] 0 . \} . nr H\\n[nh*hl] +1 .\} .ds SN .nr nh*i 0 .while \\n[nh*i]<\\n[nh*hl] \{\ . nr nh*i +1 . as SN \\n[H\\n[nh*i]]. .\} .SH .if \\n[VARPS] \{\ . ps \\n[PS]+2 . ne 3 .\} \\*[SN] .. .de VARPS .nr VARPS 1 .. .\" **************************** .\" ******** module toc ******** .\" **************************** .\" Table of contents generation. .de XS .da toc*div .ev h .par@reset .fi .ie \\n[.$] .XA "\\$1" .el .XA .. .de @div-end!toc*div .XE .. .de XA .ie '\\n(.z'toc*div' \{\ . if d toc*num .toc*end-entry . ie \\n[.$] \{\ . ie '\\$1'no' .ds toc*num . el .ds toc*num "\\$1 . \} . el .ds toc*num \\n[PN] . in (n;0\\$2) .\} .el .@error XA without XS .. .de XE .ie '\\n(.z'toc*div' \{\ . if d toc*num .toc*end-entry . ev . di .\} .el .@error XS without XE .. .de toc*end-entry \\a\\t\\*[toc*num] .br .rm toc*num .. .de PX .1C .if !'\\$1'no' \{\ . ce 1 . ps \\n[PS]+2 . ft 3 \\*[TOC] . ft . ps .\} .nf .char \[toc*leader-char] .\h'1m' .lc \[toc*leader-char] .ta (u;\\n[.l]-\\n[.i]-\w'000') (u;\\n[.l]-\\n[.i])R .sp 2 .toc*div .par@reset .. .\" print the table of contents on page i .de TC .P1 .pg@begin 1 i .PX \\$1 .. .\" **************************** .\" ******** module eqn ******** .\" **************************** .\" Eqn support. .de EQ .. .de EN .. .de @EQ .br .ds eqn*num "\\$2 .ie '\\$1'L' .nr eqn*type 0 .el \{\ . ie '\\$1'I' .nr eqn*type 1 . el \{\ . nr eqn*type 2 . if !'\\$1'C' .ds eqn*num "\\$1 . \} .\} .di eqn*div .in 0 .nf .. .de @div-end!eqn*div .@EN .. .\" Note that geqn mark and lineup work correctly in centered equations. .de @EN .ie !'\\n(.z'eqn*div' .@error-recover mismatched EN .el \{\ . br . di . nr eqn*have-num 0 . if !'\\*[eqn*num]'' .nr eqn*have-num 1 . if \\n[dl]:\\n[eqn*have-num] \{\ . sp \\n[DD]u . par@reset . ds eqn*tabs \\n[.tabs] . nf . ie \\n[dl] \{\ . ds@need \\n[dn]u-1v+\n[.V]u . chop eqn*div . ie \\n[eqn*type]=0 \{\ . ta (u;\\n[.l]-\\n[.i])R \\*[eqn*div]\t\\*[eqn*num] . \} . el \{\ . ie \\n[eqn*type]=1 .ta \\n[DI]u \ (u;\\n[.l]-\\n[.i])R . el .ta (u;\\n[.l]-\\n[.i]/2)C \ (u;\\n[.l]-\\n[.i])R \t\\*[eqn*div]\t\\*[eqn*num] . \} . \} . el \{\ . ta (u;\\n[.l]-\\n[.i])R \t\\*[eqn*num] . \} . sp \\n[DD]u . fi . ta \\*[eqn*tabs] . \} .\} .. .\" **************************** .\" ******** module tbl ******** .\" **************************** .\" Tbl support. .nr tbl*have-header 0 .de TS .\" The break is necessary in the case where the first page has not yet begun. 
.br .sp \\n[DD]u .if '\\$1'H' .di tbl*header-div .. .de tbl@top-hook .if \\n[tbl*have-header] \{\ . ie \\n[.t]-\\n[tbl*header-ht]-1v .tbl*print-header . el .sp \\n[.t]u .\} .. .de tbl*print-header .ev nf .tbl*header-div .ev .mk #T .. .de TH .ie '\\n[.z]'tbl*header-div' \{\ . nr T. 0 . T# . br . di . ie \\n[dn]+\\n[FM]+\\n[HM]+2v>=\\n[.p] \{\ . @error ridiculously long table header . ds@need \\n[dn] . tbl*print-header . \} . el \{\ . nr tbl*header-ht \\n[dn] . ds@need \\n[dn]u+1v . tbl*print-header . nr tbl*have-header 1 . \} .\} .el .@error-recover .TH without .TS H .. .de @div-end!tbl*header-div .TH .TE .. .de TE .ie '\\n(.z'tbl*header-div' .@error-recover .TS H but no .TH before .TE .el \{\ . nr tbl*have-header 0 . sp \\n[DD]u .\} .\" reset tabs .TA .. .de tbl@bottom-hook .if \\n[tbl*have-header] \{\ . nr T. 1 . T# .\} .. .de T& .. .\" **************************** .\" ******** module pic ******** .\" **************************** .\" Pic support. .\" PS height width .de PS .br .sp \\n[DD]u .ie \\n[.$]<2 .@error bad arguments to PS (not preprocessed with pic?) .el \{\ . ds@need (u;\\$1)+1v . in +(u;\\n[.l]-\\n[.i]-\\$2/2>?0) .\} .. .de PE .par@reset .sp \\n[DD]u+.5m .. .\" **************************** .\" ******** module ref ******** .\" **************************** .\" Refer support. .de ]- .rm [A [B [C [D [E [G [I [J [N [O [P [Q [R [S [T [V .rm ref*string .. .\" Other .ds ref*spec!0 Q A T S V N P I C D O .\" Journal article .ds ref*spec!1 Q A T J S V N P I C D O .\" Book .ds ref*spec!2 Q A T S V P I C D O .\" Article within book .ds ref*spec!3 Q A T B E S V P I C D O .\" Tech report .ds ref*spec!4 Q A T R G P I C D O .\" ][ type .de ][ .ie d ref*spec!\\$1 .ref*build \\*[ref*spec!\\$1] .el \{\ . @error unknown reference type `\\$1' . ref*build \\*[ref*spec!0] .\} .ref*print .rm ref*string .rm [F .. .\" start of reference number .ds [. \\*[par@sup-start] .\" end of reference number .ds .] \\*[par@sup-end] .\" period before reference .ds <. . .\" period after reference .ds >. \" empty .\" comma before reference .ds <, , .\" comma after reference .ds >, \" empty .\" start collected references .de ]< .als ref*print ref*end-print .SH \&\\*[REFERENCES] .par@reset .. .\" end collected references .de ]> .par@finish .als ref*print ref*normal-print .. .de ref*normal-print .ie d [F .FS "\\*([.\\*([F\\*(.]" .el .FS \& \\*[ref*string] .FE .. .de ref*end-print .ie d [F .IP "\\*([F." .el .XP \\*[ref*string] .. .als ref*print ref*normal-print .de ref*build .rm ref*string ref*post-punct .nr ref*suppress-period 1 .while \\n[.$] \{\ . if d [\\$1 \{\ . ie d ref*add-\\$1 .ref*add-\\$1 . el .ref*add-dflt \\$1 . \} . shift .\} .\" now add a final period .ie d ref*string \{\ . if !\\n[ref*suppress-period] .as ref*string . . if d ref*post-punct \{\ . as ref*string "\\*[ref*post-punct] . rm ref*post-punct . \} .\} .el .ds ref*string .. .de ref*add-T .ref*field T , "\\*Q" "" "\\*U" .if r [T .nr ref*suppress-period \\n([T .. .de ref*add-P .ie \\n([P>0 .ref*field P , "pp. " .el .ref*field P , "p. " .. .de ref*add-J .ref*field J , \f2 "" \fP .. .de ref*add-D .ref*field D "" ( ) .. .de ref*add-E .ref*field E , "ed. " .. .de ref*add-G .ref*field G "" ( ) .. .de ref*add-B .ref*field B "" "in \f2" "" \fP .. .de ref*add-O .ref*field O . .ie r [O .nr ref*suppress-period \\n([O .el .nr ref*suppress-period 1 .. .de ref*add-A .ref*field A , .if r [A .nr ref*suppress-period \\n([A .. .de ref*add-dflt .ref*field \\$1 , .. .\" First argument is the field letter. 
.\" Second argument is the punctuation character to use to separate this field .\" from the previous field. .\" Third argument is a string with which to prefix this field. .\" Fourth argument is a string with which to postfix this field. .\" Fifth argument is a string to add after the punctuation character supplied .\" by the next field. .de ref*field .if d ref*string \{\ . ie d ref*post-punct \{\ . as ref*string "\\$2\\*[ref*post-punct] \" . rm ref*post-punct . \} . el .as ref*string "\\$2 \" .\} .as ref*string "\\$3\\*([\\$1\\$4 .if \\n[.$]>4 .ds ref*post-punct "\\$5 .nr ref*suppress-period 0 .. .\" **************************** .\" ******** module acc ******** .\" **************************** .\" Accents and special characters. .ds Q \)``\) .ds U \)''\) .ds - \(em .\" Characters .if !c\(rg .char \(rg (R) .if !c\(ah .char \(ah \v'-.55m'\s[\En[.s]/2u]v\s0\v'.55m' .if !c\(ad .char \(ad \v'-.55m'\s[\En[.s]*7u/10u].\h'.05m'.\s0\v'.55m' .if !c\(a- .char \(a- \v'-.55m'\D'l .25m 0'\v'.55m' .if !c\(ao .char \(ao \v'-.55m'\s[\En[.s]*6u/10u]\D'c .25m'\s0\v'.55m' .if !c\(ac .char \(ac \s[\En[.s]*8u/10u]\v'.05m',\v'-.05m'\s0 .if !c\(ho .char \(ho \s[\En[.s]/2u]\v'.4m'c\v'-.4m'\s0 .if !c\(-D .char \(-D \Z'\v'-.1m'-'D .if !c\(Sd .char \(Sd \Z'\v'-.3m'\h'.2m'-'\(pd .if !c\(TP .char \(TP I\h'-.25m'\v'-.33m'\s[\En[.s]*6u/10u]\v'.33m'D\ \v'-.33m'\s0\v'.33m' .if !c\(Tp .char \(Tp \zlp .if !c\(ss .char \(ss \(*b .if !c\(AE .char \(AE A\h'-.3m'E .if !c\(ae .char \(ae a\h'-.19m'e .if !c\(OE .char \(OE O\h'-.25m'E .if !c\(oe .char \(oe o\h'-.14m'e .if !c\(r? .char \(r? \Z'\h'.1m'\v'-.15m'\s[\En[.s]*7u/10u]i\s0\v'.15m''\ \v'.15m'\s[\En[.s]*7u/10u]c\s0\v'-.15m' .if !c\(r! .char \(r! \h'.1m'\Z'\v'-.4m'\s[\En[.s]*8u/10u].\s0\v'.4m''\ \s[\En[.s]*8u/10u]\v'.4m'\(or\v'-.4m'\s0\h'.1m' .\" The idea of this definition is for the top of the 3 to be at the x-height. .\" A yogh really ought to have a little line going north-west from the top .\" left of the 3. .if !c\[yogh] .char \[yogh] \Z'\v'\w'x'*0-\En[rst]u'\s[\En[.s]*8u/10u]\ \v'\w'3'*0+\En[rst]u'3\s0'\h'\w'\s[\En[.s]*8u/10u]3'u' .\" Accents .de acc*over-def .ds \\$1 \Z'\v'(u;\w'x'*0+\En[rst]-\En[.cht])'\ \h'(u;-\En[skw]+(-\En[.w]-\w'\\$2'/2)+\En[.csk])'\\$2' .. .de acc*under-def .ds \\$1 \Z'\v'\En[.cdp]u'\h'(u;-\En[.w]-\w'\\$2'/2)'\\$2' .. .de acc*slash-def .ds \\$1 \Z'\h'(u;-\En[.w]-\w'\\$2'/2)'\ \v'(u;\En[.cdp]-\En[.cht]+\En[rst]+\En[rsb]/2)'\\$2' .. .de acc*prefix-def .ds \\$1 \Z'\h'(u;\w'x'-\w'\\$2'/2)'\\$2' .. .acc*prefix-def ' \' .acc*prefix-def ` \` .acc*prefix-def ^ ^ .acc*prefix-def , \(ac .acc*prefix-def : \(ad .acc*prefix-def ~ ~ .\" improved accent marks .de AM .acc*over-def ' \' .acc*over-def ` \` .acc*over-def ^ ^ .acc*over-def ~ ~ .acc*over-def : \(ad .acc*over-def v \(ah .acc*over-def _ \(a- .acc*over-def o \(ao .acc*under-def , \(ac .acc*under-def . \s[\En[.s]*8u/10u]\v'.2m'.\v'-.2m'\s0 .acc*under-def hook \(ho .acc*slash-def / / .char \[hooko] o\\\\*[hook] .ds q \[hooko] .ds 3 \[yogh] .ds D- \(-D\" Icelandic uppercase eth .ds d- \(Sd\" Icelandic lowercase eth .ds Th \(TP\" Icelandic uppercase thorn .ds th \(Tp\" Icelandic lowercase thorn .ds 8 \(ss\" German double s .ds Ae \(AE\" AE ligature .ds ae \(ae\" ae ligature .ds Oe \(OE\" OE ligature .ds oe \(oe\" oe ligature .ds ? \(r?\" upside down ? .ds ! \(r!\" upside down ! .. .\" Make sure that no blank lines creep in at the end of this file. 
�������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/usenix.ol������������������������������������������������������������������������0000664�0000764�0000764�00000006372�07045412511�015610� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������Introduction What is it? A bunch of speed of light benchmarks, not MP, not throughput, not saturation, not stress tests. A microbenchmark suite Measures system performance Latency and bandwidth measurements Measurements focus on OS and hardware What is delivered to the application Not marketing numbers Benchmark performance predicts application performance Results for which systems? Sun, SGI, DEC, IBM, HP, PCs Useful information to whom? Performance engineers, system programmers, system architects. Motivation What are we measuring? Control / latecy operatins Bandwidth operations What aren't we measuring? Basic MIPS & MFLOPS. XXX - not unless I do it right. What can I learn? Cost of operations ****Operations per time unit**** Compare speed of alternative paths (e.g. mmap vs. read) Performance problems = f(bw issues + latency issues) Give at least two examples NFS control & data: UDP lat, proc lat, & various BW metrics Oracle lock manager: TCP lat Verilog: mem lat AIM: fs ops XXX -ask Scott about pipes. Knowing the speeds of primitives can provide speeds of apps. An example here would be nice. Outline Describe benchmark Give results from current machines Discuss results Future changes, enhancements, etc. Tutorial on benchmarks For each metric what is it? why is it being measured? How is it measured? Measuring subtlities Interpreting the results Latency Process stuff networking stuff file system stuff memory stuff whatever Bandwidth networking file system memory Results Tabular results - XXX update that table to reflect the newer metrics Graphs of memory latency & context switches Discussion Memory stuff Maybe contrast AIX with the $100K IBM uniprocessor w/ killer memory perf and point out that it is the memory that is making AIX go fast, it certainly isn't AIX. A more politic observation would be that systems with good memory performace tend to have good system performance; the point being to shift people's attention to system performance, especially memory subsystem, as opposed to processor mips. Comparisons Maybe look at the table and draw attention to really good and really bad numbers for various platforms (like Linux' context switch time, Linux fs ops, solaris syscall, process stuff, 990 memory BW). Graphs A graph showing a range of really fast to really slow ops, all on the same graph. Do bandwidth stuff normalized on MB/sec. Carl sez: show both ops/sec and cost/op on two graphs. A graph showing processor slow down due to memory misses, assuming each instruction misses. Maybe a graph that shows # of clocks (or better yet, # of instructions - think super scalar) that you would have to have between each memory miss in order to run at the clock speed. War stories Sun page coloring bug SGI page coloring bug SGI hippi bug - XXX ask Thomas Sun bcopy bug Lmbench [optional?] 
        how to get lmbench
        how to compile
        how to run
        how to show results
Future work
        More hardware stuff - better latency measurements (write lat, cache to cache latency).
        add throughput & saturation measurements
TODO
        get some similar papers for comparison
        Someday I need reasonable I/O benchmarks to show off good big SMP machines like Challenge.
lmbench-3.0-a9/doc/usenix96.ms0000664000076400007640000022546007045412511015775 0ustar staelinstaelin.\" This document is GNU groff -mgs -t -p -R -s
.\" It will not print with normal troffs, it uses groff features, in particular,
.\" long names for registers & strings.
.\" Deal with it and use groff - it makes things portable.
.\"
.\" $X$ xroff -mgs -t -p -R -s $file
.\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more
.\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr
.VARPS
.\" Define a page top that looks cool
.\" HELLO CARL!  To turn this off, s/PT/oldPT/
.de draftPT
.\" .tl '\fBDRAFT\fP'Printed \\*(DY'\fBDRAFT\fP'
..
.de PT
.if \\n%>1 \{\
. sp -.1i
. ps 14
. ft 3
. nr big 24
. nr space \\w'XXX'
. nr titlewid \\w'\\*[title]'
. nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2
. ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25'
. ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0
. ce 1
\\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar]
. ps
. sp -.70
. ps 12
\\l'\\n[LL]u'
. ft
. ps
.\}
..
.\" Define a page bottom that looks cool
.\" HELLO CARL!  To turn this off, s/BT/oldBT/
.de draftBT
.\" .tl '\fBDRAFT\fP'Page %'\fBDRAFT\fP'
..
.de BT
. ps 9
\v'-1'\\l'\\n(LLu'
. sp -1
. tl '\(co 1995 \\*[author]'\\*(DY'%'
. ps
..
.de SP
. if t .sp .5
. if n .sp 1
..
.de BU
. SP
. ne 2
\(bu\ 
. if \\n[.$] \fB\\$1\fP\\$2
..
.nr FIGURE 0
.nr TABLE 0
.nr SMALL .25i
.de TSTART
. KF
. if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0
. ps -1
. vs -1
..
.de TEND
. ps +1
. vs +1
. if \\n[.$]=2 \{\
. sp -.5
\s(24\\l'\\n[pg@colw]u'\s0 \}
. sp .25
. nr TABLE \\n[TABLE]+1
. ce 1
\fBTable \\n[TABLE].\ \ \\$1\fP
. SP
. KE
..
.de FEND
. ps +1
. vs +1
. if \\n[.$]=2 \{\
. sp -.5
\s(24\\l'\\n[pg@colw]u'\s0 \}
. sp .25
. nr FIGURE \\n[FIGURE]+1
. ce 1
\fBFigure \\n[FIGURE].\ \ \\$1\fP
. SP
. KE
..
.\" Configuration .nr PI 3n .nr HM .95i .nr FM 1i .nr PO .95i .if t .po .95i .nr LL 6.5i .if n .nr PO 0i .if n .nr LL 7.75i .nr PS 10 .nr VS \n(PS+1 .ds title Portable tools for performance analysis .ds author Larry McVoy .ds lmbench \f(CWlmbench\fP .ds lmdd \f(CWlmdd\fP .ds bcopy \f(CWbcopy\fP .ds connect \f(CWconnect\fP .ds execlp \f(CWexeclp\fP .ds exit \f(CWexit\fP .ds fork \f(CWfork\fP .ds gcc \f(CWgcc\fP .ds getpid \f(CWgetpid\fP .ds getpid \f(CWgetpid\fP .ds gettimeofday \f(CWgettimeofday\fP .ds kill \f(CWkill\fP .ds memmove \f(CWmemmove\fP .ds mmap \f(CWmmap\fP .ds popen \f(CWpopen\fP .ds read \f(CWread\fP .ds stream \f(CWstream\fP .ds system \f(CWsystem\fP .ds uiomove \f(CWuiomove\fP .ds write \f(CWwrite\fP .ds yield \f(CWyield\fP .\" References stuff .de RN \"Reference Name: .RN $1 -- prints the reference prettily .\"[\s-2\\$1\s+2]\\$2 [\s-1\\$1\s0]\\$2 .. .\" .R1 .\" sort A+DT .\" database references .\" label-in-text .\" label A.nD.y-2 .\" bracket-label \*([. \*(.] ", " .\" .R2 .TL \s(14lmbench: Portable tools for performance analysis\s0\** .AU \s+2\fR\*[author]\fP\s0 .AI \fI\s+2Silicon Graphics, Inc.\s0\fP .AU \s+2\fRCarl Staelin\fP .AI \s+2\fIHewlett-Packard Laboratories\s0\fP .SP .AB \*[lmbench] is a micro-benchmark suite designed to focus attention on the basic building blocks of many common system applications, such as databases, simulations, software development, and networking. In almost all cases, the individual tests are the result of analysis and isolation of a customer's actual performance problem. .\" .SP These tools can be, and currently are, used to compare different system implementations from different vendors. In several cases, the benchmarks have uncovered previously unknown bugs and design flaws. The results have shown a strong correlation between memory system performance and overall performance. .\" XXX - MP versus uniprocessors? \*[lmbench] includes an extensible database of results from systems current as of late 1995. .AE .if t .MC 3.05i .FS This paper first appeared in the January 1996 Usenix conference proceedings. The version you are reading has new results as well as some corrections. .FE .NH 1 Introduction .PP \*[lmbench] provides a suite of benchmarks that attempt to measure the most commonly found performance bottlenecks in a wide range of system applications. These bottlenecks have been identified, isolated, and reproduced in a set of small micro-benchmarks, which measure system latency and bandwidth of data movement among the processor and memory, network, file system, and disk. The intent is to produce numbers that real applications can reproduce, rather than the frequently quoted and somewhat less reproducible marketing performance numbers. .PP The benchmarks focus on latency and bandwidth because performance issues are usually caused by latency problems, bandwidth problems, or some combination of the two. Each benchmark exists because it captures some unique performance problem present in one or more important applications. For example, the TCP latency benchmark is an accurate predictor of the Oracle distributed lock manager's performance, the memory latency benchmark gives a strong indication of Verilog simulation performance, and the file system latency benchmark models a critical path in software development. .PP \*[lmbench] was developed to identify and evaluate system performance bottlenecks present in many machines in 1993-1995. 
It is entirely possible that computer architectures will have changed and advanced enough in the next few years to render parts of this benchmark suite obsolete or irrelevant. .PP \*[lmbench] is already in widespread use at many sites by both end users and system designers. In some cases, \*[lmbench] has provided the data necessary to discover and correct critical performance problems that might have gone unnoticed. \*[lmbench] uncovered a problem in Sun's memory management software that made all pages map to the same location in the cache, effectively turning a 512 kilobyte (K) cache into a 4K cache. .PP \*[lmbench] measures only a system's ability to transfer data between processor, cache, memory, network, and disk. It does not measure other parts of the system, such as the graphics subsystem, nor is it a MIPS, MFLOPS, throughput, saturation, stress, graphics, or multiprocessor test suite. It is frequently run on multiprocessor (MP) systems to compare their performance against uniprocessor systems, but it does not take advantage of any multiprocessor features. .PP The benchmarks are written using standard, portable system interfaces and facilities commonly used by applications, so \*[lmbench] is portable and comparable over a wide set of Unix systems. \*[lmbench] has been run on AIX, BSDI, HP-UX, IRIX, Linux, FreeBSD, NetBSD, OSF/1, Solaris, and SunOS. Part of the suite has been run on Windows/NT as well. .PP \*[lmbench] is freely distributed under the Free Software Foundation's General Public License .RN Stallman89 , with the additional restriction that results may be reported only if the benchmarks are unmodified. .NH 1 Prior work .PP Benchmarking and performance analysis is not a new endeavor. There are too many other benchmark suites to list all of them here. We compare \*[lmbench] to a set of similar benchmarks. .BU "I/O (disk) benchmarks" : IOstone .RN Park90 wants to be an I/O benchmark, but actually measures the memory subsystem; all of the tests fit easily in the cache. IObench .RN Wolman89 is a systematic file system and disk benchmark, but it is complicated and unwieldy. In .RN McVoy91 we reviewed many I/O benchmarks and found them all lacking because they took too long to run and were too complex a solution to a fairly simple problem. We wrote a small, simple I/O benchmark, \*[lmdd] that measures sequential and random I/O far faster than either IOstone or IObench. As part of .RN McVoy91 the results from \*[lmdd] were checked against IObench (as well as some other Sun internal I/O benchmarks). \*[lmdd] proved to be more accurate than any of the other benchmarks. At least one disk vendor routinely uses \*[lmdd] to do performance testing of its disk drives. .SP Chen and Patterson .RN "Chen93, Chen94" measure I/O performance under a variety of workloads that are automatically varied to test the range of the system's performance. Our efforts differ in that we are more interested in the CPU overhead of a single request, rather than the capacity of the system as a whole. .BU "Berkeley Software Distribution's microbench suite" : The BSD effort generated an extensive set of test benchmarks to do regression testing (both quality and performance) of the BSD releases. We did not use this as a basis for our work (although we used ideas) for the following reasons: (a) missing tests \(em such as memory latency, (b) too many tests, the results tended to be obscured under a mountain of numbers, and (c) wrong copyright \(em we wanted the Free Software Foundation's General Public License. 
.BU "Ousterhout's Operating System benchmark" : .RN Ousterhout90 proposes several system benchmarks to measure system call latency, context switch time, and file system performance. We used the same ideas as a basis for our work, while trying to go farther. We measured a more complete set of primitives, including some hardware measurements; went into greater depth on some of the tests, such as context switching; and went to great lengths to make the benchmark portable and extensible. .BU "Networking benchmarks" : \f(CWNetperf\fP measures networking bandwidth and latency and was written by Rick Jones of Hewlett-Packard. \*[lmbench] includes a smaller, less complex benchmark that produces similar results. .SP \f(CWttcp\fP is a widely used benchmark in the Internet community. Our version of the same benchmark routinely delivers bandwidth numbers that are within 2% of the numbers quoted by \f(CWttcp\fP. .BU "McCalpin's stream benchmark" : .RN McCalpin95 has memory bandwidth measurements and results for a large number of high-end systems. We did not use these because we discovered them only after we had results using our versions. We will probably include McCalpin's benchmarks in \*[lmbench] in the future. .PP In summary, we rolled our own because we wanted simple, portable benchmarks that accurately measured a wide variety of operations that we consider crucial to performance on today's systems. While portions of other benchmark suites include similar work, none includes all of it, few are as portable, and almost all are far more complex. Less filling, tastes great. .NH 1 Benchmarking notes .NH 2 Sizing the benchmarks .PP The proper sizing of various benchmark parameters is crucial to ensure that the benchmark is measuring the right component of system performance. For example, memory-to-memory copy speeds are dramatically affected by the location of the data: if the size parameter is too small so the data is in a cache, then the performance may be as much as ten times faster than if the data is in memory. On the other hand, if the memory size parameter is too big so the data is paged to disk, then performance may be slowed to such an extent that the benchmark seems to `never finish.' .PP \*[lmbench] takes the following approach to the cache and memory size issues: .BU All of the benchmarks that could be affected by cache size are run in a loop, with increasing sizes (typically powers of two) until some maximum size is reached. The results may then be plotted to see where the benchmark no longer fits in the cache. .BU The benchmark verifies that there is sufficient memory to run all of the benchmarks in main memory. A small test program allocates as much memory as it can, clears the memory, and then strides through that memory a page at a time, timing each reference. If any reference takes more than a few microseconds, the page is no longer in memory. The test program starts small and works forward until either enough memory is seen as present or the memory limit is reached. .NH 2 Compile time issues .PP The GNU C compiler, \*[gcc], is the compiler we chose because it gave the most reproducible results across platforms. When \*[gcc] was not present, we used the vendor-supplied \f(CWcc\fP. All of the benchmarks were compiled with optimization \f(CW-O\fP except the benchmarks that calculate clock speed and the context switch times, which must be compiled without optimization in order to produce correct results. 
No other optimization flags were enabled because we wanted results that would be commonly seen by application writers. .PP All of the benchmarks were linked using the default manner of the target system. For most if not all systems, the binaries were linked using shared libraries. .NH 2 Multiprocessor issues .PP All of the multiprocessor systems ran the benchmarks in the same way as the uniprocessor systems. Some systems allow users to pin processes to a particular CPU, which sometimes results in better cache reuse. We do not pin processes because it defeats the MP scheduler. .\" XXX - I should do this on an IP19 and mark it as pinned. In certain cases, this decision yields interesting results discussed later. .NH 2 Timing issues .LP .sp -.5 .BU "Clock resolution" : The benchmarks measure the elapsed time by reading the system clock via the \*[gettimeofday] interface. On some systems this interface has a resolution of 10 milliseconds, a long time relative to many of the benchmarks which have results measured in tens to hundreds of microseconds. To compensate for the coarse clock resolution, the benchmarks are hand-tuned to measure many operations within a single time interval lasting for many clock ticks. Typically, this is done by executing the operation in a small loop, sometimes unrolled if the operation is exceedingly fast, and then dividing the loop time by the loop count. .BU Caching : If the benchmark expects the data to be in the cache, the benchmark is typically run several times; only the last result is recorded. .SP If the benchmark does not want to measure cache performance it sets the size parameter larger than the cache. For example, the \*[bcopy] benchmark by default copies 8 megabytes to 8 megabytes, which largely defeats any second-level cache in use today. (Note that the benchmarks are not trying to defeat the file or process page cache, only the hardware caches.) .br .di bigtable .ev keep .ps 8 .vs 9 .so systems.tbl .ps \n[PS] .vs \n[VS] .nr TABLE \n[TABLE]+1 .ce 1 .SP \fBTable \n[TABLE].\ \ System descriptions.\fP .SP .di .ev .nr WHEN \n[dn]+\n[FM] .nr THT \n[dn] .de print*table ' sp .5 ' ev keep ' nf ' bigtable . ne 1 . wh -\n[WHEN]u skip*page . fi . ev .. .de skip*page ' sp \n[THT]u . wh -\n[WHEN]u .. .wh -\n[WHEN]u print*table .BU Variability : The results of some benchmarks, most notably the context switch benchmark, had a tendency to vary quite a bit, up to 30%. We suspect that the operating system is not using the same set of physical pages each time a process is created and we are seeing the effects of collisions in the external caches. We compensate by running the benchmark in a loop and taking the minimum result. Users interested in the most accurate data are advised to verify the results on their own platforms. .PP Many of the results included in the database were donated by users and were not created by the authors. Good benchmarking practice suggests that one should run the benchmarks as the only user of a machine, without other resource intensive or unpredictable processes or daemons. .NH 2 Using the \f(CBlmbench\fP database .PP \*[lmbench] includes a database of results that is useful for comparison purposes. It is quite easy to build the source, run the benchmark, and produce a table of results that includes the run. All of the tables in this paper were produced from the database included in \*[lmbench]. This paper is also included with \*[lmbench] and may be reproduced incorporating new results. 
For more information, consult the file
\f(CWlmbench-HOWTO\fP
in the \*[lmbench] distribution.
.NH 1
Systems tested
.PP
\*[lmbench] has been run on a wide variety of platforms.
This paper includes results from a representative subset of machines
and operating systems.
Comparisons between similar hardware running different operating
systems can be very illuminating, and we have included a few examples
in our results.
.PP
The systems are briefly characterized in Table 1.
Please note that the list prices are very approximate, as is the year
of introduction.
The SPECInt92 numbers are a little suspect since some vendors have
been ``optimizing'' for certain parts of SPEC.
We try to quote the original SPECInt92 numbers where we can.
.NH 2
Reading the result tables
.PP
Throughout the rest of this paper, we present tables of results for
many of the benchmarks.
All of the tables are sorted, from best to worst.
Some tables have multiple columns of results and those tables are
sorted on only one of the columns.
The sorted column's heading will be in \fBbold\fP.
.NH 1
Bandwidth benchmarks
.PP
By bandwidth, we mean the rate at which a particular facility can move
data.
We attempt to measure the data movement ability of a number of
different facilities:
library \*[bcopy],
hand-unrolled \*[bcopy],
direct-memory read and write (no copying),
pipes,
TCP sockets,
the \*[read] interface,
and the \*[mmap] interface.
.NH 2
Memory bandwidth
.PP
Data movement is fundamental to any operating system.
In the past, performance was frequently measured in MFLOPS because
floating point units were slow enough that microprocessor systems were
rarely limited by memory bandwidth.
Today, floating point units are usually much faster than memory
bandwidth, so many current MFLOP ratings can not be maintained using
memory-resident data; they are ``cache only'' ratings.
.PP
We measure the ability to copy, read, and write data over a varying
set of sizes.
There are too many results to report all of them here, so we
concentrate on large memory transfers.
.PP
We measure copy bandwidth two ways.
The first is the user-level library \*[bcopy] interface.
The second is a hand-unrolled loop that loads and stores aligned
8-byte words.
In both cases, we took care to ensure that the source and destination
locations would not map to the same lines if any of the caches were
direct-mapped.
In order to test memory bandwidth rather than cache bandwidth, both
benchmarks copy an 8M\** area to another 8M area.
(As secondary caches reach 16M, these benchmarks will have to be
resized to reduce caching effects.)
.FS
Some of the PCs had less than 16M of available memory; those machines
copied 4M.
.FE
.PP
The copy results actually represent one-half to one-third of the
memory bandwidth used to obtain those results since we are reading and
writing memory.
If the cache line size is larger than the word stored, then the
written cache line will typically be read before it is written.
The actual amount of memory bandwidth used varies because some
architectures have special instructions specifically designed for the
\*[bcopy] function.
Those architectures will move twice as much memory as reported by this
benchmark; less advanced architectures move three times as much
memory: the memory read, the memory read because it is about to be
overwritten, and the memory written.
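.PP
As a rough illustration (a simplified sketch, not the \*[lmbench]
source), the hand-unrolled copy can be pictured as the following
fragment, which copies one 8M buffer to another in aligned 8-byte
words and reports the copy rate using \*[gettimeofday]; error checks
are omitted:
.DS
.nf
.ps -2
.vs -2
.ft CW
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (8<<20) /* 8M each way */

int
main(void)
{
    double *src = malloc(SIZE);
    double *dst = malloc(SIZE);
    double *s = src, *d = dst;
    double *end = src + SIZE/8;
    struct timeval b, e;
    double usecs;

    memset(src, 1, SIZE); /* touch */
    memset(dst, 1, SIZE);
    gettimeofday(&b, 0);
    for (; s < end; s += 4, d += 4) {
        d[0] = s[0]; d[1] = s[1];
        d[2] = s[2]; d[3] = s[3];
    }
    gettimeofday(&e, 0);
    usecs = (e.tv_sec - b.tv_sec) * 1e6
        + (e.tv_usec - b.tv_usec);
    printf("%.2f MB/sec\en",
        (SIZE/1048576.)/(usecs/1e6));
    return (0);
}
.ft
.vs
.ps
.fi
.DE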
.PP
The \*[bcopy] results reported in Table 2 may be correlated with John
McCalpin's \*[stream]
.RN McCalpin95
benchmark results in the following manner: the \*[stream] benchmark
reports all of the memory moved whereas the \*[bcopy] benchmark
reports the bytes copied.
So our numbers should be approximately one-half to one-third of his
numbers.
.PP
Memory reading is measured by an unrolled loop that sums up a series
of integers.
On most (perhaps all) systems measured the integer size is 4 bytes.
The loop is unrolled such that most compilers generate code that uses
a constant offset with the load, resulting in a load and an add for
each word of memory.
The add is an integer add that completes in one cycle on all of the
processors.
Given that today's processors typically cycle at 10 or fewer
nanoseconds (ns) and that memory is typically 200-1,000 ns per cache
line, the results reported here should be dominated by the memory
subsystem, not the processor add unit.
.PP
The memory contents are added up because almost all C compilers would
optimize out the whole loop when optimization was turned on, and would
generate far too many instructions without optimization.
The solution is to add up the data and pass the result as an unused
argument to the ``finish timing'' function.
.PP
Memory reads represent about one-third to one-half of the \*[bcopy]
work, and we expect that pure reads should run at roughly twice the
speed of \*[bcopy].
Exceptions to this rule should be studied, for exceptions indicate a
bug in the benchmarks, a problem in \*[bcopy], or some unusual
hardware.
.TSTART
.so ../Results/tmp/bw_allmem.tbl
.TEND "Memory bandwidth (MB/s)"
.PP
Memory writing is measured by an unrolled loop that stores a value
into an integer (typically a 4 byte integer) and then increments the
pointer.
The processor cost of each memory operation is approximately the same
as the cost in the read case.
.PP
The numbers reported in Table \n[TABLE] are not the raw hardware speed
in some cases.
The Power2\** is capable of up to 800M/sec read rates
.FS
Someone described this machine as a $1,000 processor on a $99,000
memory subsystem.
.FE
.RN McCalpin95
and HP PA RISC (and other prefetching) systems also do better if
higher levels of code optimization are used and/or the code is hand
tuned.
.PP
The Sun libc bcopy in Table \n[TABLE] is better because it uses a
hardware-specific bcopy routine that uses instructions new in SPARC V9
that were added specifically for memory movement.
.PP
The Pentium Pro read rate in Table \n[TABLE] is much higher than the
write rate because, according to Intel, the write transaction turns
into a read followed by a write to maintain cache consistency for MP
systems.
.NH 2
IPC bandwidth
.PP
Interprocess communication bandwidth is frequently a performance
issue.
Many Unix applications are composed of several processes communicating
through pipes or TCP sockets.
Examples include the \f(CWgroff\fP documentation system that prepared
this paper, the \f(CWX Window System\fP, remote file access, and
\f(CWWorld Wide Web\fP servers.
.PP
Unix pipes are an interprocess communication mechanism implemented as
a one-way byte stream.
Each end of the stream has an associated file descriptor; one is the
write descriptor and the other the read descriptor.
TCP sockets are similar to pipes except they are bidirectional and can
cross machine boundaries.
.PP
Pipe bandwidth is measured by creating two processes, a writer and a
reader, which transfer 50M of data in 64K transfers.
The transfer size was chosen so that the overhead of system calls and context switching would not dominate the benchmark time. The reader prints the timing results, which guarantees that all data has been moved before the timing is finished. .PP TCP bandwidth is measured similarly, except the data is transferred in 1M page aligned transfers instead of 64K transfers. If the TCP implementation supports it, the send and receive socket buffers are enlarged to 1M, instead of the default 4-60K. We have found that setting the transfer size equal to the socket buffer size produces the greatest throughput over the most implementations. .TSTART .so ../Results/tmp/bw_ipc.tbl .TEND "Pipe and local TCP bandwidth (MB/s)" .PP \*[bcopy] is important to this test because the pipe write/read is typically implemented as a \*[bcopy] into the kernel from the writer and then a \*[bcopy] from the kernel to the reader. Ideally, these results would be approximately one-half of the \*[bcopy] results. It is possible for the kernel \*[bcopy] to be faster than the C library \*[bcopy] since the kernel may have access to \*[bcopy] hardware unavailable to the C library. .PP It is interesting to compare pipes with TCP because the TCP benchmark is identical to the pipe benchmark except for the transport mechanism. Ideally, the TCP bandwidth would be as good as the pipe bandwidth. It is not widely known that the majority of the TCP cost is in the \*[bcopy], the checksum, and the network interface driver. The checksum and the driver may be safely eliminated in the loopback case and if the costs have been eliminated, then TCP should be just as fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to see that Solaris and HP-UX have done this optimization. .PP Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the pipe transfers are done in 64K buffers, a size that frequently fits in caches, while the bcopy is typically an 8M-to-8M copy, which does not fit in the cache. .PP In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than the SGI MP on pipe bandwidth because of caching effects - in the UP case, both processes share the cache; on the MP, each process is communicating with a different cache. .PP All of the TCP results in Table \n[TABLE] are in loopback mode \(em that is both ends of the socket are on the same machine. It was impossible to get remote networking results for all the machines included in this paper. We are interested in receiving more results for identical machines with a dedicated network connecting them. The results we have for over the wire TCP bandwidth are shown below. .TSTART .so tcp_bw.tbl .TEND "Remote TCP bandwidth (MB/s)" .PP The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. The SGI Hippi interface has hardware support for TCP checksums and the IRIX operating system uses virtual memory tricks to avoid copying data as much as possible. For larger transfers, SGI Hippi has reached 92MB/s over TCP. .PP 100baseT is looking quite competitive when compared to FDDI in Table \n[TABLE], even though FDDI has packets that are almost three times larger. We wonder how long it will be before we see gigabit ethernet interfaces. .NH 2 Cached I/O bandwidth .PP Experience has shown us that reusing data in the file system page cache can be a performance issue. This section measures that operation through two interfaces, \*[read] and \*[mmap]. The benchmark here is not an I/O benchmark in that no disk activity is involved. 
We wanted to measure the overhead of reusing data, an overhead that is CPU intensive, rather than disk intensive. .PP The \*[read] interface copies data from the kernel's file system page cache into the process's buffer, using 64K buffers. The transfer size was chosen to minimize the kernel entry overhead while remaining realistically sized. .PP The difference between the \*[bcopy] and the \*[read] benchmarks is the cost of the file and virtual memory system overhead. In most systems, the \*[bcopy] speed should be faster than the \*[read] speed. The exceptions usually have hardware specifically designed for the \*[bcopy] function and that hardware may be available only to the operating system. .PP The \*[read] benchmark is implemented by rereading a file (typically 8M) in 64K buffers. Each buffer is summed as a series of integers in the user process. The summing is done for two reasons: for an apples-to-apples comparison the memory-mapped benchmark needs to touch all the data, and the file system can sometimes transfer data into memory faster than the processor can read the data. For example, \s-1SGI\s0's XFS can move data into memory at rates in excess of 500M per second, but it can move data into the cache at only 68M per second. The intent is to measure performance delivered to the application, not DMA performance to memory. .TSTART .so ../Results/tmp/bw_reread2.tbl .TEND "File vs. memory bandwidth (MB/s)" .PP The \*[mmap] interface provides a way to access the kernel's file cache without copying the data. The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) into the process's address space. The file is then summed to force the data into the cache. .PP In Table \n[TABLE], a good system will have \fIFile read\fP as fast as (or even faster than) \fILibc bcopy\fP because as the file system overhead goes to zero, the file reread case is virtually the same as the library \*[bcopy] case. However, file reread can be faster because the kernel may have access to \*[bcopy] assist hardware not available to the C library. Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP performance, but \*[mmap] is often dramatically worse. Judging by the results, this looks to be a potential area for operating system improvements. .PP In Table \n[TABLE] the Power2 does better on file reread than bcopy because it takes full advantage of the memory subsystem from inside the kernel. The mmap reread is probably slower because of the lower clock rate; the page faults start to show up as a significant cost. .PP It is surprising that the Sun Ultra1 was able to bcopy at the high rates shown in Table 2 but did not show those rates for file reread in Table \n[TABLE]. HP has the opposite problem, they get file reread faster than bcopy, perhaps because the kernel \*[bcopy] has access to hardware support. .PP The Unixware system has outstanding mmap reread rates, better than systems of substantially higher cost. Linux needs to do some work on the \f(CWmmap\fP code. .NH 1 Latency measurements .PP Latency is an often-overlooked area of performance problems, possibly because resolving latency issues is frequently much harder than resolving bandwidth issues. For example, memory bandwidth may be increased by making wider cache lines and increasing memory ``width'' and interleave, but memory latency can be improved only by shortening paths or increasing (successful) prefetching. The first step toward improving latency is understanding the current latencies in a system. 
.PP
The latency measurements included in this suite are memory latency,
basic operating system entry cost, signal handling cost, process
creation times, context switching, interprocess communication,
.\" virtual memory system latency,
file system latency, and disk latency.
.NH 2
Memory read latency background
.PP
In this section, we expend considerable effort to define the different
memory latencies and to explain and justify our benchmark.
The background is a bit tedious but important, since we believe the
memory latency measurements to be one of the most thought-provoking
and useful measurements in \*[lmbench].
.PP
The most basic latency measurement is memory latency since most of the
other latency measurements can be expressed in terms of memory
latency.
For example, context switches require saving the current process state
and loading the state of the next process.
However, memory latency is rarely accurately measured and frequently
misunderstood.
.PP
Memory read latency has many definitions;
the most common, in increasing time order, are memory chip cycle time,
processor-pins-to-memory-and-back time, load-in-a-vacuum time, and
back-to-back-load time.
.BU "Memory chip cycle latency" :
Memory chips are rated in nanoseconds; typical speeds are around 60ns.
A general overview on DRAM architecture may be found in
.RN Hennessy96 .
The specific information we describe here is from
.RN Toshiba94
and pertains to the \s-1THM361020AS-60\s0 module and
\s-1TC514400AJS\s0 \s-1DRAM\s0 used in \s-1SGI\s0 workstations.
The 60ns time is the time from
.ps -1
.nr width \w'R\&A\&S'
.nr height \n[rst]+1000
RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
.ps
assertion to when the data will be available on the \s-1DRAM\s0 pins
(assuming
.ps -1
.nr width \w'C\&A\&S'
.nr height \n[rst]+1000
CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u'
.ps
access time requirements were met).
While it is possible to get data out of a \s-1DRAM\s0 in 60ns, that is
not all of the time involved.
There is a precharge time that must occur after every access.
.RN Toshiba94
quotes 110ns as the random read or write cycle time and this time is
more representative of the cycle time.
.\" For example, most systems offer a wide range of memory
.\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number
.\" of simms range from 1 to 16. The more simms there are, the more
.\" capacitance there is in the memory subsystem. More capacitance means
.\" longer setup times for the fully populated memory subsystem. System
.\" designers have to allow time for this setup.
.\" For more details, consult [XXX - reference on DRAM].
.\" This is sometimes referred to as the chip latency. The
.\" chip cycle time is the chip latency plus the time required to restore
.\" the data in the capacitors which is often referred to as the precharge
.\" time. This means that 60 nanosecond memory chips really are more like
.\" 100 nanosecond memory chips. Some systems operate memory in ``page
.\" mode'' or ``static column'' memory systems hold either RAS or CAS and
.\" allow subsequent accesses in the same row or column in one cycle instead
.\" of two.
.BU "Pin-to-pin latency" :
This number represents the time needed for the memory request to
travel from the processor's pins to the memory subsystem and back
again.
Many vendors have used the pin-to-pin definition of memory latency in
their reports.
For example,
.RN Fenwick95
while describing the \s-1DEC\s0 8400
quotes memory latencies of 265ns; a careful reading of that paper
shows that these are pin-to-pin numbers.
In spite of the historical precedent in vendor reports, this
definition of memory latency is misleading since it ignores actual
delays seen when a load instruction is immediately followed by a use
of the data being loaded.
The number of additional cycles inside the processor can be
significant and grows more significant with today's highly pipelined
architectures.
.PP
It is worth noting that the pin-to-pin numbers include the amount of
time it takes to charge the lines going to the \s-1SIMM\s0s, a time
that increases with the (potential) number of \s-1SIMM\s0s in a
system.
More \s-1SIMM\s0s mean more capacitance, which requires longer charge
times.
This is one reason why personal computers frequently have better
memory latencies than workstations: the PCs typically have less memory
capacity.
.BU "Load-in-a-vacuum latency" :
A load in a vacuum is the time that the processor will wait for one
load that must be fetched from main memory (i.e., a cache miss).
The ``vacuum'' means that there is no other activity on the system
bus, including no other loads.
While this number is frequently used as the memory latency, it is not
very useful.
It is basically a ``not to exceed'' number important only for
marketing reasons.
Some architects point out that since most processors implement
nonblocking loads (the load does not cause a stall until the data is
used), the perceived load latency may be much less than the real
latency.
When pressed, however, most will admit that cache misses occur in
bursts, resulting in perceived latencies of at least the
load-in-a-vacuum latency.
.BU "Back-to-back-load latency" :
Back-to-back-load latency is the time that each load takes, assuming
that the instructions before and after are also cache-missing loads.
Back-to-back loads may take longer than loads in a vacuum for the
following reason: many systems implement something known as
\fIcritical word first\fP, which means that the subblock of the cache
line that contains the word being loaded is delivered to the processor
before the entire cache line has been brought into the cache.
If another load occurs quickly enough after the processor gets
restarted from the current load, the second load may stall because the
cache is still busy filling the cache line for the previous load.
On some systems, such as the current implementation of UltraSPARC,
the difference between back to back and load in a vacuum is about 35%.
.PP
\*[lmbench] measures back-to-back-load latency because it is the only
measurement that may be easily measured from software and because we
feel that it is what most software developers consider to be memory
latency.
Consider the following C code fragment:
.DS
.nf
.ft CW
p = head;
while (p->p_next)
        p = p->p_next;
.ft
.fi
.DE
On a \s-1DEC\s0 Alpha, the loop part turns into three instructions,
including the load.
A 300 Mhz processor has a 3.33ns cycle time, so the loop could execute
in slightly less than 10ns.
However, the load itself takes 400ns on a 300 Mhz \s-1DEC\s0 8400.
In other words, the instructions cost 10ns but the load stalls for
400.
Another way to look at it is that 400/3.3, or 121, nondependent,
nonloading instructions following the load would be needed to hide the
load latency.
Because superscalar processors typically execute multiple operations per clock cycle, they need even more useful operations between cache misses to keep the processor from stalling. .PP This benchmark illuminates the tradeoffs in processor cache design. Architects like large cache lines, up to 64 bytes or so, because the prefetch effect of gathering a whole line increases hit rate given reasonable spatial locality. Small stride sizes have high spatial locality and should have higher performance, but large stride sizes have poor spatial locality causing the system to prefetch useless data. So the benchmark provides the following insight into negative effects of large line prefetch: .BU Multi-cycle fill operations are typically atomic events at the caches, and sometimes block other cache accesses until they complete. .BU Caches are typically single-ported. Having a large line prefetch of unused data causes extra bandwidth demands at the cache, and can cause increased access latency for normal cache accesses. .PP In summary, we believe that processors are so fast that the average load latency for cache misses will be closer to the back-to-back-load number than to the load-in-a-vacuum number. We are hopeful that the industry will standardize on this definition of memory latency. .NH 2 Memory read latency .PP The entire memory hierarchy can be measured, including on-board data cache latency and size, external data cache latency and size, and main memory latency. Instruction caches are not measured. TLB miss latency can also be measured, as in .RN Saavedra92 , but we stopped at main memory. Measuring TLB miss time is problematic because different systems map different amounts of memory with their TLB hardware. .PP The benchmark varies two parameters, array size and array stride. For each size, a list of pointers is created for all of the different strides. Then the list is walked thus: .DS .ft CW mov r4,(r4) # C code: p = *p; .ft .DE The time to do about 1,000,000 loads (the list wraps) is measured and reported. The time reported is pure latency time and may be zero even though the load instruction does not execute in zero time. Zero is defined as one clock cycle; in other words, the time reported is \fBonly\fP memory latency time, as it does not include the instruction execution time. It is assumed that all processors can do a load instruction in one processor cycle (not counting stalls). In other words, if the processor cache load time is 60ns on a 20ns processor, the load latency reported would be 40ns, the additional 20ns is for the load instruction itself.\** .FS In retrospect, this was a bad idea because we calculate the clock rate to get the instruction execution time. If the clock rate is off, so is the load time. .FE Processors that can manage to get the load address out to the address pins before the end of the load cycle get some free time in this benchmark (we don't know of any processors that do that). .PP This benchmark has been validated by logic analyzer measurements on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer Research Center. .TSTART 1 .so mem.pic .FEND "Memory latency" 1 .PP Results from the memory latency benchmark are plotted as a series of data sets as shown in Figure \n[FIGURE]. Each data set represents a stride size, with the array size varying from 512 bytes up to 8M or more. The curves contain a series of horizontal plateaus, where each plateau represents a level in the memory hierarchy. 
The point where each plateau ends and the line rises marks the end of
that portion of the memory hierarchy (e.g., external cache).
Most machines have similar memory hierarchies:
on-board cache, external cache, main memory, and main memory plus TLB
miss costs.
There are variations: some processors are missing a cache, while
others add another cache to the hierarchy.
.\" XXX Larry please double-check this; I am going on dim memory...
For example, the Alpha 8400 has two on-board caches, one 8K and the
other 96K.
.PP
The cache line size can be derived by comparing curves and noticing
which strides are faster than main memory times.
The smallest stride that is the same as main memory speed is likely to
be the cache line size because the strides that are faster than memory
are getting more than one hit per cache line.
.\" Prefetching may confuse
.\" the issue because a demand read may stall behind a prefetch load,
.\" causing cache lines to appear twice as large as they are.
.\" XXX
.\" Larry --- can we use prime modulus arithmetic to set up pointer
.\" loops which might appear random but which really aren't and which
.\" hit every stride once before looping?
.\"
.\" XXX
.\" Larry --- is there any way we can defeat/disable prefetching
.\" so the cache line size can be more accurately determined?
.\"
.\" XXX
.\" Larry --- can we create a benchmark for TLB misses?
.\" I think it was Tom Rokicki who suggested that we create a
.\" benchmark where the data fits in the cache, but the pages don't
.\" fit in the TLB.
.\"
.\" XXX
.\" Larry --- is the description of the memory hierarchy correct?
.\" I am not sure I haven't added an extra level of external cache...
.EQ
delim $$
.EN
.PP
Figure \n[FIGURE] shows memory latencies on a nicely made machine, a
\s-1DEC\s0 Alpha.
We use this machine as the example because it shows the latencies and
sizes of the on-chip level 1 and motherboard level 2 caches, and
because it has good all-around numbers, especially considering it can
support a 4M level 2 cache.
The on-board cache is $2 sup 13$ bytes or 8K, while the external cache
is $2 sup 19$ bytes or 512K.
.EQ
delim off
.EN
.TSTART
.so lat_allmem.tbl
.TEND "Cache and memory latency (ns)"
.nr MEMTABLE \n[TABLE]
.PP
Table \n[TABLE] shows the cache size, cache latency, and main memory
latency as extracted from the memory latency graphs.
The graphs and the tools for extracting the data are included with
\*[lmbench].
It is worthwhile to plot all of the graphs and examine them since the
table is missing some details, such as the \s-1DEC\s0 Alpha 8400
processor's second 96K on-chip cache.
.PP
We sorted Table \n[TABLE] on level 2 cache latency because we think
that many applications will fit in the level 2 cache.
The HP and IBM systems have only one level of cache so we count that
as both level 1 and level 2.
Those two systems have remarkable cache performance for caches of that
size.
In both cases, the cache delivers data in one clock cycle after the
load instruction.
.PP
HP systems usually focus on large caches as close as possible to the
processor.
An older HP multiprocessor system, the 9000/890, has a 4M, split I&D,
direct mapped cache with a 2K victim cache, accessible in one clock
(16ns).\**
That system is primarily a database server.
.FS
The Usenix version of this paper had this as a set associative cache;
that was incorrect.
.FE
.PP
The IBM focus is on low latency, high bandwidth memory.
The IBM memory subsystem is good because all of memory is close to the
processor, but has the weakness that it is extremely difficult to
evolve the design to a multiprocessor system.
.PP
The 586 and PowerPC motherboards have quite poor second level caches;
the caches are not substantially better than main memory.
.PP
The Pentium Pro and Sun Ultra second level caches are of medium speed
at 5-6 clocks latency each.
5-6 clocks seems fast until it is compared against the HP and IBM one
cycle latency caches of similar size.
Given the tight integration of the Pentium Pro level 2 cache, it is
surprising that it has such high latencies.
.PP
The 300Mhz DEC Alpha has a rather high 22 clock latency to the second
level cache which is probably one of the reasons that they needed a
96K level 1.5 cache.
SGI and DEC have used large second level caches to hide their long
latency from main memory.
.PP
.NH 2
Operating system entry
.PP
Entry into the operating system is required for many system
facilities.
When calculating the cost of a facility, it is useful to know how
expensive it is to perform a nontrivial entry into the operating
system.
.PP
We measure nontrivial entry into the system by repeatedly writing one
word to \f(CW/dev/null\fP, a pseudo device driver that does nothing
but discard the data.
This particular entry point was chosen because it has never been
optimized in any system that we have measured.
Other entry points, typically \*[getpid] and \*[gettimeofday], are
heavily used, heavily optimized, and sometimes implemented as
user-level library routines rather than system calls.
A write to the \f(CW/dev/null\fP driver will go through the system
call table to \*[write], verify the user area as readable, look up the
file descriptor to get the vnode, call the vnode's write function, and
then return.
.TSTART
.so ../Results/tmp/lat_nullsys.tbl
.TEND "Simple system call time (microseconds)"
.PP
Linux is the clear winner in the system call time.
The reasons are twofold: Linux is a uniprocessor operating system,
without any MP overhead, and Linux is a small operating system,
without all of the ``features'' accumulated by the commercial
offerings.
.PP
Unixware and Solaris are doing quite well, given that they are both
fairly large, commercially oriented operating systems with a large
accumulation of ``features.''
.NH 2
Signal handling cost
.PP
Signals in Unix are a way to tell another process to handle an event.
They are to processes as interrupts are to the CPU.
.PP
Signal handling is often critical to layered systems.
Some applications, such as databases, software development
environments, and threading libraries, provide an operating
system-like layer on top of the operating system, making signal
handling a critical path in many of these applications.
.PP
\*[lmbench] measures both signal installation and signal dispatching
in two separate loops, within the context of one process.
It measures signal handling by installing a signal handler and then
repeatedly sending itself the signal.
.TSTART
.so ../Results/tmp/lat_signal.tbl
.TEND "Signal times (microseconds)"
.PP
Table \n[TABLE] shows the signal handling costs.
Note that there are no context switches in this benchmark; the signal
goes to the same process that generated the signal.
In real applications, the signals usually go to another process, which
implies that the true cost of sending that signal is the signal
overhead plus the context switch overhead.
We wanted to measure signal and context switch overheads separately
since context switch times vary widely among operating systems.
.PP
SGI does very well on signal processing, especially since their
hardware is of an older generation than many of the others.
.PP
The Linux/Alpha signal handling numbers are so poor that we suspect
that this is a bug, especially given that the Linux/x86 numbers are
quite reasonable.
.NH 2
Process creation costs
.PP
Process benchmarks are used to measure the basic process primitives,
such as creating a new process, running a different program, and
context switching.
Process creation benchmarks are of particular interest in distributed
systems since many remote operations include the creation of a remote
process to shepherd the remote operation to completion.
Context switching is important for the same reasons.
.BU "Simple process creation" .
The Unix process creation primitive is \*[fork], which creates a
(virtually) exact copy of the calling process.
Unlike VMS and some other operating systems, Unix starts any new
process with a \*[fork].
Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and
``light,'' facts that many have been ignoring for some time.
.PP
\*[lmbench] measures simple process creation by creating a process and
immediately exiting the child process.
The parent process waits for the child process to exit.
The benchmark is intended to measure the overhead for creating a new
thread of control, so it includes the \*[fork] and the \*[exit] time.
.PP
The benchmark also includes a \f(CWwait\fP system call in the parent
and context switches from the parent to the child and back again.
Given that context switches of this sort are on the order of 20
microseconds and a system call is on the order of 5 microseconds, and
that the entire benchmark time is on the order of a millisecond or
more, the extra overhead is insignificant.
Note that even this relatively simple task is very expensive and is
measured in milliseconds while most of the other operations we
consider are measured in microseconds.
.BU "New process creation" .
The preceding benchmark did not create a new application; it created a
copy of the old application.
This benchmark measures the cost of creating a new process and
changing that process into a new application, which forms the basis of
every Unix command line interface, or shell.
\*[lmbench] measures this facility by forking a new child and having
that child execute a new program \(em in this case, a tiny program
that prints ``hello world'' and exits.
.PP
The startup cost is especially noticeable on (some) systems that have
shared libraries.
Shared libraries can introduce a substantial (tens of milliseconds)
startup cost.
.\" XXX - statically linked example?
.TSTART
.so ../Results/tmp/lat_allproc.tbl
.TEND "Process creation time (milliseconds)"
.BU "Complicated new process creation" .
When programs start other programs, they frequently use one of three
standard interfaces: \*[popen], \*[system], and/or \*[execlp].
The first two interfaces start a new process by invoking the standard
command interpreter, \f(CW/bin/sh\fP, to start the process.
Starting programs this way guarantees that the shell will look for the
requested application in all of the places that the user would look
\(em in other words, the shell uses the user's $PATH variable as a
list of places to find the application.
\*[execlp] is a C library routine which also looks for the program
using the user's $PATH variable.
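.PP
The following sketch (again, not the \*[lmbench] source) shows the
three startup paths measured here; \f(CWhello\fP stands for the tiny
``hello world'' program mentioned above, and its name and location on
the search path are assumptions:
.DS
.nf
.ps -2
.vs -2
.ft CW
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int
main(void)
{
    int status;

    /* 1: fork + exit + wait */
    if (fork() == 0)
        exit(0);
    wait(&status);

    /* 2: fork + execlp ($PATH search) */
    if (fork() == 0) {
        execlp("hello", "hello",
            (char *)0);
        exit(1); /* exec failed */
    }
    wait(&status);

    /* 3: via /bin/sh */
    system("hello");
    return (0);
}
.ft
.vs
.ps
.fi
.DE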
.PP Since this is a common way of starting applications, we felt it was useful to show the costs of the generality. .PP We measure this by starting \f(CW/bin/sh\fP to start the same tiny program we ran in the last case. In Table \n[TABLE] the cost of asking the shell to go look for the program is quite large, frequently ten times as expensive as just creating a new process, and four times as expensive as explicitly naming the location of the new program. .PP The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. Given that the processor is one of the fastest, the problem is likely to be software. There is room for substantial improvement in the Solaris process creation code. .NH 2 Context switching .PP Context switch time is defined here as the time needed to save the state of one process and restore the state of another process. .PP Context switches are frequently in the critical performance path of distributed applications. For example, the multiprocessor versions of the IRIX operating system use processes to move data through the networking stack. This means that the processing time for each new packet arriving at an idle system includes the time needed to switch in the networking process. .PP Typical context switch benchmarks measure just the minimal context switch time \(em the time to switch between two processes that are doing nothing but context switching. We feel that this is misleading because there are frequently more than two active processes, and they usually have a larger working set (cache footprint) than the benchmark processes. .PP Other benchmarks frequently include the cost of the system calls needed to force the context switches. For example, Ousterhout's context switch benchmark measures context switch time plus a \*[read] and a \*[write] on a pipe. In many of the systems measured by \*[lmbench], the pipe overhead varies between 30% and 300% of the context switch time, so we were careful to factor out the pipe overhead. .BU "Number of processes." The context switch benchmark is implemented as a ring of two to twenty processes that are connected with Unix pipes. A token is passed from process to process, forcing context switches. The benchmark measures the time needed to pass the token two thousand times from process to process. Each transfer of the token has two costs: the context switch, and the overhead of passing the token. In order to calculate just the context switching time, the benchmark first measures the cost of passing the token through a ring of pipes in a single process. This overhead time is defined as the cost of passing the token and is not included in the reported context switch time. .BU "Size of processes." In order to measure more realistic context switch times, we add an artificial variable size ``cache footprint'' to the switching processes. The cost of the context switch then includes the cost of restoring user-level state (cache footprint). The cache footprint is implemented by having the process allocate an array of data\** .FS All arrays are at the same virtual address in all processes. .FE and sum the array as a series of integers after receiving the token but before passing the token to the next process. Since most systems will cache data across context switches, the working set for the benchmark is slightly larger than the number of processes times the array size. .PP It is worthwhile to point out that the overhead mentioned above also includes the cost of accessing the data, in the same way as the actual benchmark. 
However, because the overhead is measured in a single process, the
cost is typically the cost with ``hot'' caches.
In Figure 2, each size is plotted as a line, with context switch times
on the Y axis, number of processes on the X axis, and the process size
as the data set.
The process size and the hot cache overhead costs for the pipe
read/writes and any data access are what is labeled as
\f(CWsize=0KB overhead=10\fP.
The size is in kilobytes and the overhead is in microseconds.
.PP
The context switch time does not include anything other than the
context switch, provided that all the benchmark processes fit in the
cache.
If the total size of all of the benchmark processes is larger than the
cache size, the cost of each context switch will include cache misses.
We are trying to show realistic context switch times as a function of
both size and number of processes.
.TSTART 1
.so ctx.pic
.FEND "Context switch times" 1
.PP
Results for an Intel Pentium Pro system running Linux at 167 MHz are
shown in Figure \n[FIGURE].
The data points on the figure are labeled with the working set due to
the sum of data in all of the processes.
The actual working set is larger, as it includes the process and
kernel overhead as well.
One would expect the context switch times to stay constant until the
working set is approximately the size of the second level cache.
The Intel system has a 256K second level cache, and the context switch
times stay almost constant until about 256K (marked as .25M in the
graph).
.BU "Cache issues"
The context switch benchmark is a deliberate measurement of the
effectiveness of the caches across process context switches.
If the cache does not include the process identifier (PID, also
sometimes called an address space identifier) as part of the address,
then the cache must be flushed on every context switch.
If the cache does not map the same virtual addresses from different
processes to different cache lines, then the cache will appear to be
flushed on every context switch.
.PP
If the caches do not cache across context switches there would be no
grouping at the lower left corner of Figure \n[FIGURE]; instead, the
graph would appear as a series of straight, horizontal, parallel
lines.
The number of processes will not matter; the two process case will be
just as bad as the twenty process case since the cache would not be
useful across context switches.
.TSTART
.so ../Results/tmp/ctx.tbl
.TEND "Context switch time (microseconds)"
.PP
We picked four points on the graph and extracted those values for
Table \n[TABLE].
The complete set of values, as well as tools to graph them, are
included with \*[lmbench].
.PP
Note that multiprocessor context switch times are frequently more
expensive than uniprocessor context switch times.
This is because multiprocessor operating systems tend to have very
complicated scheduling code.
We believe that multiprocessor context switch times can be, and should
be, within 10% of the uniprocessor times.
.PP
Linux does quite well on context switching, especially on the more
recent architectures.
By comparing the Linux 2 0K processes to the Linux 2 32K processes, it
is apparent that there is something wrong with the Linux/i586 case.
If we look back to Table \n[MEMTABLE], we can find at least part of
the cause.
The second level cache latency for the i586 is substantially worse
than either the i686 or the Alpha.
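.PP
To make the shape of this benchmark concrete, the sketch below shows a
stripped-down, two-process version of the token-passing ring described
above.
It is a simplified illustration, not the \*[lmbench] source: it does
not measure and subtract the single-process token-passing overhead the
way \*[lmbench] does, and error handling is omitted:
.DS
.nf
.ps -2
.vs -2
.ft CW
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/wait.h>

#define LOOPS 2000

int
main(void)
{
    int up[2], down[2], i;
    char t = 'x';
    struct timeval b, e;
    double usecs;

    pipe(up); pipe(down);
    if (fork() == 0) { /* child */
        close(down[1]); close(up[0]);
        while (read(down[0], &t, 1) == 1)
            write(up[1], &t, 1);
        exit(0);
    }
    close(down[0]); close(up[1]);
    gettimeofday(&b, 0);
    for (i = 0; i < LOOPS; i++) {
        write(down[1], &t, 1);
        read(up[0], &t, 1);
    }
    gettimeofday(&e, 0);
    close(down[1]); /* child: EOF */
    wait(0);
    usecs = (e.tv_sec - b.tv_sec) * 1e6
        + (e.tv_usec - b.tv_usec);
    printf("%.1f usecs/trip\en",
        usecs / LOOPS);
    return (0);
}
.ft
.vs
.ps
.fi
.DE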
.PP Given the poor second level cache behavior of the PowerPC, it is surprising that it does so well on context switches, especially the larger sized cases. .PP The Sun Ultra1 context switches quite well in part because of enhancements to the register window handling in SPARC V9. .NH 2 Interprocess communication latencies .PP Interprocess communication latency is important because many operations are control messages to another process (frequently on another system). The time to tell the remote process to do something is pure overhead and is frequently in the critical path of important functions such as distributed applications (e.g., databases, network servers). .PP The interprocess communication latency benchmarks typically have the following form: pass a small message (a byte or so) back and forth between two processes. The reported results are always the microseconds needed to do one round trip. For one way timing, about half the round trip is right. However, the CPU cycles tend to be somewhat asymmetric for one trip: receiving is typically more expensive than sending. .BU "Pipe latency" . Unix pipes are an interprocess communication mechanism implemented as a one-way byte stream. Each end of the stream has an associated file descriptor; one is the write descriptor and the other the read descriptor. .PP Pipes are frequently used as a local IPC mechanism. Because of the simplicity of pipes, they are frequently the fastest portable communication mechanism. .PP Pipe latency is measured by creating a pair of pipes, forking a child process, and passing a word back and forth. This benchmark is identical to the two-process, zero-sized context switch benchmark, except that it includes both the context switching time and the pipe overhead in the results. .nr NTABLE \n[TABLE]+1 .nr LTABLE \n[TABLE] Table \n[NTABLE] shows the round trip latency from process A to process B and back to process A. .TSTART .so ../Results/tmp/lat_pipe.tbl .TEND "Pipe latency (microseconds)" .PP The time can be broken down to two context switches plus four system calls plus the pipe overhead. The context switch component is two of the small processes in Table \n[LTABLE]. This benchmark is identical to the context switch benchmark in .RN Ousterhout90 . .BU "TCP and RPC/TCP latency" . TCP sockets may be viewed as an interprocess communication mechanism similar to pipes with the added feature that TCP sockets work across machine boundaries. .PP TCP and RPC/TCP connections are frequently used in low-bandwidth, latency-sensitive applications. The default Oracle distributed lock manager uses TCP sockets, and the locks per second available from this service are accurately modeled by the TCP latency test. .TSTART .so ../Results/tmp/lat_tcp.tbl .TEND "TCP latency (microseconds)" .PP Sun's RPC is layered either over TCP or over UDP. The RPC layer is responsible for managing connections (the port mapper), managing different byte orders and word sizes (XDR), and implementing a remote procedure call abstraction. Table \n[TABLE] shows the same benchmark with and without the RPC layer to show the cost of the RPC implementation. .PP TCP latency is measured by having a server process that waits for connections and a client process that connects to the server. The two processes then exchange a word between them in a loop. The latency reported is one round-trip time. The measurements in Table \n[TABLE] are local or loopback measurements, since our intent is to show the overhead of the software. 
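.PP
A minimal sketch of this word-exchange loop over a loopback TCP
connection follows.
It is an illustration only, not the \*[lmbench] code; the port number
is an arbitrary assumption, a single byte stands in for the word, and
the error handling and startup synchronization are deliberately crude.
.DS
/* Loopback TCP latency: client sends a byte, server echoes it back. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/time.h>

#define PORT    5678            /* arbitrary choice */
#define ROUNDS  10000

int
main(void)
{
        struct sockaddr_in addr;
        struct timeval  t0, t1;
        char    c = 'x';
        int     s, i;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(PORT);
        addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        if (fork() == 0) {      /* server: echo one byte at a time */
                int     lsock = socket(AF_INET, SOCK_STREAM, 0);
                int     fd, one = 1;

                setsockopt(lsock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
                bind(lsock, (struct sockaddr *)&addr, sizeof(addr));
                listen(lsock, 1);
                fd = accept(lsock, 0, 0);
                while (read(fd, &c, 1) == 1)
                        write(fd, &c, 1);
                exit(0);
        }
        sleep(1);               /* crude: give the server time to start */
        s = socket(AF_INET, SOCK_STREAM, 0);
        connect(s, (struct sockaddr *)&addr, sizeof(addr));
        gettimeofday(&t0, 0);
        for (i = 0; i < ROUNDS; ++i) {  /* one byte out, one byte back */
                write(s, &c, 1);
                read(s, &c, 1);
        }
        gettimeofday(&t1, 0);
        printf("%.1f usec round trip\n",
            ((t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec)) / ROUNDS);
        close(s);
        exit(0);
}
.DE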
The same benchmark may be, and frequently is, used to measure host-to-host latency. .PP Note that the RPC layer frequently adds hundreds of microseconds of additional latency. The problem is not the external data representation (XDR) layer \(em the data being passed back and forth is a byte, so there is no XDR to be done. There is no justification for the extra cost; it is simply an expensive implementation. DCE RPC is worse. .TSTART .so ../Results/tmp/lat_udp.tbl .TEND "UDP latency (microseconds)" .BU "UDP and RPC/UDP latency" . UDP sockets are an alternative to TCP sockets. They differ in that UDP sockets are unreliable messages that leave the retransmission issues to the application. UDP sockets have a few advantages, however. They preserve message boundaries, whereas TCP does not; and a single UDP socket may send messages to any number of other sockets, whereas TCP sends data to only one place. .PP UDP and RPC/UDP messages are commonly used in many client/server applications. NFS is probably the most widely used RPC/UDP application in the world. .PP Like TCP latency, UDP latency is measured by having a server process that waits for connections and a client process that connects to the server. The two processes then exchange a word between them in a loop. The latency reported is round-trip time. The measurements in Table \n[TABLE] are local or loopback measurements, since our intent is to show the overhead of the software. Again, note that the RPC library can add hundreds of microseconds of extra latency. .\" .PP .\" It is interesting to compare UDP latency with TCP latency. In many cases the .\" TCP latency is \fBless\fP than the UDP latency. This flies in the face .\" of conventional wisdom, which says that TCP is an inherently more expensive .\" protocol than UDP. The reasons that TCP may appear faster are: in this .\" benchmark, the protocol costs are dwarfed by the other costs (context .\" switching, system calls, and driver overhead); and TCP is frequently .\" hand-tuned for performance, while UDP is rarely hand-tuned. .TSTART .so ipc.tbl .TEND "Remote latencies (microseconds)" .BU "Network latency" . We have a few results for over the wire latency included in Table \n[TABLE]. As might be expected, the most heavily used network interfaces (i.e., ethernet) have the lowest latencies. The times shown include the time on the wire, which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit ethernet and FDDI, and less than 10 microseconds for Hippi. .BU "TCP connection latency" . TCP is a connection-based, reliable, byte-stream-oriented protocol. As part of this reliability, a connection must be established before any data can be transferred. The connection is accomplished by a ``three-way handshake,'' an exchange of packets when the client attempts to connect to the server. .PP Unlike UDP, where no connection is established, TCP sends packets at startup time. If an application creates a TCP connection to send one message, then the startup time can be a substantial fraction of the total connection and transfer costs. The benchmark shows that the connection cost is approximately half of the cost. .PP Connection cost is measured by having a server, registered using the port mapper, waiting for connections. The client figures out where the server is registered and then repeatedly times a \*[connect] system call to the server. The socket is closed after each connect. Twenty connects are completed and the fastest of them is used as the result. 
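.PP
The client-side timing loop can be sketched as follows, assuming a
server is already listening at a known address; the address, port, and
repetition count shown here are placeholders rather than the values
used by \*[lmbench], and error handling is minimal.
.DS
/* Time connect() repeatedly and report the fastest attempt. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/time.h>

#define SERVER_ADDR     "127.0.0.1"     /* placeholder */
#define SERVER_PORT     5678            /* placeholder */
#define NCONNECTS       20

int
main(void)
{
        struct sockaddr_in addr;
        struct timeval  t0, t1;
        double  usec, best = 1e9;
        int     i, s;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(SERVER_PORT);
        addr.sin_addr.s_addr = inet_addr(SERVER_ADDR);

        for (i = 0; i < NCONNECTS; ++i) {
                s = socket(AF_INET, SOCK_STREAM, 0);
                gettimeofday(&t0, 0);           /* time just the connect */
                if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                        return 1;
                gettimeofday(&t1, 0);
                close(s);
                usec = (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
                if (usec < best)
                        best = usec;
        }
        printf("fastest connect: %.1f usec\n", best);
        return 0;
}
.DE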
The time measured will include two of the three packets that make up the three way TCP handshake, so the cost is actually greater than the times listed. .\" XXX Larry --- if a machine's clock granularity is on the order of .\" 10 milliseconds, won't this benchmark run into granularity problems? .TSTART .so ../Results/tmp/lat_connect.tbl .TEND "TCP connect latency (microseconds)" .PP Table \n[TABLE] shows that if the need is to send a quick message to another process, given that most packets get through, a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive acknowledgments are needed, which they are in order to have an apples-to-apples comparison with TCP). If the transmission medium is 10Mbit Ethernet, the time on the wire will be approximately 65 microseconds each way, or 130 microseconds total. To do the same thing with a short-lived TCP connection would cost 896 microseconds of wire time alone. .PP The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor is the point to suggest that all messages should be UDP. In many cases, the difference between 130 microseconds and 900 microseconds is insignificant compared with other aspects of application performance. However, if the application is very latency sensitive and the transmission medium is slow (such as serial link or a message through many routers), then a UDP message may prove cheaper. .NH 2 File system latency .PP File system latency is defined as the time required to create or delete a zero length file. We define it this way because in many file systems, such as the BSD fast file system, the directory operations are done synchronously in order to maintain on-disk integrity. Since the file data is typically cached and sent to disk at some later date, the file creation and deletion become the bottleneck seen by an application. This bottleneck is substantial: to do a synchronous update to a disk is a matter of tens of milliseconds. In many cases, this bottleneck is much more of a perceived performance issue than processor speed. .PP The benchmark creates 1,000 zero-sized files and then deletes them. All the files are created in one directory and their names are short, such as "a", "b", "c", ... "aa", "ab", .... .TSTART .so lat_fs.tbl .TEND "File system latency (microseconds)" .PP The create and delete latencies are shown in Table \n[TABLE]. Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster than the slowest systems. However, Linux does not guarantee anything about the disk integrity; the directory operations are done in memory. Other fast systems, such as SGI's XFS, use a log to guarantee the file system integrity. The slower systems, all those with ~10 millisecond file latencies, are using synchronous writes to guarantee the file system integrity. Unless Unixware has modified UFS substantially, they must be running in an unsafe mode since the FreeBSD UFS is much slower and both file systems are basically the 4BSD fast file system. .NH 2 Disk latency .\" XXX - either get more results for this benchmark or delete it. .\" I'd really like to not delete it - lmdd is probably the most .\" useful tool and it gets the least press. .PP Included with \*[lmbench] is a small benchmarking program useful for measuring disk and file I/O. 
\*[lmdd], which is patterned after the Unix utility \f(CWdd\fP, measures both sequential and random I/O, optionally generates patterns on output and checks them on input, supports flushing the data from the buffer cache on systems that support \f(CWmsync\fP, and has a very flexible user interface. Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script wrapped around \*[lmdd]. .PP While we could have generated both sequential and random I/O results as part of this paper, we did not because those benchmarks are heavily influenced by the performance of the disk drives used in the test. We intentionally measure only the system overhead of a SCSI command since that overhead may become a bottleneck in large database configurations. .PP Some important applications, such as transaction processing, are limited by random disk IO latency. Administrators can increase the number of disk operations per second by buying more disks, until the processor overhead becomes the bottleneck. The \f(CWlmdd\fP benchmark measures the processor overhead associated with each disk operation, and it can provide an upper bound on the number of disk operations the processor can support. It is designed for SCSI disks, and it assumes that most disks have 32-128K read-ahead buffers and that they can read ahead faster than the processor can request the chunks of data.\** .FS This may not always be true: a processor could be fast enough to make the requests faster than the rotating disk. If we take 6M/second to be disk speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or 81 microseconds/IO. We don't know of any processor/OS/IO controller combinations that can do an IO in 81 microseconds. .FE .PP The benchmark simulates a large number of disks by reading 512byte transfers sequentially from the raw disk device (raw disks are unbuffered and are not read ahead by Unix). Since the disk can read ahead faster than the system can request data, the benchmark is doing small transfers of data from the disk's track buffer. Another way to look at this is that the benchmark is doing memory-to-memory transfers across a SCSI channel. It is possible to generate loads of more than 1,000 SCSI operations/second on a single SCSI disk. For comparison, disks under database load typically run at 20-80 operations per second. .TSTART .so ../Results/tmp/lat_disk.tbl .TEND "SCSI I/O overhead (microseconds)" .PP The resulting overhead number represents a \fBlower\fP bound on the overhead of a disk I/O. The real overhead numbers will be higher on SCSI systems because most SCSI controllers will not disconnect if the request can be satisfied immediately. During the benchmark, the processor simply sends the request and transfers the data, while during normal operation, the processor will send the request, disconnect, get interrupted, reconnect, and transfer the data. .PP This technique can be used to discover how many drives a system can support before the system becomes CPU-limited because it can produce the overhead load of a fully configured system with just a few disks. .NH 1 Future work .PP There are several known improvements and extensions that could be made to \*[lmbench]. .BU "Memory latency" . The current benchmark measures clean-read latency. By clean, we mean that the cache lines being replaced are highly likely to be unmodified, so there is no associated write-back cost. We would like to extend the benchmark to measure dirty-read latency, as well as write latency. 
Other changes include making the benchmark impervious to sequential prefetching and measuring TLB miss cost. .BU "MP benchmarks" . None of the benchmarks in \*[lmbench] is designed to measure any multiprocessor features directly. At a minimum, we could measure cache-to-cache latency as well as cache-to-cache bandwidth. .BU "Static vs. dynamic processes" . In the process creation section, we allude to the cost of starting up processes that use shared libraries. When we figure out how to create statically linked processes on all or most systems, we could quantify these costs exactly. .BU "McCalpin's stream benchmark" . We will probably incorporate part or all of this benchmark into \*[lmbench]. .BU "Automatic sizing" . We have enough technology that we could determine the size of the external cache and autosize the memory used such that the external cache had no effect. .BU "More detailed papers" . There are several areas that could yield some interesting papers. The memory latency section could use an in-depth treatment, and the context switching section could turn into an interesting discussion of caching technology. .NH 1 Conclusion .PP \*[lmbench] is a useful, portable micro-benchmark suite designed to measure important aspects of system performance. We have found that a good memory subsystem is at least as important as the processor speed. As processors get faster and faster, more and more of the system design effort will need to move to the cache and memory subsystems. .NH 1 Acknowledgments .PP Many people have provided invaluable help and insight into both the benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers were especially helpful. We thank all of them and especially thank: Ken Okin \s-1(SUN)\s0, Kevin Normoyle \s-1(SUN)\s0, Satya Nishtala \s-1(SUN)\s0, Greg Chesson \s-1(SGI)\s0, John Mashey \s-1(SGI)\s0, Neal Nuckolls \s-1(SGI)\s0, John McCalpin \s-1(Univ. of Delaware)\s0, Ron Minnich \s-1(Sarnoff)\s0, Chris Ruemmler \s-1(HP)\s0, Tom Rokicki \s-1(HP)\s0, and John Weitz \s-1(Digidesign)\s0. .PP We would also like to thank all of the people that have run the benchmark and contributed their results; none of this would have been possible without their assistance. .PP Our thanks to all of the free software community for tools that were used during this project. \*[lmbench] is currently developed on Linux, a copylefted Unix written by Linus Torvalds and his band of happy hackers. This paper and all of the \*[lmbench] documentation was produced using the \f(CWgroff\fP suite of tools written by James Clark. Finally, all of the data processing of the results is done with \f(CWperl\fP written by Larry Wall. .PP Sun Microsystems, and in particular Paul Borrill, supported the initial development of this project. Silicon Graphics has supported ongoing development that turned into far more time then we ever imagined. We are grateful to both of these companies for their financial support. .NH 1 Obtaining the benchmarks .PP The benchmarks are available at .ft I http://reality.sgi.com/employees/lm_engr/lmbench.tgz .ft as well as via a mail server. You may request the latest version of \*[lmbench] by sending email to \fIarchives@slovax.engr.sgi.com\fP with \fIlmbench-current*\fP as the subject. 
.\" .R1 .\" bibliography references .\" .R2 .\"******************************************************************** .\" Redefine the IP paragraph format so it won't insert a useless line .\" break when the paragraph tag is longer than the indent distance .\" .de @IP .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) .par*start \\n[\\n[.ev]:ai] 0 .if !'\\$1'' \{\ . \" Divert the label so as to freeze any spaces. . di par*label . in 0 . nf \&\\$1 . di . in . fi . chop par*label . ti -\\n[\\n[.ev]:ai]u . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c . el \{\ \\*[par*label] .\". br . \} . rm par*label .\} .. .\"******************************************************************** .\" redefine the way the reference tag is printed so it is enclosed in .\" square brackets .\" .de ref*end-print .ie d [F .IP "[\\*([F]" 2 .el .XP \\*[ref*string] .. .\"******************************************************************** .\" Get journal number entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-N .ref*field N "" ( ) .. .\"******************************************************************** .\" Get journal volume entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-V .ref*field V , "" "" "" .. .\"******************************************************************** .\" Get the date entry right. Should not be enclosed in parentheses. .\" .de ref*add-D .ref*field D "," .. .R1 accumulate sort A+DT database references label-in-text label A.nD.y-2 bracket-label [ ] ", " bibliography references .R2 .so bios ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/doc/userguide.ms���������������������������������������������������������������������0000664�0000764�0000764�00000426331�07564162402�016306� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" This document is GNU groff -mgs -t -p -R -s .\" It will not print with normal troffs, it uses groff features, in particular, .\" long names for registers & strings. .\" Deal with it and use groff - it makes things portable. .\" .\" $X$ xroff -mgs -t -p -R -s $file .\" $tty$ groff -mgs -t -p -R -s $file | colcrt - | more .\" $lpr$ groff -mgs -t -p -R -s $file > ${file}.lpr .VARPS .\" Define a page top that looks cool .\" HELLO CARL! To turn this off, s/PT/oldPT/ .de PT .tl '\fBDRAFT\fP'\\*(DY'\fBDRAFT\fP' .. .de lmPT .if \\n%>1 \{\ . sp -.1i . ps 14 . ft 3 . nr big 24 . nr space \\w'XXX' . nr titlewid \\w'\\*[title]' . nr barwid (\\n[LL]-(\\n[titlewid]+(2*\\n[space])))/2 . ds ln \\l'\\n[barwid]u'\\h'-\\n[barwid]u'\v'-.25' . ds bar \\s(\\n[big]\\*(ln\\*(ln\\*(ln\\*(ln\\*(ln\v'1.25'\\h'\\n[barwid]u'\\s0 . ce 1 \\*[bar]\h'\\n[space]u'\v'-.15'\\*[title]\v'.15'\h'\\n[space]u'\\*[bar] . ps . sp -.70 . ps 12 \\l'\\n[LL]u' . ft . ps .\} .. .\" Define a page bottom that looks cool .\" HELLO CARL! To turn this off, s/BT/oldBT/ .de BT .tl '\(co 2002 \\*[author]'%'\fB\\*(DY DRAFT DO NOT DISTRIBUTE\fP' .. .de lmBT . ps 9 \v'-1'\\l'\\n(LLu' . sp -1 . tl '\(co 2002 \\*[author]'\\*(DY'%' . ps .. .de SP . if t .sp .5 . if n .sp 1 .. 
.de BU . SP . ne 2 \(bu\ . if \\n[.$] \fB\\$1\fP\\$2 .. .nr FIGURE 0 .nr TABLE 0 .nr SMALL .25i .de TSTART . KF . if \\n[.$] \s(24\\l'\\n[pg@colw]u'\s0 . ps -1 . vs -1 .. .de TEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr TABLE \\n[TABLE]+1 . ce 1 \fBTable \\n[TABLE].\ \ \\$1\fP . SP . KE .. .de FEND . ps +1 . vs +1 . if \\n[.$]=2 \{\ . sp -.5 \s(24\\l'\\n[pg@colw]u'\s0 \} . sp .25 . nr FIGURE \\n[FIGURE]+1 . ce 1 \fBFigure \\n[FIGURE].\ \ \\$1\fP . SP . KE .. .\" Configuration .nr PI 3n .nr HM 1i .nr FM 1i .nr PO 1i .if t .po 1i .nr LL 6.5i .if n .nr PO 0i .if n .nr LL 7.5i .nr PS 10 .nr VS \n(PS+1 .ds title Measuring scalability .ds author Carl Staelin .ds lmbench \f(CWlmbench\fP .ds lmbench1 \f(CWlmbench1\fP .ds lmbench2 \f(CWlmbench2\fP .ds lmbench3 \f(CWlmbench3\fP .ds bcopy \f(CWbcopy\fP .ds benchmp \f(CWbenchmp\fP .ds bw_file_rd \f(CWbw_file_rd\fP .ds bw_mem \f(CWbw_mem\fP .ds bw_mmap_rd \f(CWbw_mmap_rd\fP .ds bw_pipe \f(CWbw_pipe\fP .ds bw_tcp \f(CWbw_tcp\fP .ds bw_udp \f(CWbw_udp\fP .ds bw_unix \f(CWbw_unix\fP .ds connect \f(CWconnect\fP .ds execlp \f(CWexeclp\fP .ds execve \f(CWexecve\fP .ds exit \f(CWexit\fP .ds fcntl \f(CWfcntl\fP .ds fork \f(CWfork\fP .ds fstat \f(CWfstat\fP .ds gcc \f(CWgcc\fP .ds getpid \f(CWgetpid\fP .ds getppid \f(CWgetppid\fP .ds gettimeofday \f(CWgettimeofday\fP .ds kill \f(CWkill\fP .ds lat_connect \f(CWlat_connect\fP .ds lat_ctx \f(CWlat_ctx\fP .ds lat_fcntl \f(CWlat_fcntl\fP .ds lat_fifo \f(CWlat_fifo\fP .ds lat_fs \f(CWlat_fs\fP .ds lat_http \f(CWlat_http\fP .ds lat_mem_rd \f(CWlat_mem_rd\fP .ds lat_mmap \f(CWlat_mmap\fP .ds lat_ops \f(CWlat_ops\fP .ds lat_pagefault \f(CWlat_pagefault\fP .ds lat_pipe \f(CWlat_pipe\fP .ds lat_proc \f(CWlat_proc\fP .ds lat_rpc \f(CWlat_rpc\fP .ds lat_select \f(CWlat_select\fP .ds lat_sem \f(CWlat_sem\fP .ds lat_sig \f(CWlat_sig\fP .ds lat_syscall \f(CWlat_syscall\fP .ds lat_tcp \f(CWlat_tcp\fP .ds lat_udp \f(CWlat_udp\fP .ds lat_unix \f(CWlat_unix\fP .ds lat_unix_connect \f(CWlat_unix_connect\fP .ds line \f(CWline\fP .ds lmdd \f(CWlmdd\fP .ds lmdd \f(CWlmdd\fP .ds memmove \f(CWmemmove\fP .ds mhz \f(CWmhz\fP .ds mmap \f(CWmmap\fP .ds par_mem \f(CWpar_mem\fP .ds par_ops \f(CWpar_ops\fP .ds pipe \f(CWpipe\fP .ds popen \f(CWpopen\fP .ds read \f(CWread\fP .ds select \f(CWselect\fP .ds semop \f(CWsemop\fP .ds sh \f(CW/bin/sh\fP .ds stat \f(CWstat\fP .ds stream \f(CWstream\fP .ds system \f(CWsystem\fP .ds tlb \f(CWtlb\fP .ds uiomove \f(CWuiomove\fP .ds write \f(CWwrite\fP .ds yield \f(CWyield\fP .\" References stuff .de RN \"Reference Name: .RN $1 -- prints the reference prettily .\" [\s-2\\$1\s+2]\\$2 [\s-1\\$1\s0]\\$2 .. .\" .R1 .\" sort A+DT .\" database references .\" label-in-text .\" label A.nD.y-2 .\" bracket-label \*([. \*(.] ", " .\" .R2 .EQ delim $$ .EN .TL \s(14lmbench user guide\s0 .AU \s+2\fR\*[author]\fP\s0 .AI \fI\s+2Hewlett-Packard Laboratories Israel\s0\fP .SP .AB \*[lmbench] is a micro-benchmark suite designed to focus attention on the basic building blocks of many common system applications, such as databases, simulations, software development, and networking. It is also designed to make it easy for users to create additional micro-benchmarks that can measure features, algorithms, or subsystems of particular interest to the user. .SP There is a timing harness, \*[benchmp], designed to measure performance at specific levels of parallel (simultaneous) load. 
.AE .if t .MC 3.05i .NH 1 Introduction .LP \*[lmbench] is a widely used suite of micro-benchmarks that measures important aspects of computer system performance, such as memory latency and bandwidth. Crucially, the suite is written in portable ANSI-C using POSIX interfaces and is intended to run on a wide range of systems without modification. .LP The benchmarks included in the suite were chosen because in the \*[lmbench] developer's experience, they each represent an aspect of system performance which has been crucial to an application's performance. .LP In general the benchmarks report either the latency or bandwidth of an operation or data pathway. The exceptions are generally those benchmarks that report on a specific aspect of the hardware, such as the processor clock rate, which is reported in MHz and nanoseconds. .LP \*[lmbench] consists of three major components: a timing harness, the individual benchmarks built on top of the timing harness, and the various scripts and glue that build and run the benchmarks and process the results. .NH 2 \*[lmbench] history .LP \*[lmbench1] was written by Larry McVoy while he was at Sun Microsystems. It focussed on two measures of system performance: latency and bandwidth. It measured a number of basic operating system functions, such as file system read/write bandwidth or file creation time. It also focussed a great deal of energy on measuring data transfer operations, such as \*[bcopy] and \*[pipe] latency and bandwidth as well as raw memory latency and bandwidth. .LP Shortly after .RN McVoy96 was published, .RN Brown97 examined the \*[lmbench] benchmarks and published a detailed critique of its strengths and weaknesses. Largely in response to these remarks, development of \*[lmbench2] began with a focus on improving the experimental design and statistical data analysis. The primary change was the development and adoption across all the benchmarks of a timing harness that incorporated loop-autosizing and clock resolution detection. In addition, each experiment was typically repeated eleven times with the median result reported to the user. .LP \*[lmbench3] focussed on extending \*[lmbench]'s functionality along two dimensions: measuring multi-processor scalability and measuring basic aspects of processor architecture. .LP There are any number of aspects of a computer's micro-architecture that can impact a program's performance, such as the design of the memory hierarchy and the basic performance of the various arithmetic units. .LP All of the new benchmarks were added to \*[lmbench] because the author needed them to help guide his design decisions in one or more projects over the last few years. For example, \*[lat_ops] was added because the author was trying to decide whether a particular image processing algorithm should be implemented using integer or floating point arithmetic. Floating point arithmetic was preferred for a variety of reasons, but it was feared that floating point arithmetic would be prohibitively expensive compared to integer operations. By quickly building \*[lat_ops] the author was able to verify that the floating point performance should be no worse than integer performance. .LP An important feature of multi-processor systems is their ability to scale their performance. \*[lmbench1] was able to measure various important aspects of system performance, except that only one client process was active at a time .RN McVoy96 . 
\*[lmbench2] introduced a new macro, BENCH(), which implemented a sophisticated timing harness that automatically managed nearly all aspects of accurately timing operations .RN Staelin98 . For example, it automatically detects the minimal timing interval necessary to provide timing results within 1% accuracy, and it automatically repeats most experiments eleven times and reports the median result. .LP However, this timing harness is incapable of measuring the performance of a system under scalable loads. \*[lmbench3] took the ideas and techniques developed in the earlier versions and extended them to create a new timing harness which can measure system performance under parallel, scalable loads. .LP \*[lmbench3] also includes a version of John McCalpin's STREAM benchmarks. Essentially the STREAM kernels were placed in the new \*[lmbench] timing harness. Since the new timing harness also measures scalability under parallel load, the \*[lmbench3] STREAM benchmarks include this capability automatically. .LP Finally, \*[lmbench3] includes a number of new benchmarks which measure various aspects of the processor architecture, such as basic operation latency and parallelism, to provide developers with a better understanding of system capabilities. The hope is that better informed developers will be able to better design and evaluate performance critical software in light of their increased understanding of basic system performance. .NH 1 Prior Work .LP Benchmarking is not a new field of endeavor. There are a wide variety of approaches to benchmarking, many of which differ greatly from that taken by \*[lmbench]. .LP One common form of benchmark is to take an important application or application and worklist, and to measure the time required to complete the entire task. This approach is particularly useful when evaluating the utility of systems for a single and well-known task. .LP Other benchmarks, such as SPECint, use a variation on this approach by measuring several applications and combining the results to predict overall performance. .\" .LP .\" XXX Byte benchmark .LP Another variation takes the "kernel" of an important application and measures its performance, where the "kernel" is usually a simplification of the most expensive portion of a program. Dhrystone .RN Weicker84 is an example of this type of benchmark as it measures the performance of important matrix operations and was often used to predict system performance for numerical operations. .LP .RN Banga98 developed a benchmark to measure HTTP server performance which can accurately measure server performance under high load. Due to the idiosyncracies of the HTTP protocol and TCP design and implementation, there are generally operating system limits on the rate at which a single system can generate independent HTTP requests. However, .RN Banga98 developed a system which can scalably present load to HTTP servers in spite of this limitation. .LP John McCalpin's STREAM benchmark measures memory bandwidth during four common vector operations .RN McCalpin95 . It does not measure memory latency, and strictly speaking it does not measure raw memory bandwith although memory bandwidth is crucial to STREAM performance. More recently, work has begun on extending STREAM to measure scalable memory subsystem performance, particularly for multi-processor machines. .LP Uros Prestor .RN Prestor01 XXX .LP Micro-benchmarking extends this "kernel" approach, by measuring the performance of operations or resources in isolation. 
\*[lmbench] and many other benchmarks, such as nfsstone .RN Shein89 , measure the performance of key operations so users can predict performance for certain workloads and applications by combining the performance of these operations in the right mixture. .LP .RN Saavedra92 takes the micro-benchmark approach and applies it to the problem of predicting application performance. They analyze applications or other benchmarks in terms of their ``narrow spectrum benchmarks'' to create a linear model of the application's computing requirements. They then measure the computer system's performance across this set of micro-benchmarks and use a linear model to predict the application's performance on the computer system. .RN Seltzer99 applied this technique using the features measured by \*[lmbench] as the basis for application prediction. .LP Benchmarking I/O systems has proven particularly troublesome over the years, largely due to the strong non-linearities exhibited by disk systems. Sequential I/O provides much higher bandwidth than non-sequential I/O, so performance is highly dependent on the workload characteristics as well as the file system's ability to capitalize on available sequentiality by laying out data contiguously on disk. .LP I/O benchmarks have a tendency to age poorly. For example, IOStone .RN Park90a , IOBench .RN Wolman89 , and the Andrew benchmark .RN Howard88 used fixed size datasets, whose size was significant at the time, but which no longer measure I/O performance as the data can now fit in the processor cache of many modern machines. .LP The Andrew benchmark attempts to separately measure the time to create, write, re-read, and then delete a large number of files in a hierarchical file system. .LP Bonnie .RN Bray90 measures sequential, streaming I/O bandwidth for a single process, and random I/O latency for multiple processes. .LP Peter Chen developed an adaptive harness for I/O benchmarking .RN Chen94a , which defines I/O load in terms of five parameters, uniqueBytes, sizeMean, readFrac, seqFrac, and processNum. The benchmark then explores the parameter space to measure file system performance in a scalable fashion. .NH 1 Computer Architecture Primer .LP A processor architecture is generally defined by its instruction set, but most computer architectures incorporate a large number of common building blocks and concepts, such as registers, arithmetic logic units, and caches. .LP Of necessity, this primer over-simplifies the many details and variations of specific computer designs and architectures. For more information, please see .RN Hennessy96 . .TSTART 1 .so lmbench3_arch.pic .FEND "Architecture diagram" 1 .LP Figure \n[FIGURE] contains a greatly simplified block diagram of a computer. Various important elements, such as the I/O bus and devices, have been left out. The core of the processor are the registers (r0, ..., rn and f0, ..., fn) and the arithmetic units (ALU and FPU). In general, the arithmetic units can access data in registers ''instantly''. Often data must be explicitly loaded from memory into a register before it can be manipulated by the arithmetic units. .LP The ALU handles integer arithmetic, such as bit operations (AND, OR, XOR, NOT, and SHIFT) as well as ADD, MUL, DIV, and MOD. Sometimes there is specialized hardware to handle one or more operations, such as a barrel shifter for SHIFT or a multiplier, and sometimes there is no hardware support for certain operations, such as MUL, DIV, and MOD. .LP The FPU handles floating point arithmetic. 
Sometimes there are separate FPUs for single and double precision
floating point operations.
.NH 2
Memory Hierarchy
.LP
Nearly all modern, general purpose computers use virtual memory
with physically addressed caches.
As such, there are typically one or more caches between the physical
memory and the processor, and virtual-to-physical address translation
occurs between the processor and the top-level cache.
Cache staging and replacement are done in
\fIcache line\fR
units, which are typically several words in length, and caches lower
in the hierarchy sometimes have cache lines which are larger than those
in the higher caches.
.LP
Modern processors usually incorporate at least an L1 cache on-chip,
and some are starting to also incorporate the L2 cache on-chip.
In addition, most include a translation look-aside buffer (TLB)
on-chip for fast virtual-to-physical address translation.
.LP
One key element of any cache design is its replacement strategy.
Most caches are either direct-mapped or set associative.
In a direct-mapped cache any word in physical memory has exactly one
cache line into which it may be staged, while set associative caches
allow a given word to be cached into one of a set of lines.
Direct-mapped caches have a very simple replacement policy: whatever
is currently in the needed line is discarded.
Set associative caches usually use LRU or some variant within each set,
so the least recently used line in the set of possible cache lines is
replaced.
The control logic for direct-mapped caches is much cheaper to build,
but they are generally only as effective as a set-associative cache
half the size
.RN Hennessy96 .
.LP
Another key element of memory hierarchy design is the management of
dirty data; at what point are writes passed down the memory hierarchy
to lower caches and main memory?
The two basic policies are write-through and write-back.
A write-through policy means that writes are immediately passed through
the cache to the next level in the hierarchy, so the lower levels are
updated at the same time as the cache.
A write-back policy means that the cache line is marked as dirty in the
cache, and only when the line is ejected from the cache is the data
passed down the hierarchy.
Write-through policies are often used in higher (smaller) caches because
multi-processor systems need to keep a coherent view of memory and the
writes are often propagated to other processors by
\fIsnoopy\fR
caches.
.LP
One often overlooked aspect of cache performance is cache behavior
during writes.
Most cache lines contain several words, and most instructions only
update the line a word at a time.
This means that when the processor writes a word to a cache line that
is not present, the cache will read the line from memory before
completing the write operation.
For \*[bcopy]-like operations this means that the overall memory
bandwidth requirement is actually two reads and one write per copied
word, rather than the expected read and write.
.LP
Most modern processors now include some form of prefetch in the memory
hierarchy.
For the most part these are simple systems that can recognize fixed
strided accesses through memory, such as might be seen in many array
operations.
However, prefetching systems appear to be growing in complexity and
capability.
.LP
Additionally, modern memory subsystems can usually support multiple
outstanding requests; the level of parallelism is usually dependent on
the level of the hierarchy being accessed.
Top-level caches can sometimes support as many as six or eight
outstanding requests, while main memory can usually support two
outstanding requests.
Other elements of the memory hierarchy, such as the TLB, often have
additional limits on the level of achievable parallelism in practice.\**
.FS
For example, if the TLB serializes all TLB misses, and if each memory
access causes a TLB miss, then the memory accesses will be serialized
even if the data was in a cache supporting six outstanding requests.
.FE
.LP
For more information and details on memory subsystem design, and
computer architecture in general, please see
.RN Hennessy96
which has an excellent description of these and many other issues.
.NH 1
Timing Harness
.LP
The first, and most crucial, element in extending \*[lmbench2] so that
it could measure scalable performance was to develop a new timing
harness that could accurately measure performance for any given load.
Once this was done, each benchmark could be migrated to the new timing
harness.
.LP
The harness is designed to accomplish a number of goals:
.IP 1.
during any timing interval of any child it is guaranteed that all
other child processes are also running the benchmark
.IP 2.
the timing intervals are long enough to average out most transient
OS scheduler effects
.IP 3.
the timing intervals are long enough to ensure that error due to
clock resolution is negligible
.IP 4.
timing measurements can be postponed to allow the OS scheduler to
settle and adjust to the load
.IP 5.
the reported results should be representative and the data analysis
should be robust
.IP 6.
timing intervals should be as short as possible while ensuring
accurate results
.LP
Developing an accurate timing harness with a valid experimental design
is more difficult than is generally supposed.
Many programs incorporate elementary timing harnesses which may suffer
from one or more defects, such as failing to run the benchmarked
operation long enough for the error introduced by the clock resolution
to be insignificant.
The basic elements of a good timing harness are discussed in
.RN Staelin98 .
.LP
The new timing harness must also collect and process the timing results
from all the child processes so that it can report the representative
performance.
It currently reports the median performance over all timing intervals
from all child processes.
It might perhaps be argued that it should report the median of the
medians.
.LP
Most of the benchmarks now accept a "-P <parallelism>" flag, and the
timing harness does the right thing to measure parallel application
performance.
.LP
When running benchmarks with more than one child, the harness must
first get a baseline estimate of performance by running the benchmark
in only one process using the standard \*[lmbench] timing interval,
which is often 5,000 micro-seconds.
Using this information, the harness can compute the average time per
iteration for a single process, and it uses this figure to compute the
number of iterations necessary to ensure that each child runs for at
least one second.
.NH 2
Clock resolution
.LP
\*[lmbench] uses the \*[gettimeofday] clock, whose interface resolves
time down to 1 micro-second.
However, many system clocks have a resolution of only 10 milli-seconds,
and there is no portable way to query the system to discover the true
clock resolution.
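.LP
The effective granularity can, however, be estimated empirically.
The sketch below is an illustration of the idea rather than the
\*[lmbench] code: it takes back-to-back \*[gettimeofday] samples and
reports the smallest non-zero step it observes.
.DS
#include <stdio.h>
#include <sys/time.h>

int
main(void)
{
        struct timeval  a, b;
        long    delta, min_delta = 1000000;
        int     i;

        for (i = 0; i < 100; ++i) {
                gettimeofday(&a, 0);
                do {    /* spin until the reported time changes */
                        gettimeofday(&b, 0);
                        delta = (b.tv_sec - a.tv_sec) * 1000000 +
                                (b.tv_usec - a.tv_usec);
                } while (delta == 0);
                if (delta < min_delta)
                        min_delta = delta;
        }
        printf("clock appears to advance in ~%ld usec steps\n", min_delta);
        return 0;
}
.DE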
.LP
The problem is that the timing intervals must be substantially larger
than the clock resolution in order to ensure that the timing error
doesn't impact the results.
For example, the measured duration of an event timed with a
10 milli-second clock can vary by $+-$10 milli-seconds from the true
time, assuming that the reported time is always a truncated version of
the true time.
If the clock itself is not updated precisely, the true error can be
even larger.
This implies that timing intervals on these systems should be at least
1 second.
.LP
However, the \*[gettimeofday] clock resolution in most modern systems
is 1 micro-second, so timing intervals can be as small as a few
milli-seconds without incurring significant timing errors related to
clock resolution.
.LP
Since there is no standard interface to query the operating system for
the clock resolution, \*[lmbench] must experimentally determine the
appropriate timing interval duration which provides results in a timely
fashion with a negligible clock resolution error.
.NH 2
Coordination
.LP
Developing a timing harness that correctly manages $N$ processes and
accurately measures system performance over those same $N$ processes
is significantly more difficult than simply measuring system
performance with a single process because of the asynchronous nature
of parallel programming.
.LP
In essence, the new timing harness needs to create $N$ jobs, and
measure the average performance of the target subsystem while all $N$
jobs are running.
This is a standard problem for parallel and distributed programming,
and involves starting the child processes and then stepping through a
handshaking process to ensure that all children have started executing
the benchmarked operation before any child starts taking measurements.
.TSTART 1
.TS
box tab (/) allbox expand ;
c c
l l .
Parent/Child
T{
start up P child processes
T}/T{
run benchmark operation for a little while
T}
T{
wait for P "ready" signals
T}/T{
send a "ready" signal
T}
T{
[sleep for "warmup" microseconds]
T}/T{
run benchmark operation while polling for a "go" signal
T}
T{
send "go" signal to P children
T}/T{
begin timing benchmark operation
T}
T{
wait for P "done" signals
T}/T{
send a "done" signal
T}
T{
for each child, send "results" signal and gather results
T}/T{
run benchmark operation while polling for a "results" signal
T}
T{
collate results
T}/T{
send timing results and wait for "exit" signal
T}
T{
send "exit" signal
T}/T{
exit
T}
.TE
.TEND "Timing harness sequencing"
.LP
Table \n[TABLE] shows how the parent and child processes coordinate
their activities to ensure that all children are actively running the
benchmarked operation while any child could be taking timing
measurements.
.NH 2
Accuracy
.LP
The new timing harness also needs to ensure that the timing intervals
are long enough for the results to be representative.
The previous timing harness assumed that only single process results
were important, and it was able to use timing intervals as short as
possible while ensuring that errors introduced by the clock resolution
were negligible.
In many instances this meant that the timing intervals were smaller
than a single scheduler time slice.
The new timing harness must run benchmarked items long enough to ensure
that timing intervals are longer than a single scheduler time slice.
Otherwise, you can get results which are complete nonsense.
For example, running several copies of an \*[lmbench2] benchmark on a
uni-processor machine will often report that the performance with $N$
jobs running in parallel is equivalent to the performance with a single
job running!\**
.FS
This was discovered by someone who naively attempted to parallelize
\*[lmbench2] in this fashion, and I received a note from the dismayed
developer describing the failed experiment.
.FE
.LP
In addition, since the timing intervals now have to be longer than a
single scheduler time slice, they also need to be long enough so that a
single scheduler time slice is insignificant compared to the timing
interval.
Otherwise the timing results can be dramatically affected by small
variations in the scheduler's behavior.
.NH 2
Resource consumption
.LP
One important design goal was that resource consumption be constant
with respect to the number of child processes.
This is why the harness uses shared pipes to communicate with the
children, rather than having a separate set of pipes to communicate
with each child.
An early design of the system utilized a pair of pipes per child for
communication and synchronization between the master and slave
processes.
However, as the number of child processes grew, the fraction of system
resources consumed by the harness grew and the additional system
overhead could start to interfere with the accuracy of the
measurements.
.LP
Additionally, if the master has to poll (\*[select]) $N$ pipes, then
the system overhead of that operation also scales with the number of
children.
.NH 2
Pipe atomicity
.LP
Since all communication between the master process and the slave
(child) processes is done via a set of shared pipes, we have to ensure
that we never have a situation where a message can be garbled by the
intermingling of two separate messages from two separate children.
This is ensured by either using pipe operations that are guaranteed to
be atomic on all machines, or by coordinating between processes so that
at most one process is writing at a time.
.LP
The atomicity guarantees are provided by having each client communicate
synchronization states in one-byte messages.
For example, the signals from the master to each child are one-byte
messages, so each child only reads a single byte from the pipe.
Similarly, the responses from the children back to the master are also
one-byte messages.
In this way no child can receive partial messages, and no message can
be interleaved with any other message.
.LP
However, using this design means that we need to have a separate pipe
for each \fIbarrier\fR in the process, so the master uses three pipes
to send messages to the children, namely:
\fIstart_signal\fR, \fIresult_signal\fR, and \fIexit_signal\fR.
If a single pipe were used for all three barrier events, then a child
could miss a signal; or, if the signal were encoded into the message, a
child could loop forever: it pulls a signal off the pipe, recognizes
that it has already received that signal, pushes the signal back into
the pipe, and then re-receives the same message it just re-sent.
.LP
In the other direction, all children share a single pipe to send data
back to the master process.
Usually the messages on this pipe are single-byte signals, such as
\fIready\fR or \fIdone\fR.
However, the timing data results need to be sent from the children to
the master and they are (much) larger than a single-byte message.
In this case, the timing harness sends a single-byte message on the \fIresult_signal\fR channel, which can be received by at most one child process. This child then knows that it has sole ownership of the response pipe, and it writes its entire set of timing results to this pipe. Once the master has received all of the timing results from a single child, it sends the next one-byte message on the \fIresult_signal\fR channel to gather the next set of timing results. .TSTART 1 .so lmbench3_signals.pic .FEND "Control signals" 1 .LP The design of the signals is shown in Figure \n[FIGURE]. .NH 2 Benchmark initialization .LP By allowing the benchmark to specify an initialization routine that is run in the child processes, the new timing harness allows benchmarks to do either or both global initializations that are shared by all children and specific per-child initializations that are done independently by each child. Global initialization is done in the master process before the \*[benchmp] harness is called, so the state is preserved across the \*[fork] operations. Per-child initialization is done inside the \*[benchmp] harness by the optional initialization routine and is done after the \*[fork] operation. .LP Similarly, each benchmark is allowed to specify a cleanup routine that is run by the child processes just before exiting. This allows the benchmark routines to release any resources that they may have used during the benchmark. Most system resources would be automatically released on process exit, such as file descriptors and shared memory segments, but some resources such as temporary files might need to be explicitly released by the benchmark. .NH 2 Scheduler transients .LP Particularly on multi-processor systems, side-effects of process migration can dramatically affect program runtimes. For example, if the processes are all initially assigned to the same processor as the parent process, and the timing is done before the scheduler migrates the processes to other available processors, then the system performance will appear to be that of a uniprocessor. Similarly, if the scheduler is over-enthusiastic about re-assigning processes to processors, then performance will be worse than necessary because the processes will keep encountering cold caches and will pay exhorbitant memory access costs. .LP The first case is a scheduler transient, and users may not want to measure such transient phenomena if their primary interest is in predicting performance for long-running programs. Conversely, that same user would be extraordinarily interested in the second phenomena. The harness was designed to allow users to specify that the benchmarked processes are run for long enough to (hopefully) get the scheduler past the transient startup phase, so it can measure the steady-state behavior. .NH 2 Data analysis .LP Analyzing the data to produce representative results is a crucial step in the benchmarking process. \*[lmbench] generally reports the \fImedian\fP result for $11$ measurements. Most benchmarks report the results of a single measurement .RN Howard88 , an average of several results .RN McCalpin95 , or a trimmed mean .RN Brown97 . XXX UNKNOWN: .RN Weicker84,Shein89,Park,Wolman89,Banga97,Saavedra92,Chen94a,Bray90 .LP Since \*[lmbench] is able to use timing intervals that are often smaller than a scheduler time slice, the raw timing results are often severely skewed. The median is preferable to the mean when the data can be very skewed .RN Jain91 . 
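.LP
Selecting the median itself is straightforward; a sketch (assuming the
per-interval results have already been reduced to a common
microseconds-per-operation form) is:
.DS
#include <stdlib.h>

static int
cmp_double(const void *a, const void *b)
{
        double  d = *(const double *)a - *(const double *)b;

        return (d < 0.0) ? -1 : (d > 0.0) ? 1 : 0;
}

double
median(double *results, int n)
{
        qsort(results, n, sizeof(double), cmp_double);
        return results[n / 2];
}
.DE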
.LP
In some instances, however, \*[lmbench] internally uses the
\fIminimum\fP rather than the median, such as in \*[mhz].
In those instances, we are not trying to find the \fIrepresentative\fP
value, but rather the \fIminimum\fP value.
There are only a few sources of error which could cause the measured
timing result to be shorter than the true elapsed time: the system
clock being adjusted, or round-off error due to the clock resolution.
The timing interval duration is set to ensure that the round-off error
is bounded to 1% of the timing interval, and we blithely assume that
people don't reset their system clocks while benchmarking their
systems.
.LP
\*[lmbench] does not currently report any statistics representing
measurement variation, such as the difference between the first and
third quartiles.
.NH 1
Interface
.LP
Unfortunately we had to move away from the macro-based timing harness
used in \*[lmbench2] and migrate to a function-based system.
.LP
The new interface looks like:
.DS
typedef void (*bench_f)(uint64 iterations, void* cookie);
typedef void (*support_f)(void* cookie);

extern void benchmp(support_f initialize, bench_f benchmark,
        support_f cleanup, int enough, int parallel, int warmup,
        int repetitions, void* cookie);
.DE
.LP
A brief description of the parameters:
.IP \fIenough\fR
can be used to ensure that a timing interval is at least \fIenough\fP
microseconds in duration.
For most benchmarks this should be zero, but some benchmarks have to
run for more time due to startup effects or other strange behavior.
.IP \fIparallel\fR
is simply the number of instances of the benchmark that will be run in
parallel on the system.
.IP \fIwarmup\fR
can be used to force the benchmark to run for \fIwarmup\fP microseconds
before the system starts making timing measurements.
Note that it is a lower bound, not a fixed value, since it is simply
the time that the parent sleeps after receiving the last "ready" signal
from each child (and before it sends the "go" signal to the children).
.IP \fIrepetitions\fR
is the number of times the experiment should be repeated.
The default is eleven.
.IP \fIcookie\fR
is a pointer that can be used by the benchmark writer to pass in
configuration information, such as buffer size or other parameters
needed by the inner loop.
In \*[lmbench3] it is generally used to point to a structure containing
the relevant configuration information.
.LP
To write a simple benchmark for getppid() all you would need to do is:
.DS
void
benchmark_getppid(uint64 iterations, void* cookie)
{
	while (iterations-- > 0) {
		getppid();
	}
}
.DE
.LP
and then somewhere in your program you might call:
.DS
benchmp(NULL, benchmark_getppid, NULL, 0, 1, 0, TRIES, NULL);
micro("getppid", get_n());
.DE
.LP
A more complex example which has "state" and uses the initialization
and cleanup capabilities might look something like this:
.DS
struct bcopy_state {
	int	len;
	char*	src;
	char*	dst;
};
.DE
.DS
void
initialize_bcopy(void* cookie)
{
	struct bcopy_state* state = (struct bcopy_state*)cookie;

	state->src = valloc(state->len);
	state->dst = valloc(state->len);
	bzero(state->src, state->len);
	bzero(state->dst, state->len);
}
.DE
.DS
void
benchmark_bcopy(uint64 iterations, void* cookie)
{
	struct bcopy_state* state = (struct bcopy_state*)cookie;

	while (iterations-- > 0) {
		bcopy(state->src, state->dst, state->len);
	}
}
.DE
.DS
void
cleanup_bcopy(void* cookie)
{
	struct bcopy_state* state = (struct bcopy_state*)cookie;

	free(state->src);
	free(state->dst);
}
.DE
.LP
and then your program might look something like:
.DS
#include "bench.h"

int
main()
{
	struct bcopy_state state;

	state.len = 8 * 1024 * 1024;
	benchmp(initialize_bcopy, benchmark_bcopy, cleanup_bcopy,
		0, 1, 0, TRIES, &state);
	fprintf(stderr, "bcopy: ");
	mb(state.len * get_n());
	exit(0);
}
.DE
.LP
Note that this particular micro-benchmark would measure cache-to-cache
\*[bcopy] performance unless the amount of memory being copied was
larger than half the cache size.
A slightly more sophisticated approach might allocate as much memory as
possible and then \*[bcopy] from one segment to another, changing
segments within the allocated memory before each \*[bcopy] to defeat
the caches.
.NH 1
Benchmarks
.LP
\*[lmbench] contains a large number of micro-benchmarks that measure
various aspects of hardware and operating system performance.
The benchmarks generally measure latency or bandwidth, but some new
benchmarks also measure parallelism.
.TSTART
.TS
center box tab (&);
c c
l & l .
Name&Measures _ &Bandwidth bw_file_rd&T{ \*[read] and then load into processor T} bw_mem&T{ read, write, and copy data to/from memory T} bw_mmap_rd&read from \*[mmap]'ed memory bw_pipe&\*[pipe] inter-process data copy bw_tcp&TCP inter-process data copy bw_unix&UNIX inter-process _ &Latency lat_connect&TCP socket connection lat_ctx&T{ context switch via \*[pipe]-based ``hot-potato'' token passing T} lat_fcntl&\*[fcntl] operation lat_fifo&T{ FIFO ``hot-potato'' token passing T} lat_fs&file creation and deletion lat_http&http GET request latency lat_mem_rd&memory read lat_mmap&\*[mmap] operation lat_ops&basic operations lat_pagefault&page fault handler lat_pipe&\*[pipe] ``hot-potato'' token passing lat_proc&T{ procedure call overhead and process creation using \*[fork], \*[fork] and \*[execve], and \*[fork] and \*[sh] T} lat_rpc&SUN RPC procedure call lat_select&\*[select] lat_sem&T{ semaphore ``hot-potato'' token passing T} lat_sig&T{ signal handle installation and handling T} lat_syscall&\*[getppid], \*[write], \*[stat], \*[fstat], \*[open], \*[close] lat_tcp&TCP ``hot-potato'' token passing lat_udp&UDP ``hot-potato'' token passing lat_unix&UNIX ``hot-potato'' token passing lat_unix_connect&UNIX socket connection _ &Parallelism par_mem&memory subsystem par_ops&T{ instruction-level parallelism of basic arithmetic operations T} _ mhz&CPU clock frequency line&cache line size tlb&number of pages mapped by TLB stream&STREAM clones lmdd&\fIdd\fR clone .TE .TEND "\*[lmbench] micro-benchmarks" .LP Table \n[TABLE] contains the full list of micro-benchmarks in \*[lmbench]. .NH 2 Bandwidth .LP .LP By bandwidth, we mean the rate at which a particular facility can move data. We attempt to measure the data movement ability of a number of different facilities: library \*[bcopy], hand-unrolled \*[bcopy], direct-memory read and write (no copying), pipes, TCP sockets, the \*[read] interface, and the \*[mmap] interface. .NH 2 Memory bandwidth .LP Data movement is fundamental to any operating system. In the past, performance was frequently measured in MFLOPS because floating point units were slow enough that microprocessor systems were rarely limited by memory bandwidth. Today, floating point units are usually much faster than memory bandwidth, so many current MFLOP ratings can not be maintained using memory-resident data; they are ``cache only'' ratings. .LP We measure the ability to copy, read, and write data over a varying set of sizes. There are too many results to report all of them here, so we concentrate on large memory transfers. .LP We measure copy bandwidth two ways. The first is the user-level library \*[bcopy] interface. The second is a hand-unrolled loop that loads and stores aligned 8-byte words. In both cases, we took care to ensure that the source and destination locations would not map to the same lines if the any of the caches were direct-mapped. In order to test memory bandwidth rather than cache bandwidth, both benchmarks copy an 8M\** area to another 8M area. (As secondary caches reach 16M, these benchmarks will have to be resized to reduce caching effects.) .FS Some of the PCs had less than 16M of available memory; those machines copied 4M. .FE .LP The copy results actually represent one-half to one-third of the memory bandwidth used to obtain those results since we are reading and writing memory. If the cache line size is larger than the word stored, then the written cache line will typically be read before it is written. 
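.LP
For reference, the hand-unrolled copy is in the spirit of the following
sketch, which assumes aligned 8-byte words (C99 \f(CWuint64_t\fP);
it is an illustration, not the exact \*[lmbench] inner loop.
.DS
#include <stddef.h>
#include <stdint.h>

void
unrolled_copy(uint64_t *dst, uint64_t *src, size_t nwords)
{
        size_t  i;

        for (i = 0; i + 8 <= nwords; i += 8) {  /* eight words per pass */
                dst[i+0] = src[i+0]; dst[i+1] = src[i+1];
                dst[i+2] = src[i+2]; dst[i+3] = src[i+3];
                dst[i+4] = src[i+4]; dst[i+5] = src[i+5];
                dst[i+6] = src[i+6]; dst[i+7] = src[i+7];
        }
        for (; i < nwords; ++i)                 /* copy any tail words */
                dst[i] = src[i];
}
.DE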
The actual amount of memory bandwidth used varies because some architectures have special instructions specifically designed for the \*[bcopy] function. Those architectures will move twice as much memory as reported by this benchmark; less advanced architectures move three times as much memory: the source data read, the destination data read because it is about to be overwritten, and the destination data written. .LP The \*[bcopy] results reported in Table 2 may be correlated with John McCalpin's \*[stream] .RN McCalpin95 benchmark results in the following manner: the \*[stream] benchmark reports all of the memory moved whereas the \*[bcopy] benchmark reports the bytes copied. So our numbers should be approximately one-half to one-third of his numbers. .LP Memory reading is measured by an unrolled loop that sums up a series of integers. On most (perhaps all) systems measured, the integer size is 4 bytes. The loop is unrolled such that most compilers generate code that uses a constant offset with the load, resulting in a load and an add for each word of memory. The add is an integer add that completes in one cycle on all of the processors. Given that today's processors typically cycle at 10 or fewer nanoseconds (ns) and that memory is typically 200-1,000 ns per cache line, the results reported here should be dominated by the memory subsystem, not the processor add unit. .LP The memory contents are added up because almost all C compilers would optimize out the whole loop when optimization was turned on, and would generate far too many instructions without optimization. The solution is to add up the data and pass the result as an unused argument to the ``finish timing'' function. .LP Memory reads represent about one-third to one-half of the \*[bcopy] work, and we expect that pure reads should run at roughly twice the speed of \*[bcopy]. Exceptions to this rule should be studied, for exceptions indicate a bug in the benchmarks, a problem in \*[bcopy], or some unusual hardware. .TSTART .so bw_allmem.tbl .TEND "Memory bandwidth (MB/s)" .LP Memory writing is measured by an unrolled loop that stores a value into an integer (typically a 4 byte integer) and then increments the pointer. The processor cost of each memory operation is approximately the same as the cost in the read case. .LP The numbers reported in Table \n[TABLE] are not the raw hardware speed in some cases. The Power2\** is capable of up to 800M/sec read rates .FS Someone described this machine as a $1,000 processor on a $99,000 memory subsystem. .FE .RN McCalpin95 and HP PA RISC (and other prefetching) systems also do better if higher levels of code optimization are used and/or the code is hand tuned. .LP The Sun libc bcopy in Table \n[TABLE] is better because it uses a hardware-specific bcopy routine built on instructions new in SPARC V9 that were added specifically for memory movement. .LP The Pentium Pro read rate in Table \n[TABLE] is much higher than the write rate because, according to Intel, the write transaction turns into a read followed by a write to maintain cache consistency for MP systems. .NH 2 IPC bandwidth .LP Interprocess communication bandwidth is frequently a performance issue. Many Unix applications are composed of several processes communicating through pipes or TCP sockets. Examples include the \f(CWgroff\fP documentation system that prepared this paper, the \f(CWX Window System\fP, remote file access, and \f(CWWorld Wide Web\fP servers. .LP Unix pipes are an interprocess communication mechanism implemented as a one-way byte stream.
Each end of the stream has an associated file descriptor; one is the write descriptor and the other the read descriptor. TCP sockets are similar to pipes except they are bidirectional and can cross machine boundaries. .LP Pipe bandwidth is measured by creating two processes, a writer and a reader, which transfer 50M of data in 64K transfers. The transfer size was chosen so that the overhead of system calls and context switching would not dominate the benchmark time. The reader prints the timing results, which guarantees that all data has been moved before the timing is finished. .LP TCP bandwidth is measured similarly, except the data is transferred in 1M page aligned transfers instead of 64K transfers. If the TCP implementation supports it, the send and receive socket buffers are enlarged to 1M, instead of the default 4-60K. We have found that setting the transfer size equal to the socket buffer size produces the greatest throughput over the most implementations. .TSTART .so bw_ipc.tbl .TEND "Pipe and local TCP bandwidth (MB/s)" .LP \*[bcopy] is important to this test because the pipe write/read is typically implemented as a \*[bcopy] into the kernel from the writer and then a \*[bcopy] from the kernel to the reader. Ideally, these results would be approximately one-half of the \*[bcopy] results. It is possible for the kernel \*[bcopy] to be faster than the C library \*[bcopy] since the kernel may have access to \*[bcopy] hardware unavailable to the C library. .LP It is interesting to compare pipes with TCP because the TCP benchmark is identical to the pipe benchmark except for the transport mechanism. Ideally, the TCP bandwidth would be as good as the pipe bandwidth. It is not widely known that the majority of the TCP cost is in the \*[bcopy], the checksum, and the network interface driver. The checksum and the driver may be safely eliminated in the loopback case and if the costs have been eliminated, then TCP should be just as fast as pipes. From the pipe and TCP results in Table \n[TABLE], it is easy to see that Solaris and HP-UX have done this optimization. .LP Bcopy rates in Table \n[TABLE] can be lower than pipe rates because the pipe transfers are done in 64K buffers, a size that frequently fits in caches, while the bcopy is typically an 8M-to-8M copy, which does not fit in the cache. .LP In Table \n[TABLE], the SGI Indigo2, a uniprocessor, does better than the SGI MP on pipe bandwidth because of caching effects - in the UP case, both processes share the cache; on the MP, each process is communicating with a different cache. .LP All of the TCP results in Table \n[TABLE] are in loopback mode \(em that is both ends of the socket are on the same machine. It was impossible to get remote networking results for all the machines included in this paper. We are interested in receiving more results for identical machines with a dedicated network connecting them. The results we have for over the wire TCP bandwidth are shown below. .TSTART .so bw_tcp.tbl .TEND "Remote TCP bandwidth (MB/s)" .LP The SGI using 100MB/s Hippi is by far the fastest in Table \n[TABLE]. The SGI Hippi interface has hardware support for TCP checksums and the IRIX operating system uses virtual memory tricks to avoid copying data as much as possible. For larger transfers, SGI Hippi has reached 92MB/s over TCP. .LP 100baseT is looking quite competitive when compared to FDDI in Table \n[TABLE], even though FDDI has packets that are almost three times larger. 
We wonder how long it will be before we see gigabit ethernet interfaces. .NH 2 Cached I/O bandwidth .LP Experience has shown us that reusing data in the file system page cache can be a performance issue. This section measures that operation through two interfaces, \*[read] and \*[mmap]. The benchmark here is not an I/O benchmark in that no disk activity is involved. We wanted to measure the overhead of reusing data, an overhead that is CPU intensive, rather than disk intensive. .LP The \*[read] interface copies data from the kernel's file system page cache into the process's buffer, using 64K buffers. The transfer size was chosen to minimize the kernel entry overhead while remaining realistically sized. .LP The difference between the \*[bcopy] and the \*[read] benchmarks is the cost of the file and virtual memory system overhead. In most systems, the \*[bcopy] speed should be faster than the \*[read] speed. The exceptions usually have hardware specifically designed for the \*[bcopy] function and that hardware may be available only to the operating system. .LP The \*[read] benchmark is implemented by rereading a file (typically 8M) in 64K buffers. Each buffer is summed as a series of integers in the user process. The summing is done for two reasons: for an apples-to-apples comparison the memory-mapped benchmark needs to touch all the data, and the file system can sometimes transfer data into memory faster than the processor can read the data. For example, \s-1SGI\s0's XFS can move data into memory at rates in excess of 500M per second, but it can move data into the cache at only 68M per second. The intent is to measure performance delivered to the application, not DMA performance to memory. .TSTART .so bw_reread2.tbl .TEND "File vs. memory bandwidth (MB/s)" .LP The \*[mmap] interface provides a way to access the kernel's file cache without copying the data. The \*[mmap] benchmark is implemented by mapping the entire file (typically 8M) into the process's address space. The file is then summed to force the data into the cache. .LP In Table \n[TABLE], a good system will have \fIFile read\fP as fast as (or even faster than) \fILibc bcopy\fP because as the file system overhead goes to zero, the file reread case is virtually the same as the library \*[bcopy] case. However, file reread can be faster because the kernel may have access to \*[bcopy] assist hardware not available to the C library. Ideally, \fIFile mmap\fP performance should approach \fIMemory read\fP performance, but \*[mmap] is often dramatically worse. Judging by the results, this looks to be a potential area for operating system improvements. .LP In Table \n[TABLE], the Power2 does better on file reread than bcopy because it takes full advantage of the memory subsystem from inside the kernel. The mmap reread is probably slower because of the lower clock rate; the page faults start to show up as a significant cost. .LP It is surprising that the Sun Ultra1 was able to bcopy at the high rates shown in Table 2 but did not show those rates for file reread in Table \n[TABLE]. HP has the opposite problem: it gets file reread faster than bcopy, perhaps because the kernel \*[bcopy] has access to hardware support. .LP The Unixware system has outstanding mmap reread rates, better than systems of substantially higher cost. Linux needs to do some work on the \f(CWmmap\fP code.
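.LP
To make the \fIFile read\fP measurement concrete, the core of the reread benchmark is essentially a read-and-sum loop.
The following is only an illustrative sketch; the helper shown here is not part of \*[lmbench], and the error handling and the actual timing harness are omitted:
.DS
#include <fcntl.h>
#include <unistd.h>

#define CHUNK	(64 * 1024)	/* 64K reads, as described above */

int
sum_file(char* filename)
{
	static int	buf[CHUNK / sizeof(int)];
	int		sum = 0;
	int		fd = open(filename, O_RDONLY);
	ssize_t		n;
	size_t		i;

	while ((n = read(fd, buf, CHUNK)) > 0) {
		/* touch every word so the data is delivered to the
		 * processor, not merely copied into the user buffer
		 */
		for (i = 0; i < n / sizeof(int); ++i)
			sum += buf[i];
	}
	close(fd);
	return (sum);	/* returned so the summing is not optimized away */
}
.DE
.LP
The \*[mmap] variant of the benchmark maps the file and performs the same summation in place, so the copy into a user buffer disappears while the page-mapping costs remain.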
.NH 2 Latency .LP Latency is an often-overlooked area of performance problems, possibly because resolving latency issues is frequently much harder than resolving bandwidth issues. For example, memory bandwidth may be increased by making wider cache lines and increasing memory ``width'' and interleave, but memory latency can be improved only by shortening paths or increasing (successful) prefetching. The first step toward improving latency is understanding the current latencies in a system. .LP The latency measurements included in this suite are memory latency, basic operating system entry cost, signal handling cost, process creation times, context switching, interprocess communication, .\" virtual memory system latency, file system latency, and disk latency. .NH 2 Memory read latency background .LP In this section, we expend considerable effort to define the different memory latencies and to explain and justify our benchmark. The background is a bit tedious but important, since we believe the memory latency measurements to be one of the most thought-provoking and useful measurements in \*[lmbench]. .LP The most basic latency measurement is memory latency since most of the other latency measurements can be expressed in terms of memory latency. For example, context switches require saving the current process state and loading the state of the next process. However, memory latency is rarely accurately measured and frequently misunderstood. .LP Memory read latency has many definitions; the most common, in increasing time order, are memory chip cycle time, processor-pins-to-memory-and-back time, load-in-a-vacuum time, and back-to-back-load time. .BU "Memory chip cycle latency" : Memory chips are rated in nanoseconds; typical speeds are around 60ns. A general overview on DRAM architecture may be found in .RN Hennessy96 . The specific information we describe here is from .RN Toshiba94 and pertains to the \s-1THM361020AS-60\s0 module and \s-1TC514400AJS\s0 \s-1DRAM\s0 used in \s-1SGI\s0 workstations. The 60ns time is the time from .ps -1 .nr width \w'R\&A\&S' .nr height \n[rst]+1000 RAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' .ps assertion to the when the data will be available on the \s-1DRAM\s0 pins (assuming .ps -1 .nr width \w'C\&A\&S' .nr height \n[rst]+1000 CAS\v'-\n[height]u'\h'-\n[width]u'\fB\l'\n[width]u'\fP\v'\n[height]u' .ps access time requirements were met). While it is possible to get data out of a \s-1DRAM\s0 in 60ns, that is not all of the time involved. There is a precharge time that must occur after every access. .RN Toshiba94 quotes 110ns as the random read or write cycle time and this time is more representative of the cycle time. .\" For example, most systems offer a wide range of memory .\" capacity, from 64MB to 1GB or more. If 64MB simms are used, the number .\" of simms range from 1 to 16. The more simms there are, the more .\" capacitance there is in the memory subsystem. More capacitance means .\" longer setup times for the fully populated memory subsystem. System .\" designers have to allow time for this setup. .\" For more details, consult [XXX - reference on DRAM]. .\" This is sometimes referred to as the chip latency. The .\" chip cycle time is the chip latency plus the time required to restore .\" the data in the capacitors which is often referred to as the precharge .\" time. This means that 60 nanosecond memory chips really are more like .\" 100 nanosecond memory chips. 
Some systems operate memory in ``page .\" mode'' or ``static column'' memory systems hold either RAS or CAS and .\" allow subsequent accesses in the same row or column in one cycle instead .\" of two. .BU "Pin-to-pin latency" : This number represents the time needed for the memory request to travel from the processor's pins to the memory subsystem and back again. Many vendors have used the pin-to-pin definition of memory latency in their reports. For example, .RN Fenwick95 while describing the \s-1DEC\s0 8400 quotes memory latencies of 265ns; a careful reading of that paper shows that these are pin-to-pin numbers. In spite of the historical precedent in vendor reports, this definition of memory latency is misleading since it ignores actual delays seen when a load instruction is immediately followed by a use of the data being loaded. The number of additional cycles inside the processor can be significant and grows more significant with today's highly pipelined architectures. .LP It is worth noting that the pin-to-pin numbers include the amount of time it takes to charge the lines going to the \s-1SIMM\s0s, a time that increases with the (potential) number of \s-1SIMM\s0s in a system. More \s-1SIMM\s0s mean more capacitance, which requires longer charge times. This is one reason why personal computers frequently have better memory latencies than workstations: the PCs typically have less memory capacity. .BU "Load-in-a-vacuum latency" : A load in a vacuum is the time that the processor will wait for one load that must be fetched from main memory (i.e., a cache miss). The ``vacuum'' means that there is no other activity on the system bus, including no other loads. While this number is frequently used as the memory latency, it is not very useful. It is basically a ``not to exceed'' number important only for marketing reasons. Some architects point out that since most processors implement nonblocking loads (the load does not cause a stall until the data is used), the perceived load latency may be much less than the real latency. When pressed, however, most will admit that cache misses occur in bursts, resulting in perceived latencies of at least the load-in-a-vacuum latency. .BU "Back-to-back-load latency" : Back-to-back-load latency is the time that each load takes, assuming that the instructions before and after are also cache-missing loads. Back-to-back loads may take longer than loads in a vacuum for the following reason: many systems implement something known as \fIcritical word first\fP, which means that the subblock of the cache line that contains the word being loaded is delivered to the processor before the entire cache line has been brought into the cache. If another load occurs quickly enough after the processor gets restarted from the current load, the second load may stall because the cache is still busy filling the cache line for the previous load. On some systems, such as the current implementation of UltraSPARC, the difference between back to back and load in a vacuum is about 35%. .LP \*[lmbench] measures back-to-back-load latency because it is the only one that may be easily measured from software and because we feel that it is what most software developers consider to be memory latency. Consider the following C code fragment:
.DS
.nf
.ft CW
p = head;
while (p->p_next)
	p = p->p_next;
.ft
.fi
.DE
On a \s-1DEC\s0 Alpha, the loop part turns into three instructions, including the load.
A 300 Mhz processor has a 3.33ns cycle time, so the loop could execute in slightly less than 10ns. However, the load itself takes 400ns on a 300 Mhz \s-1DEC\s0 8400. In other words, the instructions cost 10ns but the load stalls for 400. Another way to look at it is that 400/3.3, or 121, nondependent, nonloading instructions following the load would be needed to hide the load latency. Because superscalar processors typically execute multiple operations per clock cycle, they need even more useful operations between cache misses to keep the processor from stalling. .LP This benchmark illuminates the tradeoffs in processor cache design. Architects like large cache lines, up to 64 bytes or so, because the prefetch effect of gathering a whole line increases hit rate given reasonable spatial locality. Small stride sizes have high spatial locality and should have higher performance, but large stride sizes have poor spatial locality causing the system to prefetch useless data. So the benchmark provides the following insight into negative effects of large line prefetch: .BU Multi-cycle fill operations are typically atomic events at the caches, and sometimes block other cache accesses until they complete. .BU Caches are typically single-ported. Having a large line prefetch of unused data causes extra bandwidth demands at the cache, and can cause increased access latency for normal cache accesses. .LP In summary, we believe that processors are so fast that the average load latency for cache misses will be closer to the back-to-back-load number than to the load-in-a-vacuum number. We are hopeful that the industry will standardize on this definition of memory latency. .NH 2 Memory read latency .LP The entire memory hierarchy can be measured, including on-board data cache latency and size, external data cache latency and size, and main memory latency. Instruction caches are not measured. TLB miss latency can also be measured, as in .RN Saavedra92 , but we stopped at main memory. Measuring TLB miss time is problematic because different systems map different amounts of memory with their TLB hardware. .LP The benchmark varies two parameters, array size and array stride. For each size, a list of pointers is created for all of the different strides. Then the list is walked thus: .DS .ft CW mov r4,(r4) # C code: p = *p; .ft .DE The time to do about 1,000,000 loads (the list wraps) is measured and reported. The time reported is pure latency time and may be zero even though the load instruction does not execute in zero time. Zero is defined as one clock cycle; in other words, the time reported is \fBonly\fP memory latency time, as it does not include the instruction execution time. It is assumed that all processors can do a load instruction in one processor cycle (not counting stalls). In other words, if the processor cache load time is 60ns on a 20ns processor, the load latency reported would be 40ns, the additional 20ns is for the load instruction itself.\** .FS In retrospect, this was a bad idea because we calculate the clock rate to get the instruction execution time. If the clock rate is off, so is the load time. .FE Processors that can manage to get the load address out to the address pins before the end of the load cycle get some free time in this benchmark (we don't know of any processors that do that). .LP This benchmark has been validated by logic analyzer measurements on an \s-1SGI\s0 Indy by Ron Minnich while he was at the Maryland Supercomputer Research Center. 
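.LP
A stripped-down sketch of this measurement is shown below.
It is not the actual \*[lmbench] code \(em the real benchmark runs under the timing harness and takes additional care to defeat compiler optimizations \(em and the helpers here are illustrative only, but it shows how a circular chain of pointers, one every \fIstride\fP bytes, can be built and then walked with one dependent load per step:
.DS
#include <stdlib.h>

char**
make_chain(size_t size, size_t stride)
{
	/* lay a circular pointer chain over `size' bytes of memory,
	 * with one pointer every `stride' bytes
	 */
	char*	base = valloc(size);
	char**	last = (char**)base;
	size_t	off;

	for (off = stride; off < size; off += stride) {
		*last = base + off;
		last = (char**)(base + off);
	}
	*last = base;		/* wrap back to the start */
	return ((char**)base);
}

void
walk(char** p, long loads)
{
	static void* volatile sink;

	while (loads-- > 0)
		p = (char**)*p;	/* the measured load: p = *p; */
	sink = (void*)p;	/* keep the compiler from removing the loop */
}
.DE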
.TSTART 1 .so mem.pic .FEND "Memory latency" 1 .LP Results from the memory latency benchmark are plotted as a series of data sets as shown in Figure \n[FIGURE]. Each data set represents a stride size, with the array size varying from 512 bytes up to 8M or more. The curves contain a series of horizontal plateaus, where each plateau represents a level in the memory hierarchy. The point where each plateau ends and the line rises marks the end of that portion of the memory hierarchy (e.g., external cache). Most machines have similar memory hierarchies: on-board cache, external cache, main memory, and main memory plus TLB miss costs. There are variations: some processors are missing a cache, while others add another cache to the hierarchy. .\" XXX Larry please double-check this; I am going on dim memory... For example, the Alpha 8400 has two on-board caches, one 8K and the other 96K. .LP The cache line size can be derived by comparing curves and noticing which strides are faster than main memory times. The smallest stride that is the same as main memory speed is likely to be the cache line size because the strides that are faster than memory are getting more than one hit per cache line. .\" Prefetching may confuse .\" the issue because a demand read may stall behind a prefetch load, .\" causing cache lines to appear twice as large as they are. .\" XXX .\" Larry --- can we use prime modulus arithmetic to set up pointer .\" loops which might appear random but which really aren't and which .\" hit every stride once before looping? .\" .\" XXX .\" Larry --- is there any way we can defeat/disable prefetching .\" so the cache line size can be more accurately determined? .\" .\" XXX .\" Larry --- can we create a benchmark for TLB misses? .\" I think it was Tom Rokicki who suggested that we create a .\" benchmark where the data fits in the cache, but the pages don't .\" fit in the TLB. .\" .\" XXX .\" Larry --- is the description of the memory hierarchy correct? .\" I am not sure I haven't added an extra level of external cache... .EQ delim $$ .EN .LP Figure \n[FIGURE] shows memory latencies on a nicely made machine, a \s-1DEC\s0 Alpha. We use this machine as the example because it shows the latencies and sizes of the on-chip level 1 and motherboard level 2 caches, and because it has good all-around numbers, especially considering it can support a 4M level 2 cache. The on-board cache is $2 sup 13$ bytes or 8K, while the external cache is $2 sup 19$ bytes or 512K. .EQ delim off .EN .TSTART .so lat_allmem.tbl .TEND "Cache and memory latency (ns)" .nr MEMTABLE \n[TABLE] .LP Table \n[TABLE] shows the cache size, cache latency, and main memory latency as extracted from the memory latency graphs. The graphs and the tools for extracting the data are included with \*[lmbench]. It is worthwhile to plot all of the graphs and examine them since the table is missing some details, such as the \s-1DEC\s0 Alpha 8400 processor's second 96K on-chip cache. .LP We sorted Table \n[TABLE] on level 2 cache latency because we think that many applications will fit in the level 2 cache. The HP and IBM systems have only one level of cache so we count that as both level 1 and level 2. Those two systems have remarkable cache performance for caches of that size. In both cases, the cache delivers data in one clock cycle after the load instruction. .LP HP systems usually focus on large caches as close as possible to the processor. 
An older HP multiprocessor system, the 9000/890, has a 4M, split I&D, direct mapped cache with a 2K victim cache, accessible in one clock (16ns).\** That system is primarily a database server. .FS The Usenix version of this paper had this as a set associative cache; that was incorrect. .FE .LP The IBM focus is on low latency, high bandwidth memory. The IBM memory subsystem is good because all of memory is close to the processor, but has the weakness that it is extremely difficult to evolve the design to a multiprocessor system. .LP The 586 and PowerPC motherboards have quite poor second level caches; the caches are not substantially better than main memory. .LP The Pentium Pro and Sun Ultra second level caches are of medium speed at 5-6 clocks latency each. 5-6 clocks seems fast until it is compared against the HP and IBM one cycle latency caches of similar size. Given the tight integration of the Pentium Pro level 2 cache, it is surprising that it has such high latencies. .LP The 300Mhz DEC Alpha has a rather high 22 clock latency to the second level cache, which is probably one of the reasons that they needed a 96K level 1.5 cache. SGI and DEC have used large second level caches to hide their long latency from main memory. .NH 2 Operating system entry .LP Entry into the operating system is required for many system facilities. When calculating the cost of a facility, it is useful to know how expensive it is to perform a nontrivial entry into the operating system. .LP We measure nontrivial entry into the system by repeatedly writing one word to \f(CW/dev/null\fP, a pseudo device driver that does nothing but discard the data. This particular entry point was chosen because it has never been optimized in any system that we have measured. Other entry points, typically \*[getpid] and \*[gettimeofday], are heavily used, heavily optimized, and sometimes implemented as user-level library routines rather than system calls. A write to the \f(CW/dev/null\fP driver will go through the system call table to \*[write], verify the user area as readable, look up the file descriptor to get the vnode, call the vnode's write function, and then return. .TSTART .so lat_nullsys.tbl .TEND "Simple system call time (microseconds)" .LP Linux is the clear winner in the system call time. The reasons are twofold: Linux is a uniprocessor operating system, without any MP overhead, and Linux is a small operating system, without all of the ``features'' accumulated by the commercial offerings. .LP Unixware and Solaris are doing quite well, given that they are both fairly large, commercially oriented operating systems with a large accumulation of ``features.'' .NH 2 Signal handling cost .LP Signals in Unix are a way to tell another process to handle an event. They are to processes as interrupts are to the CPU. .LP Signal handling is often critical to layered systems. Some applications, such as databases, software development environments, and threading libraries, provide an operating system-like layer on top of the operating system, making signal handling a critical path in many of these applications. .LP \*[lmbench] measures both signal installation and signal dispatching in two separate loops, within the context of one process. It measures signal handling by installing a signal handler and then repeatedly sending itself the signal. .TSTART .so lat_signal.tbl .TEND "Signal times (microseconds)" .LP Table \n[TABLE] shows the signal handling costs.
Note that there are no context switches in this benchmark; the signal goes to the same process that generated the signal. In real applications, the signals usually go to another process, which implies that the true cost of sending that signal is the signal overhead plus the context switch overhead. We wanted to measure signal and context switch overheads separately since context switch times vary widely among operating systems. .LP SGI does very well on signal processing, especially since their hardware is of an older generation than many of the others. .LP The Linux/Alpha signal handling numbers are so poor that we suspect that this is a bug, especially given that the Linux/x86 numbers are quite reasonable. .NH 2 Process creation costs .LP Process benchmarks are used to measure the basic process primitives, such as creating a new process, running a different program, and context switching. Process creation benchmarks are of particular interest in distributed systems since many remote operations include the creation of a remote process to shepherd the remote operation to completion. Context switching is important for the same reasons. .BU "Simple process creation" . The Unix process creation primitive is \*[fork], which creates a (virtually) exact copy of the calling process. Unlike VMS and some other operating systems, Unix starts any new process with a \*[fork]. Consequently, \*[fork] and/or \f(CWexecve\fP should be fast and ``light,'' facts that many have been ignoring for some time. .LP \*[lmbench] measures simple process creation by creating a process and immediately exiting the child process. The parent process waits for the child process to exit. The benchmark is intended to measure the overhead for creating a new thread of control, so it includes the \*[fork] and the \*[exit] time. .LP The benchmark also includes a \f(CWwait\fP system call in the parent and context switches from the parent to the child and back again. Given that context switches of this sort are on the order of 20 microseconds and a system call is on the order of 5 microseconds, and that the entire benchmark time is on the order of a millisecond or more, the extra overhead is insignificant. Note that even this relatively simple task is very expensive and is measured in milliseconds while most of the other operations we consider are measured in microseconds. .BU "New process creation" . The preceding benchmark did not create a new application; it created a copy of the old application. This benchmark measures the cost of creating a new process and changing that process into a new application, which forms the basis of every Unix command line interface, or shell. \*[lmbench] measures this facility by forking a new child and having that child execute a new program \(em in this case, a tiny program that prints ``hello world'' and exits. .LP The startup cost is especially noticeable on (some) systems that have shared libraries. Shared libraries can introduce a substantial (tens of milliseconds) startup cost. .\" XXX - statically linked example? .TSTART .so lat_allproc.tbl .TEND "Process creation time (milliseconds)" .BU "Complicated new process creation" . When programs start other programs, they frequently use one of three standard interfaces: \*[popen], \*[system], and/or \*[execlp]. The first two interfaces start a new process by invoking the standard command interpreter, \f(CW/bin/sh\fP, to start the process.
Starting programs this way guarantees that the shell will look for the requested application in all of the places that the user would look \(em in other words, the shell uses the user's $PATH variable as a list of places to find the application. \*[execlp] is a C library routine which also looks for the program using the user's $PATH variable. .LP Since this is a common way of starting applications, we felt it was useful to show the costs of the generality. .LP We measure this by starting \f(CW/bin/sh\fP to start the same tiny program we ran in the last case. In Table \n[TABLE] the cost of asking the shell to go look for the program is quite large, frequently ten times as expensive as just creating a new process, and four times as expensive as explicitly naming the location of the new program. .LP The results that stand out in Table \n[TABLE] are the poor Sun Ultra 1 results. Given that the processor is one of the fastest, the problem is likely to be software. There is room for substantial improvement in the Solaris process creation code. .NH 2 Context switching .LP Context switch time is defined here as the time needed to save the state of one process and restore the state of another process. .LP Context switches are frequently in the critical performance path of distributed applications. For example, the multiprocessor versions of the IRIX operating system use processes to move data through the networking stack. This means that the processing time for each new packet arriving at an idle system includes the time needed to switch in the networking process. .LP Typical context switch benchmarks measure just the minimal context switch time \(em the time to switch between two processes that are doing nothing but context switching. We feel that this is misleading because there are frequently more than two active processes, and they usually have a larger working set (cache footprint) than the benchmark processes. .LP Other benchmarks frequently include the cost of the system calls needed to force the context switches. For example, Ousterhout's context switch benchmark measures context switch time plus a \*[read] and a \*[write] on a pipe. In many of the systems measured by \*[lmbench], the pipe overhead varies between 30% and 300% of the context switch time, so we were careful to factor out the pipe overhead. .BU "Number of processes." The context switch benchmark is implemented as a ring of two to twenty processes that are connected with Unix pipes. A token is passed from process to process, forcing context switches. The benchmark measures the time needed to pass the token two thousand times from process to process. Each transfer of the token has two costs: the context switch, and the overhead of passing the token. In order to calculate just the context switching time, the benchmark first measures the cost of passing the token through a ring of pipes in a single process. This overhead time is defined as the cost of passing the token and is not included in the reported context switch time. .BU "Size of processes." In order to measure more realistic context switch times, we add an artificial variable size ``cache footprint'' to the switching processes. The cost of the context switch then includes the cost of restoring user-level state (cache footprint). The cache footprint is implemented by having the process allocate an array of data\** .FS All arrays are at the same virtual address in all processes. 
.FE and sum the array as a series of integers after receiving the token but before passing the token to the next process. Since most systems will cache data across context switches, the working set for the benchmark is slightly larger than the number of processes times the array size. .LP It is worthwhile to point out that the overhead mentioned above also includes the cost of accessing the data, in the same way as the actual benchmark. However, because the overhead is measured in a single process, the cost is typically the cost with ``hot'' caches. In Figure 2, each size is plotted as a line, with context switch times on the Y axis, number of processes on the X axis, and the process size as the data set. The process size and the hot cache overhead costs for the pipe read/writes and any data access are what is labeled as \f(CWsize=0KB overhead=10\fP. The size is in kilobytes and the overhead is in microseconds. .LP The context switch time does not include anything other than the context switch, provided that all the benchmark processes fit in the cache. If the total size of all of the benchmark processes is larger than the cache size, the cost of each context switch will include cache misses. We are trying to show realistic context switch times as a function of both size and number of processes. .TSTART 1 .so ctx.pic .FEND "Context switch times" 1 .LP Results for an Intel Pentium Pro system running Linux at 167 MHz are shown in Figure \n[FIGURE]. The data points on the figure are labeled with the working set due to the sum of data in all of the processes. The actual working set is larger, as it includes the process and kernel overhead as well. One would expect the context switch times to stay constant until the working set is approximately the size of the second level cache. The Intel system has a 256K second level cache, and the context switch times stay almost constant until about 256K (marked as .25M in the graph). .BU "Cache issues." The context switch benchmark is a deliberate measurement of the effectiveness of the caches across process context switches. If the cache does not include the process identifier (PID, also sometimes called an address space identifier) as part of the address, then the cache must be flushed on every context switch. If the cache does not map the same virtual addresses from different processes to different cache lines, then the cache will appear to be flushed on every context switch. .LP If the caches do not cache across context switches, there would be no grouping at the lower left corner of Figure \n[FIGURE]; instead, the graph would appear as a series of straight, horizontal, parallel lines. The number of processes would not matter: the two-process case would be just as bad as the twenty-process case, since the cache would not be useful across context switches. .TSTART .so ctx.tbl .TEND "Context switch time (microseconds)" .LP We picked four points on the graph and extracted those values for Table \n[TABLE]. The complete set of values, as well as tools to graph them, are included with \*[lmbench]. .LP Note that multiprocessor context switch times are frequently more expensive than uniprocessor context switch times. This is because multiprocessor operating systems tend to have very complicated scheduling code. We believe that multiprocessor context switch times can be, and should be, within 10% of the uniprocessor times. .LP Linux does quite well on context switching, especially on the more recent architectures.
By comparing the Linux 2 0K processes to the Linux 2 32K processes, it is apparent that there is something wrong with the Linux/i586 case. If we look back to Table \n[MEMTABLE], we can find at least part of the cause. The second level cache latency for the i586 is substantially worse than either the i686 or the Alpha. .LP Given the poor second level cache behavior of the PowerPC, it is surprising that it does so well on context switches, especially the larger sized cases. .LP The Sun Ultra1 context switches quite well in part because of enhancements to the register window handling in SPARC V9. .NH 2 Interprocess communication latencies .LP Interprocess communication latency is important because many operations are control messages to another process (frequently on another system). The time to tell the remote process to do something is pure overhead and is frequently in the critical path of important functions such as distributed applications (e.g., databases, network servers). .LP The interprocess communication latency benchmarks typically have the following form: pass a small message (a byte or so) back and forth between two processes. The reported results are always the microseconds needed to do one round trip. For one way timing, about half the round trip is right. However, the CPU cycles tend to be somewhat asymmetric for one trip: receiving is typically more expensive than sending. .BU "Pipe latency" . Unix pipes are an interprocess communication mechanism implemented as a one-way byte stream. Each end of the stream has an associated file descriptor; one is the write descriptor and the other the read descriptor. .LP Pipes are frequently used as a local IPC mechanism. Because of the simplicity of pipes, they are frequently the fastest portable communication mechanism. .LP Pipe latency is measured by creating a pair of pipes, forking a child process, and passing a word back and forth. This benchmark is identical to the two-process, zero-sized context switch benchmark, except that it includes both the context switching time and the pipe overhead in the results. .nr NTABLE \n[TABLE]+1 .nr LTABLE \n[TABLE] Table \n[NTABLE] shows the round trip latency from process A to process B and back to process A. .TSTART .so lat_pipe.tbl .TEND "Pipe latency (microseconds)" .LP The time can be broken down to two context switches plus four system calls plus the pipe overhead. The context switch component is two of the small processes in Table \n[LTABLE]. This benchmark is identical to the context switch benchmark in .RN Ousterhout90 . .BU "TCP and RPC/TCP latency" . TCP sockets may be viewed as an interprocess communication mechanism similar to pipes with the added feature that TCP sockets work across machine boundaries. .LP TCP and RPC/TCP connections are frequently used in low-bandwidth, latency-sensitive applications. The default Oracle distributed lock manager uses TCP sockets, and the locks per second available from this service are accurately modeled by the TCP latency test. .TSTART .so lat_tcp.tbl .TEND "TCP latency (microseconds)" .LP Sun's RPC is layered either over TCP or over UDP. The RPC layer is responsible for managing connections (the port mapper), managing different byte orders and word sizes (XDR), and implementing a remote procedure call abstraction. Table \n[TABLE] shows the same benchmark with and without the RPC layer to show the cost of the RPC implementation. 
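.LP
The pipe, TCP, and UDP latency benchmarks in this section all boil down to the same hot-potato exchange.
Stripped of the timing harness and of the transport-specific setup, the core loop might look like the following sketch (an illustration only, not \*[lmbench] source), where the two descriptors could be the ends of a pipe pair or a connected socket:
.DS
#include <unistd.h>

/*
 * One process runs this as the client and its peer as the server;
 * each round trip costs one write and one read on each side.  The
 * caller times the loop and divides by the number of round trips.
 */
void
hot_potato(int rd, int wr, long round_trips, int is_client)
{
	char	token = 0;

	while (round_trips-- > 0) {
		if (is_client) {
			write(wr, &token, 1);
			read(rd, &token, 1);
		} else {
			read(rd, &token, 1);
			write(wr, &token, 1);
		}
	}
}
.DE
.LP
For the pipe benchmark the descriptors come from a pair of \*[pipe] calls made before the \*[fork]; for the TCP and UDP benchmarks they are the connected sockets described below.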
.LP TCP latency is measured by having a server process that waits for connections and a client process that connects to the server. The two processes then exchange a word between them in a loop. The latency reported is one round-trip time. The measurements in Table \n[TABLE] are local or loopback measurements, since our intent is to show the overhead of the software. The same benchmark may be, and frequently is, used to measure host-to-host latency. .LP Note that the RPC layer frequently adds hundreds of microseconds of additional latency. The problem is not the external data representation (XDR) layer \(em the data being passed back and forth is a byte, so there is no XDR to be done. There is no justification for the extra cost; it is simply an expensive implementation. DCE RPC is worse. .TSTART .so lat_udp.tbl .TEND "UDP latency (microseconds)" .BU "UDP and RPC/UDP latency" . UDP sockets are an alternative to TCP sockets. They differ in that UDP sockets provide unreliable message delivery, leaving the retransmission issues to the application. UDP sockets have a few advantages, however. They preserve message boundaries, whereas TCP does not; and a single UDP socket may send messages to any number of other sockets, whereas TCP sends data to only one place. .LP UDP and RPC/UDP messages are commonly used in many client/server applications. NFS is probably the most widely used RPC/UDP application in the world. .LP Like TCP latency, UDP latency is measured by having a server process that waits for connections and a client process that connects to the server. The two processes then exchange a word between them in a loop. The latency reported is round-trip time. The measurements in Table \n[TABLE] are local or loopback measurements, since our intent is to show the overhead of the software. Again, note that the RPC library can add hundreds of microseconds of extra latency. .\" .LP .\" It is interesting to compare UDP latency with TCP latency. In many cases the .\" TCP latency is \fBless\fP than the UDP latency. This flies in the face .\" of conventional wisdom, which says that TCP is an inherently more expensive .\" protocol than UDP. The reasons that TCP may appear faster are: in this .\" benchmark, the protocol costs are dwarfed by the other costs (context .\" switching, system calls, and driver overhead); and TCP is frequently .\" hand-tuned for performance, while UDP is rarely hand-tuned. .TSTART .so lat_ipc.tbl .TEND "Remote latencies (microseconds)" .BU "Network latency" . We have a few results for over the wire latency included in Table \n[TABLE]. As might be expected, the most heavily used network interfaces (i.e., ethernet) have the lowest latencies. The times shown include the time on the wire, which is about 130 microseconds for 10Mbit ethernet, 13 microseconds for 100Mbit ethernet and FDDI, and less than 10 microseconds for Hippi. .BU "TCP connection latency" . TCP is a connection-based, reliable, byte-stream-oriented protocol. As part of this reliability, a connection must be established before any data can be transferred. The connection is accomplished by a ``three-way handshake,'' an exchange of packets when the client attempts to connect to the server. .LP Unlike UDP, where no connection is established, TCP sends packets at startup time. If an application creates a TCP connection to send one message, then the startup time can be a substantial fraction of the total connection and transfer costs. The benchmark shows that the connection cost is approximately half of that total cost.
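.LP
Stripped to its essentials, the connection-cost measurement described next is a loop that repeatedly creates a socket, connects it to an already-listening server, and closes it again.
The sketch below is illustrative only; the server setup, the port mapper lookup, the error handling, and the timing itself are omitted:
.DS
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

void
connect_cost(struct sockaddr_in* server, int count)
{
	while (count-- > 0) {
		/* a fresh socket is needed for every connection attempt */
		int	sock = socket(AF_INET, SOCK_STREAM, 0);

		connect(sock, (struct sockaddr*)server, sizeof(*server));
		close(sock);
	}
}
.DE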
.LP Connection cost is measured by having a server, registered using the port mapper, waiting for connections. The client figures out where the server is registered and then repeatedly times a \*[connect] system call to the server. The socket is closed after each connect. Twenty connects are completed and the fastest of them is used as the result. The time measured will include two of the three packets that make up the three way TCP handshake, so the cost is actually greater than the times listed. .\" XXX Larry --- if a machine's clock granularity is on the order of .\" 10 milliseconds, won't this benchmark run into granularity problems? .TSTART .so lat_connect.tbl .TEND "TCP connect latency (microseconds)" .LP Table \n[TABLE] shows that if the need is to send a quick message to another process, given that most packets get through, a UDP message will cost a \f(CWsend\fP and a \f(CWreply\fP (if positive acknowledgments are needed, which they are in order to have an apples-to-apples comparison with TCP). If the transmission medium is 10Mbit Ethernet, the time on the wire will be approximately 65 microseconds each way, or 130 microseconds total. To do the same thing with a short-lived TCP connection would cost 896 microseconds of wire time alone. .LP The comparison is not meant to disparage TCP; TCP is a useful protocol. Nor is the point to suggest that all messages should be UDP. In many cases, the difference between 130 microseconds and 900 microseconds is insignificant compared with other aspects of application performance. However, if the application is very latency sensitive and the transmission medium is slow (such as serial link or a message through many routers), then a UDP message may prove cheaper. .NH 2 File system latency .LP File system latency is defined as the time required to create or delete a zero length file. We define it this way because in many file systems, such as the BSD fast file system, the directory operations are done synchronously in order to maintain on-disk integrity. Since the file data is typically cached and sent to disk at some later date, the file creation and deletion become the bottleneck seen by an application. This bottleneck is substantial: to do a synchronous update to a disk is a matter of tens of milliseconds. In many cases, this bottleneck is much more of a perceived performance issue than processor speed. .LP The benchmark creates 1,000 zero-sized files and then deletes them. All the files are created in one directory and their names are short, such as "a", "b", "c", ... "aa", "ab", .... .TSTART .so lat_fs.tbl .TEND "File system latency (microseconds)" .LP The create and delete latencies are shown in Table \n[TABLE]. Notice that Linux does extremely well here, 2 to 3 orders of magnitude faster than the slowest systems. However, Linux does not guarantee anything about the disk integrity; the directory operations are done in memory. Other fast systems, such as SGI's XFS, use a log to guarantee the file system integrity. The slower systems, all those with ~10 millisecond file latencies, are using synchronous writes to guarantee the file system integrity. Unless Unixware has modified UFS substantially, they must be running in an unsafe mode since the FreeBSD UFS is much slower and both file systems are basically the 4BSD fast file system. .NH 2 Disk latency .\" XXX - either get more results for this benchmark or delete it. .\" I'd really like to not delete it - lmdd is probably the most .\" useful tool and it gets the least press. 
.LP Included with \*[lmbench] is a small benchmarking program useful for measuring disk and file I/O. \*[lmdd], which is patterned after the Unix utility \f(CWdd\fP, measures both sequential and random I/O, optionally generates patterns on output and checks them on input, supports flushing the data from the buffer cache on systems that support \f(CWmsync\fP, and has a very flexible user interface. Many I/O benchmarks can be trivially replaced with a \f(CWperl\fP script wrapped around \*[lmdd]. .LP While we could have generated both sequential and random I/O results as part of this paper, we did not because those benchmarks are heavily influenced by the performance of the disk drives used in the test. We intentionally measure only the system overhead of a SCSI command since that overhead may become a bottleneck in large database configurations. .LP Some important applications, such as transaction processing, are limited by random disk IO latency. Administrators can increase the number of disk operations per second by buying more disks, until the processor overhead becomes the bottleneck. The \*[lmdd] benchmark measures the processor overhead associated with each disk operation, and it can provide an upper bound on the number of disk operations the processor can support. It is designed for SCSI disks, and it assumes that most disks have 32-128K read-ahead buffers and that they can read ahead faster than the processor can request the chunks of data.\** .FS This may not always be true: a processor could be fast enough to make the requests faster than the rotating disk. If we take 6M/second to be disk speed, and divide that by 512 (the minimum transfer size), that is 12,288 IOs/second, or 81 microseconds/IO. We don't know of any processor/OS/IO controller combinations that can do an IO in 81 microseconds. .FE .LP The benchmark simulates a large number of disks by reading 512byte transfers sequentially from the raw disk device (raw disks are unbuffered and are not read ahead by Unix). Since the disk can read ahead faster than the system can request data, the benchmark is doing small transfers of data from the disk's track buffer. Another way to look at this is that the benchmark is doing memory-to-memory transfers across a SCSI channel. It is possible to generate loads of more than 1,000 SCSI operations/second on a single SCSI disk. For comparison, disks under database load typically run at 20-80 operations per second. .TSTART .so lat_disk.tbl .TEND "SCSI I/O overhead (microseconds)" .LP The resulting overhead number represents a \fBlower\fP bound on the overhead of a disk I/O. The real overhead numbers will be higher on SCSI systems because most SCSI controllers will not disconnect if the request can be satisfied immediately. During the benchmark, the processor simply sends the request and transfers the data, while during normal operation, the processor will send the request, disconnect, get interrupted, reconnect, and transfer the data. .LP This technique can be used to discover how many drives a system can support before the system becomes CPU-limited because it can produce the overhead load of a fully configured system with just a few disks. .NH 2 Parallelism .LP description of parallelism benchmarks with sample results. .NH 2 Other benchmarks .LP description of other benchmarks with sample results. 
.NH 1 Scaling Benchmarks .LP There are a number of issues associated with converting single-process benchmarks to scalable benchmarks with several independent processes, in addition to the various issues addressed by the timing harness. Many of the benchmarks consume or utilize system resources, such as memory or network bandwidth, and a careful assessment of the likely resource contention issues is necessary to ensure that the benchmarks measure important aspects of system performance and not artifacts of artificial resource contention. .LP For example, the Linux 2.2 kernel uses a single lock to control access to the kernel data structures for a file. This means that multiple processes accessing that file will have their operations serialized by that lock. .NH 2 File System .LP A number of the benchmarks, such as \*[bw_file_rd], \*[bw_mmap_rd], \*[lat_mmap], and \*[lat_pagefault], measure aspects of file system performance. It is not immediately apparent how these benchmarks should be extended to the parallel domain. For example, it may be important to know how file system performance scales when multiple processes are reading the same file, or when multiple processes are reading different files. The first case might be important for large, distributed scientific calculations, while the second might be more important for a web server. .LP However, for the operating system, the two cases are significantly different. When multiple processes access the same file, access to the kernel data structures for that file must be coordinated, and so contention and locking of those structures can impact performance, while this is less true when multiple processes access different files. .LP In addition, there are any number of issues associated with ensuring that the benchmarks are either measuring operating system overhead (e.g., that no I/O is actually done to disk), or actually measuring the system's I/O performance (e.g., that the data cannot be resident in the buffer cache). Especially with file system related benchmarks, it is very easy to develop benchmarks that compare apples and oranges (e.g., the benchmark includes the time to flush data to disk on one system, but only includes the time to flush a portion of data to disk on another system). .LP \*[lmbench3] allows the user to measure either case as controlled by a command-line switch. When measuring accesses to independent files, the benchmarks first create their own private copies of the file, one for each child process. Then each process accesses its private file. When measuring accesses to a single file, each child simply uses the designated file directly. .NH 2 Context Switching .LP Measuring context switching accurately is a difficult task. \*[lmbench1] and \*[lmbench2] measured context switch times via a "hot-potato" approach using pipes connected in a ring. However, this experimental design heavily favors schedulers that do "hand-off" scheduling, since at most one process is active at a time. Consequently, it is not really a good benchmark for measuring scheduler overhead in multi-processor machines. .LP The design and methodology for measuring context switching and scheduler overhead need to be revisited so that they can more accurately measure performance for multi-processor machines. .NH 1 New Benchmarks .LP \*[lmbench3] also includes a number of new benchmarks.
.NH 2 Stream .LP \*[lmbench3] includes a new micro-benchmark which measures the performance of John McCalpin's \*[stream] benchmark kernels for \*[stream] versions 1 and 2. This benchmark faithfully recreates each of the kernel operations from both \*[stream] benchmarks, and because of the powerful new timing harness it can easily measure memory system scalability. .TSTART 1 .TS center box tab (|); c s s s s c | c | c s | c l | l | l | l | l . Stream _ Kernel|Code|Bytes|FL ||rd|wr|OPS _ COPY|$a[i]=b[i]$|8(+8)|8|0 SCALE|$a[i]=q times b[i]$|8(+8)|8|1 ADD|$a[i]=b[i]+c[i]$|16(+8)|8|1 TRIAD|$a[i]=b[i]+q times c[i]$|16(+8)|8|2 .TE .TS center box tab (|); c s s s s c | c | c s | c l | l | l | l | l . Stream2 _ Kernel|Code|Bytes|FL ||rd|wr|OPS _ FILL|$a[i]=q$|0(+8)|8|0 COPY|$a[i]=b[i]$|8(+8)|8|0 DAXPY|$a[i]=a[i]+q times b[i]$|16|8|2 SUM|$sum=sum + a[i]$|8|0|1 .TE .TEND "Stream operations" .LP Table \n[TABLE] shows the four kernels for each version of the \*[stream] benchmark. Note that the .I read columns include numbers in parenthesis, which represent the average number of bytes read into the cache as a result of the write to that variable. Cache lines are almost invariably bigger than a single double, and so when a write miss occurs the cache will read the line from memory and then modify the selected bytes. Sometimes vector instructions such as SSE and 3DNow can avoid this load by writing an entire cache line at once. .NH 2 Basic operation latency .LP \*[lmbench3] includes a new micro-benchmark which measures the latency for a variety of basic operations, such as addition, multiplication, and division of integer, float, and double operands. To measure the basic operation latency we construct a basic arithmetic statement containing the desired operands and operations. This statement is repeated one hundred times and these repetitions are then embedded in a loop. .TSTART .TS center box tab (&); c c c l & l & l . Operand&Operation&Statement _ int&$bit$&r^=i;s^=r;r|=s; &$add$&a+=b;b-=a; &$mul$&r=(r*i)^r; &$div$&r=(r/i)^r; &$mod$&r=(r%i)^r; _ float&$add$&f+=f; &$mul$&f*=f; &$div$&f=g/f; _ double&$add$&f+=f; &$mul$&f*=f; &$div$&f=g/f; .TE .TEND "lat_ops statements" .LP Table \n[TABLE] shows the data type and expressions used for each basic operation type. The variable $i$ indicates the integer loop variable and generally changes every ten or hundred evaluations of the basic expression. All other variables are of the basic type being measured, and aside from being modified by the relevant expressions are only initialized once at the beginning of the benchmark routine. .LP Each statement has been designed to ensure that the statement instances are \fIinterlocked\fR, namely that the processor cannot begin processing the next instance of the statement until it has completed processing the previous instance. This property is crucial to the correct measurement of operation latency. .LP One important consideration in the design of the statements was that they not be optimized out of the loop by intelligent compilers. Since the statements are repeated one hundred times, the compiler has the option of evaluating the sequence of one hundred repetitions of the same statement, and sometimes it can find optimizations that are not immediately apparent. 
For example, the integer statement $a=a+a;$ when repeated one hundred times in a loop can be replaced with the single statement $a=0;$ because the statement $a=a+a;$ is equivalent to $a< < =1;$, and one hundred repetitions of that statement is equivalent to $a< < =100;$, which for 32bit (or even 64bit) integers is equivalent to $a=0;$. .LP It is relatively easy to identify floating point statements that interlock, are not optimized away, and that only use the operation of interest. It is much harder to identify integer statements meeting the same criteria. All simple integer statements can either be optimized away, fail to interlock, or use operations other than the one of interest. We chose to add operations other than the operation(s) of interest to the statements. .LP The integer $mul$, $div$, and $mod$ statements all include an added $xor$ operation which prevents (current) compilers from optimizing the statements away. Since the $xor$ operation is generally completed in a single clock tick, and since we can measure the $xor$ operation latency separately and subtract that overhead, we can still measure the latencies of the other operations of interest. .LP It is not possible to measure latency for 64bit operations on 32bit machines because most implementations allow operations on the upper and lower words to overlap. This means that on most 32bit machines, the measured latency would appear to be a non-integral multiple of the basic clock cycle. For example, in the $add$ statement, the system could first add the two lower words. Then, in parallel it could both add the two upper words (along with the carry from the lower words), and compute the $xor$ of the lower word. Finally, it could overlap the $xor$ of the upper word with the addition of the two lower words from the next instantiation of the statement. .TSTART .TS center box tab (&); c c c c c c c c c c l & l & r & r & r . Operand&Op&HPPA2.0&PIII&AMD &&400MHz&667MHz&1.3GHz _ mhz&&2.50&1.50&0.75 int&$bit$&2.53&1.50&0.75 &$add$&2.50&1.51&0.75 &$mul$&14.52&6.07&3.03 &$div$&109.40&58.52&30.86 &$mod$&75.14&65.01&32.59 _ float&$add$&7.54&4.58&3.0 &$mul$&7.50&7.50&3.0 &$div$&45.00&35.26&13.21 _ double&$add$&7.52&4.53&3.01 &$mul$&7.52&7.71&3.01 &$div$&85.01&35.51&13.16 .TE .TEND "lat_ops results (ns)" .LP Table \n[TABLE] contains some sample results for three processors. It does contain one result which is slightly surprising unless you are familiar with the PA-RISC architecture: floating point multiply and divide are faster than the corresponding integer operations! This is because PA-RISC does not contain integer MUL, DIV, or MOD instructions and the optimizing compiler converts the integers into floating point, does the operations in the floating point unit, and then converts the result back to an integer. .NH 2 Basic operation parallelism .LP Instruction-level parallelism in commodity processors has become commonplace in the last ten years. Modern processors typically have more than one operational unit that can be active during a given clock cycle, such as an integer arithmetic unit and a floating point unit. In addition, processors may have more than a single instance of a given type of operational unit, all of which may be active at the same time. All this intra-processor parallelism is used to try to reduce the average number of clock cycles per executed instruction. .LP \*[lmbench3] incorporates a new benchmark \*[par_ops] which attempts to quantify the level of available instruction-level parallelism provided by the processor.
This benchmark is very similar to \*[lat_ops], and in fact uses the same statement kernels, but it has been modified and extended. We create different versions of each benchmark; each version has $N$ sets of interleaved statements. Each set is identical to the equivalent \*[lat_ops] statements. In this way multiple independent sets can be executing the same operation(s) in parallel, if the hardware supports it. .LP For example, the float $mul$ benchmark to measure performance with two parallel streams of statements would look something like this: .DS #define TEN(a) a a a a a a a a a a void benchmark_1(iter_t iterations, void* cookie) { register iter_t i = iterations; struct _state* state = (struct _state*)cookie; register float f0 = state->float_data[0]; register float f1 = state->float_data[1]; while (i-- > 0) { TEN(f0*=f0; f1*=f1;) } use_int((int)f0); use_int((int)f1); } .DE .LP If the processor had two floating point multiply units, then both $f0$ and $f1$ multiplies could proceed in parallel. .LP However, there are some potential problems with the integer operations, namely the fact that the statements contain mixed operations. In general, processors have at least as many integer units that can do $xor$ as units that can do the other operations of interest ($mul$, $div$ and $mod$), so the inclusion of $xor$ in the statements shouldn't be a bottleneck. .LP However, since parallelism is measured by comparing the latency of a single stream with that of multiple interleaved streams, and since the single-stream latency includes the $xor$ latency, the apparent parallelism of $mul$, $div$, and $mod$ can be overstated. For example, if a processor has one unit that can do integer bit operations, such as $xor$, and another unit for integer $mul$ operations, then the average latency for $a0 = (i * a0) ^ a0$ in the single stream case would be: .EQ t bar = t sub xor + t sub mul .EN In the multi-stream case, the execution of the $xor$ operation of one stream can be overlapped with the $mul$ of another stream, so the average latency per stream would simply be $t bar = t sub mul$, assuming that $mul$ operations are not cheaper than $xor$ operations, which results in an apparent parallelism $p tilde$: .EQ p tilde = {t sub xor + t sub mul} over { t sub mul } .EN Assuming that $t sub xor < < t sub mul$, this still gives a reasonable approximation to the correct answer. Unfortunately, this is not always a reasonable assumption. .LP Of course, if it were known ahead of time that $xor$ and { $mul$, $div$, and $mod$ } used different execution units, then the benchmark could simply subtract $t sub xor$ from the baseline measurement. The difficulty lies in determining whether the units overlap or not. .TSTART .TS center box tab (&); c c c c c c c c c c l & l & r & r & r . Operand&Op&HPPA2.0&PIII&AMD &&400MHz&667MHz&1.3GHz _ int&$bit$&1.99&1.70&1.87 &$add$&1.99&1.61&1.90 &$mul$&6.64&3.81&2.00 &$div$&2.81&1.20&1.00 &$mod$&2.78&1.11&1.03 _ float&$add$&5.88&1.00&2.66 &$mul$&5.86&1.14&2.47 &$div$&2.12&1.03&1.14 _ double&$add$&5.68&1.08&2.49 &$mul$&5.58&1.00&2.53 &$div$&2.19&1.03&1.14 .TE .TEND "par_ops results" .LP .NH 1 Results .LP Some sample results .LP bw_mem_rd performance vs. scaling on an SMP machine .LP .NH 1 Unscalable benchmarks .LP There are a number of benchmarks which either did not make sense for scalable load, such as \*[mhz], or which could not be extended to measure scalable load due to other constraints, such as \*[lat_connect].
.LP \*[mhz] measures the processor clock speed, which is not a scalable feature of the system, so it doesn't make any sense to create a version of it that measures scalable performance. .LP \*[lat_connect], on the other hand, measures the latency of connecting to a TCP socket. TCP implementations have a timeout on sockets and there is generally a fixed-size queue for sockets in the TIMEOUT state. This means that once the queue has been filled by a program connecting and closing sockets as fast as possible, then all new socket connections have to wait TIMEOUT seconds. Needless to say, this gives no insight into the latency of socket creation per se, but is rather a boring artifact. Since the \*[lmbench2] version of the benchmark can run for very short periods of time, it generally does not run into this problem and is able to correctly measure TCP connection latency. .LP Any scalable version of the benchmark needs each copy to run for at least a second, and there are $N$ copies creating connections as fast as possible, so it would essentially be guaranteed to run into the TIMEOUT problem. Consequently, \*[lat_connect] was not enhanced to measure scalable performance. .NH 1 A brief tutorial on memory design .LP Nearly all modern, general purpose computers use virtual memory with physically addressed caches. As such, there are typically one or more caches between the physical memory and the processor, and virtual-to-physical address translation occurs between the processor and the top-level cache. Cache staging and replacement are done in \fIcache line\fR units, which are typically several words in length, and caches lower in the hierarchy sometimes have cache lines which are larger than those in the higher caches. .LP Modern processors usually incorporate at least an L1 cache on-chip, and some are starting to also incorporate the L2 cache on-chip. In addition, most include a translation look-aside buffer (TLB) on-chip for fast virtual-to-physical address translation. .LP One key element of any cache design is its replacement strategy. Most caches are either direct-mapped or set-associative. In the first case any word in physical memory has exactly one cache line into which it may be staged, while set-associative caches allow a given word to be cached into one of a set of lines. Direct-mapped caches have a very simple replacement policy: the contents of the single line where the new data must go are discarded. Set-associative caches usually use LRU or some variant within each set, so the least recently used line in the set of possible cache lines is replaced. The control logic for direct-mapped caches is much cheaper to build, but they are generally only as effective as a set-associative cache half the size.\** .FS See .RN Hennessy96 page 396. .FE .LP Another key element of memory hierarchy design is the management of dirty data; at what point are writes passed down the memory hierarchy to lower caches and main memory? The two basic policies are write-through and write-back. A write-through policy means that writes are immediately passed through the cache to the next level in the hierarchy, so the lower levels are updated at the same time as the cache. A write-back policy means that the cache line is marked as dirty in the cache, and only when the line is ejected from the cache is the data passed down the hierarchy.
Write-through policies are often used in higher (smaller) caches because multi-processor systems need to keep a coherent view of memory and the writes are often propagated to other processors by \fIsnoopy\fR caches. .LP One often overlooked aspect of cache performance is cache behavior during writes. Most cache lines contain several words, and most instructions only update the line a word at a time. This means that when the processor writes a word to a cache line that is not present, the cache will read the line from memory before completing the write operation. For \*[bcopy]-like operations this means that the overall memory bandwidth requirement is actually two reads and one write per copied word, rather than the expected read and write. .LP Most modern processors now include some form of prefetch in the memory hierarchy. For the most part these are simple systems that can recognize fixed strided accesses through memory, such as might be seen in many array operations. However, prefetching systems appear to be growing in complexity and capability. .LP Additionally, modern memory subsystems can usually support multiple outstanding requests; the level of parallelism is usually dependent on the level of the hierarchy being accessed. Top-level caches can sometimes support as many as six or eight outstanding requests, while main memory can usually support two outstanding requests. Other elements of the memory hierarchy, such as the TLB, often have additional limits on the level of achievable parallelism in practice.\** .FS For example, if the TLB serializes all TLB misses, and if each memory access causes a TLB miss, then the memory accesses will be serialized even if the data was in a cache supporting six outstanding requests. .FE .LP For more information and details on memory subsystem design, and computer architecture in general, please see .RN Hennessy96 which has an excellent description of these and many other issues. .NH 1 Memory analysis .LP There are a variety of aspects of memory hierarchy design that are interesting to a software developer, such as the number of caches and their sizes. In addition, other aspects of cache design, such as the line size, associativity, and parallelism, can impact software performance and are of potential interest to software developers. .LP The problem is designing a portable ANSI-C program to infer the cache parameters. A number of operating systems have hooks to report at least certain aspects of cache and memory hierarchy design, but any program utilizing those hooks would not be fully portable across hardware and operating system platforms. .LP The key observation is that caches help reduce memory latency. In a perfect world, all possible data would fit in the cache, so a graph of average memory latency versus amount of memory utilized would look like a series of plateaus separated by cliffs. The cliff edges would be located at the cache boundaries and the plateau height would be the average memory latency. .LP The first problem is that one needs a mechanism for accurately measuring time in a portable fashion. \*[lmbench2] introduced a new timing harness that determines the minimum duration of a timing interval for \*[gettimeofday] to provide accurate measurements .RN Staelin98 . .LP \*[lmbench] includes a benchmark that measures average memory latency, \*[lat_mem_rd] .RN McVoy96 . It creates a pointer chain, and then measures the average time to dereference the pointers.
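.LP
The inner loop of such a pointer-chasing measurement might look something
like the following sketch; the unrolling macro and the \f(CWuse_pointer()\fP
helper are illustrative stand-ins for the harness code, not the exact source.
.DS
#define ONE     p = (char **)*p;
#define TEN     ONE ONE ONE ONE ONE ONE ONE ONE ONE ONE

register char **p = (char **)chain_start;       /* chain_start: head of the chain */

while (iterations-- > 0) {
        TEN     /* each load depends on the result of the previous one */
}
use_pointer((void *)p); /* defeat dead-code elimination */
.DE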
\*[lat_mem_rd] creates the pointer chain by simply striding through memory at fixed intervals, e.g. every other word. .LP \*[lmbench2] extended \*[lat_mem_rd] so that each timing interval only accessed memory as many times as necessary to consume a timing interval. When accessing cache this often means that the whole pointer chain will be accessed at least once during the timing interval, but when accessing memory this often means that only a portion of the chain will be accessed during any given timing interval. .LP While this approach gives very useful insights into memory hierarchy performance, it is not quite sufficient to determine the various characteristics of the memory hierarchy. .LP The first problem is that unless the stride is exactly the same size as the cache line size, there will either be multiple successive accesses to the same line, or some fraction of the data will be completely skipped. In the first case the observed latency is much lower than the true latency because it is the average of a single miss latency (slow) with one or more hit latencies (fast). In the second case, the amount of data actually loaded into the cache may be a small fraction of the expected amount so the data may fit into a smaller (faster) cache. The second problem is that this sequence is highly predictable, even by simple-minded prefetching policies, so accurate prefetching might be masking the true memory latencies. .LP This method does do a few things properly. First of all, accesses to a single page are clustered together so the TLB miss cost (if any) is amortized over as many accesses as possible. Secondly, assuming the pointer chain is laid out unpredictably, the memory subsystem must wait for the previous load to complete before it can initiate the next load, so we can measure the true latency. .NH 2 Prefetching .LP Some memory subsystems have been highly optimized to recognize and automatically prefetch memory when given "predictable" memory access streams, such as when striding through array accesses. This means that the memory access stream generated by \*[lmbench] must be unpredictable by the standard prediction algorithms. .LP The original \*[lmbench] memory latency benchmark, lat_mem_rd, built a chain of pointers that would stride backwards through memory. This was able to defeat many simple prefetching algorithms of the time, but some systems came to incorporate prefetching algorithms that recognized strided accesses in both directions. .LP The obvious method for producing an unpredictable chain of line references is to use a random permutation of line indexes. .LP \*[lmbench] uses a deterministic algorithm to compute the reference chain which guarantees that references are as far away from previous accesses in both time and space as possible. Basically, the binary bits representing the line index are reversed, so that 1101 becomes 1011, or 001 becomes 100. This only works if the number of cache lines is a power of two, but since page sizes and line sizes are always powers of two, this assumption is valid.\** .FS At least this is the case in every modern system known to the author. .FE .LP Additionally, since higher-level caches can have smaller line sizes than lower-level caches, it is necessary to access every word in the relevant chunk of memory. However, accesses to words in the same line must be separated in time by accesses to the rest of the memory.
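.LP
A stand-alone sketch of the bit-reversal computation described above (not the
benchmark's exact code):
.DS
/*
 * Reverse the low "bits" bits of a line index,
 * so with three bits 001 becomes 100.
 */
int
reverse_index(int idx, int bits)
{
        int i, r = 0;

        for (i = 0; i < bits; i++) {
                r = (r << 1) | (idx & 1);
                idx >>= 1;
        }
        return r;
}
.DE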
Separating accesses to words within the same line is achieved by identifying the line size for the largest cache, and then setting up the chain so that there is one pass through the memory for each word in the line, with the sequence of words being determined by the bit-reversal method described above. .LP For example, suppose a system has 4KB pages, the largest cache has a line size of 64bytes, and a word is 4bytes. Then each page would have 64 lines, and each line would have 16 words. The system would set up a pointer chain that visits each line on each page using the zeroth word; at the end of the chain it would then jump back to the start of the pages and visit each line on each page using the eighth word, and so forth until each word had been visited. .NH 2 Dirty data .LP An additional issue that we need to take into account is the cache's policy for dirty data. Many caches use a copy-back policy, while others use a write-through policy. .LP Different caches on the same machine may use different policies. Also, cache performance can be affected by the presence of dirty data. For example, suppose both the L1 and L2 caches use a copy-back policy, and suppose that the access time for reading data located in L2 depends on whether the data being ejected from L1 is dirty and needs to be copied back from L1 to L2 before the read from L2 to L1. In this case, a benchmark which writes a pointer chain that fits in L2 but is larger than L1, and then measures the time to follow the chain, will get a different average memory latency than a benchmark which writes the same chain and reads enough data to flush the L2 cache before measuring the time to follow the chain. In the first case, each application read will result in a write from L1 to L2 followed by a read from L2 to L1, while in the second case each application read will only result in a read from L2 to L1. .LP Since it is possible that average memory latencies for a read-only access stream may be increased if any of the data in the cache is dirty, we need to flush the cache after setting up the pointer chains and before we do any measurements. Otherwise, when we access a pointer chain that is larger than the L1 cache but smaller than the largest cache, dirty data can reside in the lowest (largest) cache, and as each line is staged from the largest cache to the L1 cache, it is marked as dirty in the L1 cache. Then when each dirty line is flushed from the L1 cache (to the L2 cache), the system has to write the data back to L2, which delays the load of the next (dirty) line from L2 to L1. .LP To flush the cache we read (and sum) a large amount of memory, which should be several times larger than the largest cache. In this way, all dirty data in the cache should be flushed from the cache without creating additional dirty data. .NH 2 Page mapping .LP Complicating the issue still further is the fact that caches do not use full LRU replacement policies. Nearly all caches use some form of set associativity, where pages are directed to a pool of cache lines based on the physical address. Replacement within the pool is typically LRU. Direct-mapped caches are a special case where the pool size is a single line. .LP Additionally, some systems use victim caches, which are typically small caches that hold recently discarded cache lines. Victim caches can be particularly effective for direct-mapped caches by reducing the cache miss rate caused by colliding hot spots.
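.LP
For reference, in a physically-indexed, set-associative cache the set to
which an address maps is determined by the physical address, roughly as in
the following sketch (a simplification assuming power-of-two sizes; the
variable names are illustrative):
.DS
nsets = cache_size / (line_size * associativity);
set   = (paddr / line_size) % nsets;    /* the physical address selects the set */
.DE
Which pages collide in the cache is therefore determined entirely by their
physical addresses.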
.LP However, page mapping and its attendant cache collisions are under the control of the kernel, and are in fact invisible to user-land programs. Some operating systems make an effort to minimize possible page collisions when giving memory to processes\**, while other operating systems appear to simply grab the first available pages, regardless of potential cache collision effects. .FS This is generally known as "page coloring", and is much more important on systems with direct-mapped caches than those with N-way set associative caches. .FE .LP Factoring out page placement effects on average memory latency is very difficult, but it is necessary to ensure that the correct cache size is identified. .NH 1 Cache line size .LP The first feature of the memory hierarchy we will try to analyze is the cache line size, since we can find the line size for the largest cache without any other knowledge of the system, and since determining nearly all other aspects of the memory subsystem either requires or is greatly simplified by knowing the cache line size. .LP The most obvious aspect of cache design is that replacement is done on a per-line basis, and cache lines often contain several words of data (32-128bytes per line is common). However, it is necessary to ensure that we don't generate "spurious" cache hits by referencing a word from a cache line that was recently accessed. We must ensure that each line is only re-referenced after all other memory in the buffer has been referenced. .LP Unfortunately, we usually do not know the cache line size ahead of time. In addition, sometimes systems contain several caches, and each cache can use a different line size! Usually line sizes are powers of two, and usually the smaller (higher) caches have line sizes which are the same or smaller than those of the larger (lower) caches. However, we still need to ensure that we access all cache lines for all caches without generating spurious cache hits. .LP Determining the cache line size requires a series of experiments. The basic observation is that when the amount of memory being accessed is larger than the cache, and when the access chain is arranged properly, then each memory reference causes a cache miss. If, however, a word on a recently accessed line is requested, then that reference will be a cache hit. More completely, the average memory access time $t bar$ is: .EQ t bar = { t sub miss + ( n - 1 ) t sub hit } over n .EN expressed as a function of $n$, the number of accesses to the cache line, $t sub miss$, the cache miss latency, and $t sub hit$, the cache hit latency. .TSTART .G1 .so memhier-line.d .G2 .FEND "Line Size" .LP We can determine the cache line size by measuring the average memory access latency over a series of memory access patterns: accessing every word, every other word, every fourth word, every eighth word, ... While the system is accessing multiple words per cache line, the average memory latency will be smaller than the cache miss latency, and as the space between accesses increases, the average memory latency will grow. When the system accesses only one word per line, the average memory latency will remain level even as the spacing between accesses increases. .LP It is possible to utilize this behavior to identify the cache line size. The algorithm is to measure the average memory latency when each word is accessed.
Then, as you increase the space between accessed words (doubling the space each iteration), you look for a situation where the average latency increases dramatically, say by more than 30%, followed by a levelling off on the next iteration, say an increase of less than 15%. The line size is the last point where the average latency jumped dramatically. .NH 1 TLB .LP Measuring the TLB-miss costs assumes that one can isolate those costs from the rest of the memory access costs. The key observation is that it is often possible to create a situation in which all data being accessed resides in the cache, and yet a TLB miss is required to locate it. .LP This program identifies the effective TLB size, rather than the true TLB size. First of all, from a programmer's point of view, it is really the effective TLB size that impacts program performance. Secondly, there is no way for a user-land program to measure true TLB size because kernels sometimes pin some kernel page mappings into the TLB and because some hardware/OS combinations support "super-pages", or multi-page mappings. .LP We create two similar pointer chains of identical length which reference an identical amount of memory, with one key difference. In the first chain, the data is packed tightly into as few pages as possible, and references remain within a single page as long as possible. The second chain spreads the data over as many pages as possible and jumps between pages at each reference. The two chains are arranged so that the same amount of data will fit into the cache, so that the raw memory access time for each chain is identical, within experimental error. The sole difference between average access costs should be the TLB-lookup times. .LP When the pages from the second chain fit into the TLB, the average access times for the two chains should be identical. However, as soon as the number of pages in the second chain exceeds the TLB size, the second chain will start to pay TLB-miss costs. Depending on the TLB replacement policy, the fraction of requests generating TLB-misses in the second chain can vary dramatically\**. .FS Pure LRU would ensure that as soon as the chain was one page longer than the TLB size, every access would trigger a TLB-miss. However, other replacement algorithms might result in as few as $"number of pages" - "TLB size" + 1$ misses per iteration over the loop. .FE .TSTART .G1 .so memhier-tlb.d .G2 .FEND "TLB" .LP The system must search for the point at which the average memory latency of the second chain diverges from the average latency of the first chain. Since most systems have relatively small TLBs, and since checking TLB sizes smaller than the effective TLB size is faster than checking TLB sizes larger than the TLB, the system starts with a guess of eight pages to establish a baseline. It then iteratively doubles the number of pages until either a maximum limit has been reached or the average TLB-miss cost is greater than 15% of the average memory latency. Once it discovers the upper bound on the possible TLB size, it uses a binary search between the last two TLB size guesses to find the point at which the average latencies of the two streams diverge.
As soon as the memory doesn't fit in the cache, none of it should be in the cache, so the average memory latency is the cache miss latency.\** When examining average memory latency versus memory size, this would give nice flat plateaus for each cache, with nice sharp transitions from one cache to the next, and from the largest cache to main memory. .FS Of course, for real programs, you want the average memory latency to be as low as possible, which means that you want as much of the data in cache as possible. .FE .LP However, the reality is that real data from real systems is corrupted in a variety of ways. First of all, even when the memory can fit into the cache, pages often collide in the cache and the fraction of pages that have collisions often increases as the amount of memory nears the cache size. Secondly, even when the memory cannot fit into the cache, there can be pages that do not collide. Finally, there is simple experimental noise, which is usually limited to 1% or less. .LP The result of the first two problems is that on some systems, the average memory latency increases gradually as the memory size is increased. There are no flat plateaus and sharp cliffs which would make it easy to identify the number, size, and performance of the caches. .NH 2 Page coloring .LP The first problem is to create a set of pages which do not collide in the cache. The solution is to allocate more memory than necessary, and to try different combinations of pages to find the page set with the fastest average memory latency. Unfortunately, the obvious algorithm is exponential in the number of pages. .TSTART .G1 .so memhier-color.d .G2 .FEND "Page Coloring Effects" .LP One observation is that cache misses are usually much more expensive than cache hits. So, one possibility is to choose a random set of pages as the baseline and measure the average memory latency. Then iterate over the pages, removing each page in turn from the set and measuring the average memory latency of the reduced set. If the removed page collides with another page, then the average memory latency for the reduced set should be smaller than the average latency for the whole set. .LP Once a page that collides has been identified, the system can iterate through the available pages, adding each one to the reduced set and measuring the average memory latency. If the added page doesn't collide with any pages in the reduced set, then the average memory latency should drop still further. In this way, the system could identify all colliding pages and replace them with pages that don't collide (assuming the memory all fits in the cache). .LP There are a number of problems with this simple approach. First of all, it would take a very long time to run due to the large, but polynomial, number of experiments required. Secondly, as the memory size increases and the number of pages involved gets large, the effect of a single page on the average memory latency can shrink to the level of experimental noise. .LP This approach makes the assumption that physical page locations do not change once the memory has been allocated. In most systems, this assumption is valid unless the memory is paged to disk. However, at least IRIX includes an operating system configuration option to allow the operating system to dynamically relocate pages in memory. This capability is disabled by default, so its use is relatively uncommon. It is possible that page relocation will become more common in the future, in which case this design may need to be revisited.
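.LP
In outline, this basic search might look something like the following sketch,
where \f(CWchain_latency()\fP and the page arrays are hypothetical helpers
standing in for the real measurement code:
.DS
/* pages[0..npages-1] is the candidate set; spare[] holds extra pages. */
int i, j;
double base = chain_latency(pages, npages);

for (i = 0; i < npages; i++) {
        if (chain_latency_excluding(pages, npages, i) < base) {
                /* page i collides: try spare pages until one does not */
                for (j = 0; j < nspare; j++) {
                        page_t* old = pages[i];
                        pages[i] = spare[j];
                        if (chain_latency(pages, npages) < base) {
                                base = chain_latency(pages, npages);
                                break;
                        }
                        pages[i] = old; /* spare[j] also collides; undo */
                }
        }
}
.DE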
.LP Our algorithm uses this basic approach, but attempts to reduce the number of experiments required by removing chunks of pages at a time. It will remove up to 5% of the pages at a time and see if the average memory latency decreases significantly, in which case it examines the chunk a page at a time to find the page or pages which probably conflict. .LP An additional problem is that for large caches, the difference in average latency between two sets of pages that differ by just one page collision can be very hard to measure. For example, on a system with a 512Kbyte L2 cache and 4Kbyte pages, the cache can hold 128 pages. Assuming that a cache miss is 200ns, a cache hit is 50ns, and 123 pages have no collisions but 5 pages collide, then the average memory latency is .EQ t bar = { 123 times 50 + 5 times 200 } over 128 .EN or 55.85ns. Suppose we remove one page and replace it with another page which doesn't collide, so we now have 4 collisions and 124 pages without collisions; then the average memory latency is 54.68ns. The difference is generally significant even in the face of experimental noise, but for larger caches the differences may recede into the background noise. .LP As caches increase in size, the problems associated with detecting page collisions can only increase. For example, an 8MB cache on a system with 4KB pages would contain 2,048 pages. Removing a single page collision, even when the resulting memory latency for that page reduces by a factor of four, would simply result in an overall reduction in average memory latency of less than 0.2%, which is smaller than the average experimental measurement errors. .LP Additionally, as caches increase in size, effects such as cache consumption by the page table can begin to become important. .LP The single largest remaining problem in our system is that this algorithm does not guarantee that we find a collision-free set of pages in all cases where it \fImight\fR find such a set. It merely does so \fImost\fR of the time with (relatively) few measurements. .LP One possible means of dealing with this problem is to try to remove sets of pages in the hope that enough pages from a set of colliding pages will be removed at once, so that the remaining pages from that collision set won't collide anymore. Suppose you have a 4-way set associative cache, and that you have six pages that collide. If you remove two of the pages, then the remaining four pages don't collide anymore either. This means that by removing two pages we have removed six collisions, which should be easier to detect. .LP XXX Look into randomizing the pages after each iteration of the top-level loop to make this sort of serendipitous event more likely. .NH 2 Measurement .LP In order to reduce the number of memory sizes that are measured by the system, we use a binary search on memory sizes to find "edges" in the memory latency. We make the simplifying assumption that cache sizes are either a power of two, or 1.5 times a power of two. In our experience, this assumption has been true. We also assume that no cache is smaller than 512 bytes. .LP We explore the memory space at intervals equivalent to the most recent power of two divided by four. So, starting at one megabyte we would (potentially) measure memory latency at 1MB, 1.25MB, 1.5MB, and 1.75MB. This allows us to detect cache sizes at the desired intervals, since the measurement at the exact cache size can often be corrupted by other system activity, while the next smaller measurement should still be valid.
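.LP
A sketch of how the candidate sizes might be enumerated (variable names
illustrative):
.DS
/* Candidate sizes from 512 bytes up to maxsize, in quarter-power-of-two steps. */
size_t pow2, size, step;

for (pow2 = 512; pow2 <= maxsize; pow2 *= 2) {
        step = pow2 / 4;
        for (size = pow2; size < 2 * pow2 && size <= maxsize; size += step)
                sizes[nsizes++] = size; /* sizes[] and nsizes are assumed to exist */
}
.DE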
.LP XXX If the measurement size increment is several times larger than a page, then perhaps we should actually measure the system with a couple of pages less than the stated size? This would allow us some "slop" for collisions and might make it easier near cache boundaries to get accurate measurements. The "slop" should probably be some fraction of the measurement increment size, such as 10%, so it scales properly. .LP Since we start with a maximum size as a given, and we use 512 bytes as a minimum, we can compute the full set of possible measurements and initialize an array with the desired sizes. We can then use a modified binary search on this array to efficiently locate cache edges while still (potentially) leaving large, flat plateaus unexplored between the end points. .LP Finally, we assume that true memory latency is monotonically increasing with the amount of memory that you access. This means that if the measured latency ever decreases as you increase the amount of accessed memory, then the previous measurement must have been an error and the value is replaced by the smaller measurement. .NH 2 Data analysis .LP Assuming that the data collected by the system were noise-free and that the experimental system had managed to eliminate all artifacts such as page coloring effects, the next problem is to analyze the data to find the number and size of the caches. Basically this means examining the data to find plateaus and cliffs. Each plateau would represent a cache, and each cliff the edge (size) of that cache. .LP Of course, real data is never perfect, and there are any number of issues which can affect the experimental results, so the analysis methodology must be robust to noise. .LP XXX describe analysis methodology here .NH 1 Cache associativity .LP No modern caches are fully associative (which would allow true LRU replacement across the entire cache), because the performance of such caches is insufficient. Most caches are either set associative or direct mapped, meaning that data from a given location can only go to one of a small number of cache lines, and in the case of a direct-mapped cache to a single cache line. .LP To determine the cache associativity we need to find a set of pages which have no page collisions and which (just) fit into the cache. We then need to locate a page which collides with these pages and append it to the set. Then we can iterate through the pages in the initial page set, removing a page at a time, and comparing the resulting average memory latency with that of the full set. When the average memory latency drops significantly, then we know that this page conflicts with the full page set, and since the page set only has one conflict, we know it conflicts with the newly introduced page. The number of pages that conflict with this newly introduced page is the set associativity. .LP There is a potential bug in this algorithm for systems with victim caches! If the victim cache can hold at least a page of data, then this algorithm cannot properly determine the cache associativity because the victim cache will play the role of additional associative cache lines. .LP For smaller caches there is the additional problem that the cache associativity may not be smaller than the number of pages that the cache may hold, in which case this simple approach will never find pages that collide in the cache.
The solution to this problem is to increase the line size and the number of pages so that only a portion of each page is accessed, and there can be enough pages to create collisions. .NH 1 Memory parallelism .LP With the increasing memory bottleneck, most modern systems allow multiple outstanding memory references. On many systems, the effective parallelism depends on which part of the memory hierarchy is being accessed. For example, L1 caches can often service as many as six or eight outstanding requests, while main memory systems can usually support at most two outstanding requests. .LP To measure the available parallelism for a given chunk of memory, the system sets up a pointer chain running through the memory exactly the same as if it were to measure the average memory latency. It then uses fifteen different access routines, one for each possible level of parallelism.\** .FS The assumption here is that no memory subsystem supports more than sixteen accesses in parallel. .FE Each routine dereferences $N$ pointers in parallel. For example, the inner loop of the routine where $N=2$ would look something like this: .DS while (iterations-- > 0) { p0 = (char**)*p0; p1 = (char**)*p1; } .DE .LP The available parallelism is the maximum speedup over all $N$ compared to the sequential case. .LP Note that this value is often not integral because many factors, such as TLB contention, can limit the effective parallelism. .NH 1 Conclusion .LP \*[lmbench] is a useful, portable micro-benchmark suite designed to measure important aspects of system performance. \*[lmbench3] adds a number of important extensions, such as the ability to measure system scalability. .NH 1 Acknowledgments .LP Many people have provided invaluable help and insight into both the benchmarks themselves and the paper. The \s-1USENIX\s0 reviewers were especially helpful. We thank all of them and especially thank: Wayne Scott \s-1(BitMover)\s0, Larry McVoy \s-1(BitMover)\s0, and Bruce Chapman \s-1(SUN)\s0. .LP We would also like to thank all of the people that have run the benchmark and contributed their results; none of this would have been possible without their assistance. .LP Our thanks to all of the free software community for tools that were used during this project. \*[lmbench] is currently developed on Linux, a copylefted Unix written by Linus Torvalds and his band of happy hackers. This paper and all of the \*[lmbench] documentation were produced using the \f(CWgroff\fP suite of tools written by James Clark. Finally, all of the data processing of the results is done with \f(CWperl\fP written by Larry Wall. .NH 1 Obtaining the benchmarks .LP The benchmarks are available at .ft I http://ftp.bitmover.com/lmbench .ft .\" .R1 .\" bibliography references-lmbench3 .\" .R2 .\"******************************************************************** .\" Redefine the IP paragraph format so it won't insert a useless line .\" break when the paragraph tag is longer than the indent distance .\" .de @IP .if \\n[.$]>1 .nr \\n[.ev]:ai (n;\\$2) .par*start \\n[\\n[.ev]:ai] 0 .if !'\\$1'' \{\ . \" Divert the label so as to freeze any spaces. . di par*label . in 0 . nf \&\\$1 . di . in . fi . chop par*label . ti -\\n[\\n[.ev]:ai]u . ie \\n[dl]+1n<=\\n[\\n[.ev]:ai] \\*[par*label]\h'|\\n[\\n[.ev]:ai]u'\c . el \{\ \\*[par*label] .\". br . \} . rm par*label .\}
.\"******************************************************************** .\" redefine the way the reference tag is printed so it is enclosed in .\" square brackets .\" .de ref*end-print .ie d [F .IP "[\\*([F]" 2 .el .XP \\*[ref*string] .. .\"******************************************************************** .\" Get journal number entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-N .ref*field N "" ( ) .. .\"******************************************************************** .\" Get journal volume entries right. Now will print as V(N) rather .\" than the awful V, N. .\" .de ref*add-V .ref*field V , "" "" "" .. .\"******************************************************************** .\" Get the date entry right. Should not be enclosed in parentheses. .\" .de ref*add-D .ref*field D "," .. .R1 accumulate sort A+DT database references-userguide label-in-text label A.nD.y-2 bracket-label [ ] ", " bibliography references-userguide .R2 .\" .so bios �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/���������������������������������������������������������������������������������0000775�0000764�0000764�00000000000�10723011663�013753� 5����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/Makefile�������������������������������������������������������������������������0000664�0000764�0000764�00000046174�10606700061�015424� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# $Id$ # Make targets: # # lmbench [default] builds the benchmark suite for the current os/arch # results builds, configures run parameters, and runs the benchmark # rerun reruns the benchmark using the same parameters as last time # scaling reruns the benchmark using same parameters as last time, # except it asks what scaling value to use # hardware reruns the hardware benchmarks using the same parameters # os reruns the OS benchmarks using the same parameters # clean cleans out sources and run configuration # clobber clean and removes the bin directories # shar obsolete, use cd .. && make shar # depend builds make dependencies (needs gcc) # debug builds all the benchmarks with '-g' debugging flag # assembler builds the .s files for each benchmark # # This is largely self configuring. Most stuff is pretty portable. # # If you don't have gcc, try make CC=cc and see if that works. # # If you want to do cross-compilation try make OS=armv5tel-linux-gnu # or whatever your OS string should be in the target environment. 
# Since many embedded development environments also have a special # cross-compiler, you might want to also select a particular compiler, # so your build command would look something like: # make OS=armv5tel-linux-gnu CC=gcc-arm # # Overriding the OS and CC make parameters needs to be done as an # argument to make, not as an environment variable. See above comments. # # I finally know why Larry Wall's Makefile says "Grrrr". SHELL=/bin/sh CC=`../scripts/compiler` MAKE=`../scripts/make` AR=ar ARCREATE=cr # base of installation location BASE=/usr/local O= ../bin/unknown D= ../doc TRUE=/bin/true OS=`../scripts/os` TARGET=`../scripts/target` BINDIR=../bin/$(OS) CONFIG=../bin/$(OS)/`../scripts/config` UTILS=../scripts/target ../scripts/os ../scripts/gnu-os ../scripts/compiler \ ../scripts/info ../scripts/info-template ../scripts/version \ ../scripts/config ../scripts/config-run ../scripts/results \ ../scripts/lmbench ../scripts/make ../scripts/build INSTALL=cp RESULTS=Results/$(OS) SAMPLES=lmbench/Results/aix/rs6000 lmbench/Results/hpux/snake \ lmbench/Results/irix/indigo2 lmbench/Results/linux/pentium \ lmbench/Results/osf1/alpha lmbench/Results/solaris/ss20* COMPILE=$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) INCS = bench.h lib_mem.h lib_tcp.h lib_udp.h stats.h timing.h SRCS = bw_file_rd.c bw_mem.c bw_mmap_rd.c bw_pipe.c bw_tcp.c bw_udp.c \ bw_unix.c \ cache.c clock.c disk.c enough.c flushdisk.c getopt.c hello.c \ lat_connect.c lat_ctx.c lat_fcntl.c lat_fifo.c lat_fs.c \ lat_mem_rd.c lat_mmap.c lat_ops.c lat_pagefault.c lat_pipe.c \ lat_proc.c lat_rpc.c lat_select.c lat_sig.c lat_syscall.c \ lat_tcp.c lat_udp.c lat_unix.c lat_unix_connect.c lat_sem.c \ lat_usleep.c lat_pmake.c \ lib_debug.c lib_mem.c lib_stats.c lib_tcp.c lib_timing.c \ lib_udp.c lib_unix.c lib_sched.c \ line.c lmdd.c lmhttp.c par_mem.c par_ops.c loop_o.c memsize.c \ mhz.c msleep.c rhttp.c seek.c timing_o.c tlb.c stream.c \ bench.h lib_debug.h lib_tcp.h lib_udp.h lib_unix.h names.h \ stats.h timing.h version.h ASMS = $O/bw_file_rd.s $O/bw_mem.s $O/bw_mmap_rd.s $O/bw_pipe.s \ $O/bw_tcp.s $O/bw_udp.s $O/bw_unix.s $O/clock.s \ $O/disk.s $O/enough.s $O/flushdisk.s $O/getopt.s $O/hello.s \ $O/lat_connect.s $O/lat_ctx.s lat_fcntl.s $O/lat_fifo.s \ $O/lat_fs.s $O/lat_mem_rd.s $O/lat_mmap.s $O/lat_ops.s \ $O/lat_pagefault.s $O/lat_pipe.s $O/lat_proc.s $O/lat_rpc.s \ $O/lat_select.s $O/lat_sig.s $O/lat_syscall.s $O/lat_tcp.s \ $O/lat_udp.s $O/lat_unix.s $O/lat_unix_connect.s $O/lat_sem.s \ $O/lib_debug.s $O/lib_mem.s \ $O/lib_stats.s $O/lib_tcp.s $O/lib_timing.s $O/lib_udp.s \ $O/lib_unix.s $O/lib_sched.s \ $O/line.s $O/lmdd.s $O/lmhttp.s $O/par_mem.s \ $O/par_ops.s $O/loop_o.s $O/memsize.s $O/mhz.s $O/msleep.s \ $O/rhttp.s $O/timing_o.s $O/tlb.s $O/stream.s \ $O/cache.s $O/lat_dram_page.s $O/lat_pmake.s $O/lat_rand.s \ $O/lat_usleep.s $O/lat_cmd.s EXES = $O/bw_file_rd $O/bw_mem $O/bw_mmap_rd $O/bw_pipe $O/bw_tcp \ $O/bw_unix $O/hello \ $O/lat_select $O/lat_pipe $O/lat_rpc $O/lat_syscall $O/lat_tcp \ $O/lat_udp $O/lat_mmap $O/mhz $O/lat_proc $O/lat_pagefault \ $O/lat_connect $O/lat_fs $O/lat_sig $O/lat_mem_rd $O/lat_ctx \ $O/lat_sem \ $O/memsize $O/lat_unix $O/lmdd $O/timing_o $O/enough \ $O/msleep $O/loop_o $O/lat_fifo $O/lmhttp $O/lat_http \ $O/lat_fcntl $O/disk $O/lat_unix_connect $O/flushdisk \ $O/lat_ops $O/line $O/tlb $O/par_mem $O/par_ops \ $O/stream OPT_EXES=$O/cache $O/lat_dram_page $O/lat_pmake $O/lat_rand \ $O/lat_usleep $O/lat_cmd LIBOBJS= $O/lib_tcp.o $O/lib_udp.o $O/lib_unix.o $O/lib_timing.o \ $O/lib_mem.o $O/lib_stats.o 
$O/lib_debug.o $O/getopt.o \ $O/lib_sched.o lmbench: $(UTILS) @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build all -@env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="k$(MAKEFLAGS)" CC="$(CC)" OS="$(OS)" ../scripts/build opt results: lmbench @env OS="${OS}" ../scripts/config-run @env OS="${OS}" ../scripts/results rerun: lmbench @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi @env OS="${OS}" ../scripts/results scaling: lmbench @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; \ else ../scripts/config-scaling $(CONFIG); fi @env OS="${OS}" ../scripts/results hardware: lmbench @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi @env OS="${OS}" BENCHMARK_HARDWARE=YES BENCHMARK_OS=NO ../scripts/results os: lmbench @if [ ! -f $(CONFIG) ]; then env OS="${OS}" ../scripts/config-run; fi @env OS="${OS}" BENCHMARK_HARDWARE=NO BENCHMARK_OS=YES ../scripts/results install: lmbench @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build install-target install-target: if [ ! -d $(BASE) ]; then mkdir $(BASE); fi if [ ! -d $(BASE)/bin ]; then mkdir $(BASE)/bin; fi if [ ! -d $(BASE)/include ]; then mkdir $(BASE)/include; fi if [ ! -d $(BASE)/lib ]; then mkdir $(BASE)/lib; fi cp $(EXES) $(BASE)/bin cp $(INCS) $(BASE)/include cp $O/lmbench.a $(BASE)/lib/libmbench.a cd ../doc; env MAKEFLAGS="$(MAKEFLAGS)" make CC="${CC}" OS="${OS}" BASE="$(BASE)" install # No special handling for all these all: $(EXES) $O/lmbench opt: $(OPT_EXES) asm: $(ASMS) $(ASMS): $(CC) -S $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ `basename $@ .s`.c Wall: @env CFLAGS="-g -O -Wall" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt debug: @env CFLAGS="-g -O -DDEBUG" MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build all opt assembler: @env CFLAGS=-O MAKE="$(MAKE)" MAKEFLAGS="$(MAKEFLAGS)" CC="${CC}" OS="${OS}" ../scripts/build asm tag: ROOT=`cat ../CVS/Root`; \ MODULE=`cat ../CVS/Repository`; \ VERSION=`../scripts/version`; \ TAG=`echo lmbench_$${VERSION} | sed -e 's/-/_/g' -e 's/\\./_/g'`; \ cd .. \ && cvs -d$${ROOT} tag -c "$${TAG}" dist: SRCDIR=`pwd`; \ ROOT=`cat ../CVS/Root`; \ MODULE=`cat ../CVS/Repository`; \ VERSION=`../scripts/version`; \ cd /tmp \ && cvs -d$${ROOT} export -Dtomorrow \ -d $${MODULE}-$${VERSION} $${MODULE} \ && chmod +x $${MODULE}-$${VERSION}/scripts/[a-z]* \ && mv $${MODULE}-$${VERSION} lmbench-$${VERSION} \ && tar czf $${SRCDIR}/../../lmbench-$${VERSION}.tgz \ lmbench-$${VERSION} \ && rm -rf lmbench-$${VERSION}; clean: /bin/rm -f ../bin/*/CONFIG ../bin/*/*.[oas] /bin/rm -f *.[oas] clobber: /bin/rm -rf ../bin* SHAR shar: cd ../.. 
&& shar lmbench/Results/Makefile $(SAMPLES) lmbench/scripts/* lmbench/src/Makefile lmbench/src/*.[ch] > lmbench/SHAR depend: ../scripts/depend ../scripts/depend testmake: $(SRCS) $(UTILS) # used by scripts/make to test gmake @true .PHONY: lmbench results rerun hardware os install all Wall debug \ install install-target dist get edit get-e clean clobber \ share depend testmake $O/lmbench : ../scripts/lmbench version.h rm -f $O/lmbench VERSION=`../scripts/version`; \ sed -e "s/<version>/$${VERSION}/g" < ../scripts/lmbench > $O/lmbench chmod +x $O/lmbench $O/lmbench.a: $(LIBOBJS) /bin/rm -f $O/lmbench.a $(AR) $(ARCREATE) $O/lmbench.a $(LIBOBJS) -ranlib $O/lmbench.a $O/lib_timing.o : lib_timing.c $(INCS) $(COMPILE) -c lib_timing.c -o $O/lib_timing.o $O/lib_mem.o : lib_mem.c $(INCS) $(COMPILE) -c lib_mem.c -o $O/lib_mem.o $O/lib_tcp.o : lib_tcp.c $(INCS) $(COMPILE) -c lib_tcp.c -o $O/lib_tcp.o $O/lib_udp.o : lib_udp.c $(INCS) $(COMPILE) -c lib_udp.c -o $O/lib_udp.o $O/lib_unix.o : lib_unix.c $(INCS) $(COMPILE) -c lib_unix.c -o $O/lib_unix.o $O/lib_debug.o : lib_debug.c $(INCS) $(COMPILE) -c lib_debug.c -o $O/lib_debug.o $O/lib_stats.o : lib_stats.c $(INCS) $(COMPILE) -c lib_stats.c -o $O/lib_stats.o $O/lib_sched.o : lib_sched.c $(INCS) $(COMPILE) -c lib_sched.c -o $O/lib_sched.o $O/getopt.o : getopt.c $(INCS) $(COMPILE) -c getopt.c -o $O/getopt.o $(UTILS) : -cd ../scripts; make get # Do not remove the next line, $(MAKE) depend needs it # MAKEDEPEND follows $O/rhttp.s:rhttp.c timing.h stats.h bench.h $O/rhttp: rhttp.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/rhttp rhttp.c $O/lmbench.a $(LDLIBS) $O/http.s:http.c timing.h stats.h bench.h $O/http: http.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/http http.c $O/lmbench.a $(LDLIBS) $O/flushdisk.s:flushdisk.c $O/flushdisk: flushdisk.c $(COMPILE) -DMAIN -o $O/flushdisk flushdisk.c $O/mhz.s: mhz.c timing.h stats.h bench.h $O/mhz: mhz.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/mhz mhz.c $O/lmbench.a $(LDLIBS) -lm $O/lat_ctx.s: lat_ctx.c timing.h stats.h bench.h $O/lat_ctx: lat_ctx.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_ctx lat_ctx.c $O/lmbench.a $(LDLIBS) $O/lmhttp.s:lmhttp.c timing.h stats.h bench.h $O/lmhttp: lmhttp.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lmhttp lmhttp.c $O/lmbench.a $(LDLIBS) $O/lat_http.s:lat_http.c timing.h stats.h bench.h $O/lat_http: lat_http.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_http lat_http.c $O/lmbench.a $(LDLIBS) $O/bw_file_rd.s:bw_file_rd.c timing.h stats.h bench.h $O/bw_file_rd: bw_file_rd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/bw_file_rd bw_file_rd.c $O/lmbench.a $(LDLIBS) $O/bw_mem.s:bw_mem.c timing.h stats.h bench.h $O/bw_mem: bw_mem.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/bw_mem bw_mem.c $O/lmbench.a $(LDLIBS) $O/bw_mmap_rd.s:bw_mmap_rd.c timing.h stats.h bench.h $O/bw_mmap_rd: bw_mmap_rd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/bw_mmap_rd bw_mmap_rd.c $O/lmbench.a $(LDLIBS) $O/bw_pipe.s:bw_pipe.c timing.h stats.h bench.h $O/bw_pipe: bw_pipe.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/bw_pipe bw_pipe.c $O/lmbench.a $(LDLIBS) $O/bw_tcp.s:bw_tcp.c bench.h timing.h stats.h lib_tcp.h $O/bw_tcp: bw_tcp.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a $(COMPILE) -o $O/bw_tcp bw_tcp.c $O/lmbench.a $(LDLIBS) $O/bw_udp.s:bw_udp.c bench.h timing.h stats.h lib_udp.h $O/bw_udp: bw_udp.c bench.h timing.h stats.h lib_udp.h $O/lmbench.a $(COMPILE) -o $O/bw_udp bw_udp.c 
$O/lmbench.a $(LDLIBS) $O/bw_unix.s:bw_unix.c timing.h stats.h bench.h $O/bw_unix: bw_unix.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/bw_unix bw_unix.c $O/lmbench.a $(LDLIBS) $O/disk.s:disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h $O/disk: disk.c flushdisk.c bench.h timing.h stats.h lib_tcp.h $O/lmbench.a $(COMPILE) -o $O/disk disk.c $O/lmbench.a $(LDLIBS) $O/clock.s:clock.c timing.h stats.h bench.h $O/clock: clock.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/clock clock.c $O/lmbench.a $(LDLIBS) $O/hello.s:hello.c $O/hello: hello.c $O/lmbench.a $(COMPILE) -o $O/hello hello.c $O/lmbench.a $(LDLIBS) $O/lat_alarm.s:lat_alarm.c timing.h stats.h bench.h $O/lat_alarm: lat_alarm.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_alarm lat_alarm.c $O/lmbench.a $(LDLIBS) $O/lat_connect.s:lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lat_connect: lat_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a $(COMPILE) -o $O/lat_connect lat_connect.c $O/lmbench.a $(LDLIBS) $O/lat_unix_connect.s:lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lat_unix_connect: lat_unix_connect.c lib_tcp.c bench.h lib_tcp.h timing.h stats.h $O/lmbench.a $(COMPILE) -o $O/lat_unix_connect lat_unix_connect.c $O/lmbench.a $(LDLIBS) $O/lat_fs.s:lat_fs.c timing.h stats.h bench.h $O/lat_fs: lat_fs.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_fs lat_fs.c $O/lmbench.a $(LDLIBS) $O/lat_fcntl.s:lat_fcntl.c timing.h stats.h bench.h $O/lat_fcntl: lat_fcntl.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_fcntl lat_fcntl.c $O/lmbench.a $(LDLIBS) $O/lat_mem_rd.s:lat_mem_rd.c timing.h stats.h bench.h $O/lat_mem_rd: lat_mem_rd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mem_rd lat_mem_rd.c $O/lmbench.a $(LDLIBS) $O/lat_mem_rd2.s:lat_mem_rd2.c timing.h stats.h bench.h $O/lat_mem_rd2: lat_mem_rd2.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mem_rd2 lat_mem_rd2.c $O/lmbench.a $(LDLIBS) $O/lat_mem_wr.s:lat_mem_wr.c timing.h stats.h bench.h $O/lat_mem_wr: lat_mem_wr.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mem_wr lat_mem_wr.c $O/lmbench.a $(LDLIBS) $O/lat_mem_wr2.s:lat_mem_wr2.c timing.h stats.h bench.h $O/lat_mem_wr2: lat_mem_wr2.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mem_wr2 lat_mem_wr2.c $O/lmbench.a $(LDLIBS) $O/lat_mmap.s:lat_mmap.c timing.h stats.h bench.h $O/lat_mmap: lat_mmap.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mmap lat_mmap.c $O/lmbench.a $(LDLIBS) $O/lat_mmaprd.s:lat_mmaprd.c timing.h stats.h bench.h $O/lat_mmaprd: lat_mmaprd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_mmaprd lat_mmaprd.c $O/lmbench.a $(LDLIBS) $O/lat_ops.s:lat_ops.c timing.h stats.h bench.h $O/lat_ops: lat_ops.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_ops lat_ops.c $O/lmbench.a $(LDLIBS) $O/lat_pagefault.s:lat_pagefault.c timing.h stats.h bench.h $O/lat_pagefault: lat_pagefault.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_pagefault lat_pagefault.c $O/lmbench.a $(LDLIBS) $O/lat_pipe.s:lat_pipe.c timing.h stats.h bench.h $O/lat_pipe: lat_pipe.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_pipe lat_pipe.c $O/lmbench.a $(LDLIBS) $O/lat_fifo.s:lat_fifo.c timing.h stats.h bench.h $O/lat_fifo: lat_fifo.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_fifo lat_fifo.c $O/lmbench.a $(LDLIBS) $O/lat_proc.s:lat_proc.c timing.h stats.h bench.h $O/lat_proc: lat_proc.c timing.h stats.h 
bench.h $O/lmbench.a $(COMPILE) -o $O/lat_proc lat_proc.c $O/lmbench.a $(LDLIBS) $O/lat_rpc.s:lat_rpc.c timing.h stats.h bench.h $O/lat_rpc: lat_rpc.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_rpc lat_rpc.c $O/lmbench.a $(LDLIBS) $O/lat_sig.s:lat_sig.c timing.h stats.h bench.h $O/lat_sig: lat_sig.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_sig lat_sig.c $O/lmbench.a $(LDLIBS) $O/lat_syscall.s:lat_syscall.c timing.h stats.h bench.h $O/lat_syscall: lat_syscall.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_syscall lat_syscall.c $O/lmbench.a $(LDLIBS) $O/lat_select.s: lat_select.c timing.h stats.h bench.h $O/lat_select: lat_select.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_select lat_select.c $O/lmbench.a $(LDLIBS) $O/lat_tcp.s:lat_tcp.c timing.h stats.h bench.h lib_tcp.h $O/lat_tcp: lat_tcp.c timing.h stats.h bench.h lib_tcp.h $O/lmbench.a $(COMPILE) -o $O/lat_tcp lat_tcp.c $O/lmbench.a $(LDLIBS) $O/lat_udp.s:lat_udp.c timing.h stats.h bench.h lib_udp.h $O/lat_udp: lat_udp.c timing.h stats.h bench.h lib_udp.h $O/lmbench.a $(COMPILE) -o $O/lat_udp lat_udp.c $O/lmbench.a $(LDLIBS) $O/lat_unix.s:lat_unix.c timing.h stats.h bench.h $O/lat_unix: lat_unix.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_unix lat_unix.c $O/lmbench.a $(LDLIBS) $O/lib_tcp.s:lib_tcp.c bench.h lib_tcp.h $O/lib_tcp: lib_tcp.c bench.h lib_tcp.h $O/lmbench.a $(COMPILE) -o $O/lib_tcp lib_tcp.c $O/lmbench.a $(LDLIBS) $O/lib_udp.s:lib_udp.c bench.h lib_udp.h $O/lib_udp: lib_udp.c bench.h lib_udp.h $O/lmbench.a $(COMPILE) -o $O/lib_udp lib_udp.c $O/lmbench.a $(LDLIBS) $O/lmdd.s:lmdd.c timing.h stats.h bench.h $O/lmdd: lmdd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lmdd lmdd.c $O/lmbench.a $(LDLIBS) $O/enough.s:enough.c timing.h stats.h bench.h $O/enough: enough.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/enough enough.c $O/lmbench.a $(LDLIBS) $O/loop_o.s:loop_o.c timing.h stats.h bench.h $O/loop_o: loop_o.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/loop_o loop_o.c $O/lmbench.a $(LDLIBS) $O/timing_o.s:timing_o.c timing.h stats.h bench.h $O/timing_o: timing_o.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/timing_o timing_o.c $O/lmbench.a $(LDLIBS) $O/memsize.s:memsize.c timing.h stats.h bench.h $O/memsize: memsize.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/memsize memsize.c $O/lmbench.a $(LDLIBS) $O/msleep.s:msleep.c timing.h stats.h bench.h $O/msleep: msleep.c timing.h stats.h bench.h $(COMPILE) -o $O/msleep msleep.c $O/line.s: line.c timing.h stats.h bench.h $O/line: line.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/line line.c $O/lmbench.a $(LDLIBS) $O/tlb.s:tlb.c timing.h stats.h bench.h $O/tlb: tlb.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/tlb tlb.c $O/lmbench.a $(LDLIBS) $O/cache.s:cache.c timing.h stats.h bench.h $O/cache: cache.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/cache cache.c $O/lmbench.a $(LDLIBS) $O/par_mem.s:par_mem.c timing.h stats.h bench.h $O/par_mem: par_mem.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/par_mem par_mem.c $O/lmbench.a $(LDLIBS) $O/par_ops.s:par_ops.c timing.h stats.h bench.h $O/par_ops: par_ops.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/par_ops par_ops.c $O/lmbench.a $(LDLIBS) $O/stream.s:stream.c timing.h stats.h bench.h $O/stream: stream.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/stream stream.c $O/lmbench.a $(LDLIBS) $O/lat_sem.s:lat_sem.c timing.h stats.h bench.h 
$O/lat_sem: lat_sem.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_sem lat_sem.c $O/lmbench.a $(LDLIBS) $O/par_list.s:par_list.c timing.h stats.h bench.h $O/par_list: par_list.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/par_list par_list.c $O/lmbench.a $(LDLIBS) $O/lat_dram_page.s:lat_dram_page.c timing.h stats.h bench.h $O/lat_dram_page: lat_dram_page.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_dram_page lat_dram_page.c $O/lmbench.a $(LDLIBS) $O/lat_usleep.s:lat_usleep.c timing.h stats.h bench.h $O/lat_usleep: lat_usleep.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_usleep lat_usleep.c $O/lmbench.a $(LDLIBS) $O/lat_pmake.s:lat_pmake.c timing.h stats.h bench.h $O/lat_pmake: lat_pmake.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_pmake lat_pmake.c $O/lmbench.a $(LDLIBS) $O/lat_rand.s:lat_rand.c timing.h stats.h bench.h $O/lat_rand: lat_rand.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_rand lat_rand.c $O/lmbench.a $(LDLIBS) $O/lat_cmd.s:lat_cmd.c timing.h stats.h bench.h $O/lat_cmd: lat_cmd.c timing.h stats.h bench.h $O/lmbench.a $(COMPILE) -o $O/lat_cmd lat_cmd.c $O/lmbench.a $(LDLIBS) ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/TODO�����������������������������������������������������������������������������0000664�0000764�0000764�00000007650�10106122233�014442� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������$Id$ Add standard deviation and other statistics calculations to "make stats" in results. Alternatively, we might report min, 1Q, median, 3Q, max, as standard deviation for non-normal distributions isn't always sensible. Add flags to various file-related benchmarks bw_file_rd, bw_mmap_rd, lat_fcntl.c, lat_fs, lat_mmap, and lat_pagefault, for parallelism which selects whether each instance has its own file or shares a file. Figure out how to improve lat_select. It doesn't really work for multi-processor systems. Linus suggests that we have each process do some amount of work, and vary the amount of work until context switch times for the producer degrade. The current architecture of lat_select is too synchronous and favors simple hand-off scheduling too much. From Linus. Look into threads vs. process scaling. benchmp currently uses separate processes (via fork()); some benchmarks such as page faults and VM mapping might have very different performance for threads vs. processes since Linux (at least) has per-memory space locks for many of these things. From Linus. Add a '-f' option to lat_ctx which causes the work to be floating point summation (so we get floating point state too). (Suggestion by Ingo Molnar) Add a threads benchmark suite (context switch, mutex, semaphore, ...). Create a new process for each measurement, rather than reusing the same process. 
This is mostly to get different page layouts and mostly impacts the memory latency benchmarks, although it can also affect lat_ctx. Write/extend the results processing system/scripts to graph/display/ process results in the "-P <parallelism>" dimension, and to properly handle results with differing parallelism when reporting standard results. The parallelism is stored in the results file as SYNC_MAX. Add "bw_udp" benchmark to measure UDP bandwidth [in progress] Make a bw_tcp mode that measures bandwidth for each block and graph that as offset/bandwidth. Make the disk stuff autosize such that you get the same number of data points regardless of disk size. Fix the getsummary to include simple calls. Think about the issues of int/long/long long/double/long double load/stores. Maybe do them all. This will (at least) require adding a test to scripts/build for the presence of long double on this system. Make all results print out bandwidths in powers of 10/sizes in powers of two. Documentation on porting. Check that file size is right in the benchmarking system. Compiler version info included in results. XXX - do this! memory store latency (complex) Why can't I just use the read one and make it write? Well, because the read one is list oriented and I need to figure out reasonable math for the write case. The read one is a load per statement whereas the write one will be more work, I think. RPC numbers reserved for the benchmark. Check all the error outputs and make sure they are consistent. On all the normalized graphs, make sure that they mean the same thing. I do not think that the bandwidth measurements are "correct" in this sense. Document the timing.c interfaces. Run the whole suite through gcc -Wall and fix all the errors. Also make sure that it compiles and has the right sizes for 64 bit OS. [Mon Jul 1 13:30:01 PDT 1996, after meeting w/ Kevin] Do the load latency like so loop: load r1 { increase the number of nops until they start to make the run time longer - the last one was the memory latency. } use the register { increase the number of nops until they start to make the run time longer - the last one was the cache fill shadow. } repeat Do the same thing w/ a varying number of loads (& their uses), showing the number of outstanding loads implemented to L1, L2, mem. Do hand made assembler to get accurate numbers. Provide C source that mimics the hand made assembler for new machines. Think about a report format for the hardware stuff that showed the numbers as triples L1/L2/mem (or quadruples for alphas). 
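A possible shape for the five-number summary mentioned above (min, 1Q, median, 3Q, max) -- sketch only, not part of lmbench; the names summary_t, five_number_summary, and quantile are made up here, and it assumes the timing samples are already available as an array of doubles:

/*
 * Sketch of a five-number summary for "make stats".
 * Hypothetical names; only standard C library calls are used.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
        double  min, q1, median, q3, max;
} summary_t;

static int
dblcmp(const void *a, const void *b)
{
        double  x = *(const double *)a;
        double  y = *(const double *)b;
        return (x < y) ? -1 : (x > y) ? 1 : 0;
}

/* linear interpolation between the two samples spanning quantile q */
static double
quantile(double *v, int n, double q)
{
        double  pos = q * (n - 1);
        int     i = (int)pos;
        double  frac = pos - i;

        if (i + 1 >= n) return (v[n - 1]);
        return (v[i] + frac * (v[i + 1] - v[i]));
}

summary_t
five_number_summary(double *v, int n)
{
        summary_t s;

        qsort(v, n, sizeof(double), dblcmp);
        s.min = v[0];
        s.q1 = quantile(v, n, 0.25);
        s.median = quantile(v, n, 0.50);
        s.q3 = quantile(v, n, 0.75);
        s.max = v[n - 1];
        return (s);
}

int
main(void)
{
        double  samples[] = { 3.1, 2.9, 3.0, 3.3, 9.7, 3.0, 3.2 };
        int     n = sizeof(samples) / sizeof(samples[0]);
        summary_t s = five_number_summary(samples, n);

        printf("min %.2f 1Q %.2f median %.2f 3Q %.2f max %.2f\n",
            s.min, s.q1, s.median, s.q3, s.max);
        return (0);
}

Reporting min/1Q/median/3Q/max keeps the summary meaningful when the timing
distribution is skewed by occasional outliers, which is exactly the case where
a standard deviation misleads.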
����������������������������������������������������������������������������������������lmbench-3.0-a9/src/bench.h��������������������������������������������������������������������������0000664�0000764�0000764�00000020247�10450256147�015215� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * $Id$ */ #ifndef _BENCH_H #define _BENCH_H #ifdef WIN32 #include <windows.h> typedef unsigned char bool_t; #endif #include <assert.h> #include <ctype.h> #include <stdio.h> #include <math.h> #ifndef WIN32 #include <unistd.h> #endif #include <stdlib.h> #include <fcntl.h> #include <signal.h> #include <errno.h> #ifndef WIN32 #include <strings.h> #endif #include <sys/types.h> #ifndef WIN32 #include <sys/mman.h> #endif #include <sys/stat.h> #ifndef WIN32 #include <sys/wait.h> #include <time.h> #include <sys/time.h> #include <sys/socket.h> #include <sys/un.h> #include <sys/resource.h> #define PORTMAP #include <rpc/rpc.h> #endif #ifdef HAVE_pmap_clnt_h #include <rpc/pmap_clnt.h> #endif #include <rpc/types.h> #ifdef HAVE_pmap_clnt_h #include <rpc/pmap_clnt.h> #endif #include <stdarg.h> #ifndef HAVE_uint typedef unsigned int uint; #endif #ifndef S_IREAD #define S_IREAD S_IRUSR #endif #ifndef S_IWRITE #define S_IWRITE S_IWUSR #endif #ifndef S_IEXEC #define S_IEXEC S_IXUSR #endif #ifndef HAVE_uint64 #ifdef HAVE_uint64_t typedef uint64_t uint64; #else /* HAVE_uint64_t */ typedef unsigned long long uint64; #endif /* HAVE_uint64_t */ #endif /* HAVE_uint64 */ #ifndef HAVE_int64 #ifdef HAVE_int64_t typedef int64_t int64; #else /* HAVE_int64_t */ typedef long long int64; #endif /* HAVE_int64_t */ #endif /* HAVE_int64 */ #ifndef HAVE_socklen_t typedef int socklen_t; #endif #ifndef HAVE_off64_t typedef int64 off64_t; #endif #define NO_PORTMAPPER #include "stats.h" #include "timing.h" #include "lib_debug.h" #include "lib_tcp.h" #include "lib_udp.h" #include "lib_unix.h" #ifdef DEBUG # define debug(x) fprintf x #else # define debug(x) #endif #ifdef NO_PORTMAPPER #define TCP_SELECT -31233 #define TCP_XACT -31234 #define TCP_CONTROL -31235 #define TCP_DATA -31236 #define TCP_CONNECT -31237 #define UDP_XACT -31238 #define UDP_DATA -31239 #else #define TCP_SELECT (u_long)404038 /* XXX - unregistered */ #define TCP_XACT (u_long)404039 /* XXX - unregistered */ #define TCP_CONTROL (u_long)404040 /* XXX - unregistered */ #define TCP_DATA (u_long)404041 /* XXX - unregistered */ #define TCP_CONNECT (u_long)404042 /* XXX - unregistered */ #define UDP_XACT (u_long)404032 /* XXX - unregistered */ #define UDP_DATA (u_long)404033 /* XXX - unregistered */ #define VERS (u_long)1 #endif #define UNIX_CONTROL "/tmp/lmbench.ctl" #define UNIX_DATA "/tmp/lmbench.data" #define UNIX_LAT "/tmp/lmbench.lat" /* * socket send/recv buffer optimizations */ #define SOCKOPT_READ 0x0001 #define SOCKOPT_WRITE 0x0002 #define SOCKOPT_RDWR 0x0003 #define SOCKOPT_PID 0x0004 #define SOCKOPT_REUSE 0x0008 #define SOCKOPT_NONE 0 #ifndef SOCKBUF #define SOCKBUF (1024*1024) #endif #ifndef XFERSIZE #define XFERSIZE (64*1024) /* all bandwidth I/O should use this */ #endif #if defined(SYS5) || defined(WIN32) #define bzero(b, len) memset(b, 0, len) #define bcopy(s, d, l) memcpy(d, s, l) #define rindex(s, c) strrchr(s, c) #endif #define gettime usecs_spent #define 
streq !strcmp #define ulong unsigned long #ifndef HAVE_DRAND48 #ifdef HAVE_RAND #define srand48 srand #define drand48() ((double)rand() / (double)RAND_MAX) #elif defined(HAVE_RANDOM) #define srand48 srandom #define drand48() ((double)random() / (double)RAND_MAX) #endif /* HAVE_RAND */ #endif /* HAVE_DRAND48 */ #ifdef WIN32 #include <process.h> #define getpid _getpid int gettimeofday(struct timeval *tv, struct timezone *tz); #endif #define SMALLEST_LINE 32 /* smallest cache line size */ #define TIME_OPEN2CLOSE #define GO_AWAY signal(SIGALRM, exit); alarm(60 * 60); #define REAL_SHORT 50000 #define SHORT 1000000 #define MEDIUM 2000000 #define LONGER 7500000 /* for networking data transfers */ #define ENOUGH REAL_SHORT #define TRIES 11 typedef struct { uint64 u; uint64 n; } value_t; typedef struct { int N; value_t v[TRIES]; } result_t; int sizeof_result(int N); void insertinit(result_t *r); void insertsort(uint64, uint64, result_t *); void save_median(); void save_minimum(); void set_results(result_t *r); result_t* get_results(); #define BENCHO(loop_body, overhead_body, enough) { \ int __i, __N; \ double __oh; \ result_t __overhead, __r; \ insertinit(&__overhead); insertinit(&__r); \ __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ if (enough < LONGER) {loop_body;} /* warm the cache */ \ for (__i = 0; __i < __N; ++__i) { \ BENCH1(overhead_body, enough); \ if (gettime() > 0) \ insertsort(gettime(), get_n(), &__overhead); \ BENCH1(loop_body, enough); \ if (gettime() > 0) \ insertsort(gettime(), get_n(), &__r); \ } \ for (__i = 0; __i < __r.N; ++__i) { \ __oh = __overhead.v[__i].u / (double)__overhead.v[__i].n; \ if (__r.v[__i].u > (uint64)((double)__r.v[__i].n * __oh)) \ __r.v[__i].u -= (uint64)((double)__r.v[__i].n * __oh); \ else \ __r.v[__i].u = 0; \ } \ *(get_results()) = __r; \ } #define BENCH(loop_body, enough) { \ long __i, __N; \ result_t __r; \ insertinit(&__r); \ __N = (enough == 0 || get_enough(enough) <= 100000) ? TRIES : 1;\ if (enough < LONGER) {loop_body;} /* warm the cache */ \ for (__i = 0; __i < __N; ++__i) { \ BENCH1(loop_body, enough); \ if (gettime() > 0) \ insertsort(gettime(), get_n(), &__r); \ } \ *(get_results()) = __r; \ } #define BENCH1(loop_body, enough) { \ double __usecs; \ BENCH_INNER(loop_body, enough); \ __usecs = gettime(); \ __usecs -= t_overhead() + get_n() * l_overhead(); \ settime(__usecs >= 0. ? (uint64)__usecs : 0); \ } #define BENCH_INNER(loop_body, enough) { \ static iter_t __iterations = 1; \ int __enough = get_enough(enough); \ iter_t __n; \ double __result = 0.; \ \ while(__result < 0.95 * __enough) { \ start(0); \ for (__n = __iterations; __n > 0; __n--) { \ loop_body; \ } \ __result = stop(0,0); \ if (__result < 0.99 * __enough \ || __result > 1.2 * __enough) { \ if (__result > 150.) 
{ \ double tmp = __iterations / __result; \ tmp *= 1.1 * __enough; \ __iterations = (iter_t)(tmp + 1); \ } else { \ if (__iterations > (iter_t)1<<27) { \ __result = 0.; \ break; \ } \ __iterations <<= 3; \ } \ } \ } /* while */ \ save_n((uint64)__iterations); settime((uint64)__result); \ } /* getopt stuff */ #define getopt mygetopt #define optind myoptind #define optarg myoptarg #define opterr myopterr #define optopt myoptopt extern int optind; extern int opterr; extern int optopt; extern char *optarg; int getopt(int ac, char **av, char *opts); void lmbench_usage(int argc, char *argv[], char* usage); off64_t seekto(int fd, off64_t off, int whence); typedef u_long iter_t; typedef void (*benchmp_f)(iter_t iterations, void* cookie); extern void benchmp(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie ); /* * These are used by weird benchmarks which cannot return, such as page * protection fault handling. See lat_sig.c for sample usage. */ extern void* benchmp_getstate(); extern iter_t benchmp_interval(void* _state); /* * Which child process is this? * Returns a number in the range [0, ..., N-1], where N is the * total number of children (parallelism) */ extern int benchmp_childid(); /* * harvest dead children to prevent zombies */ extern void sigchld_wait_handler(int signo); /* * Handle optional pinning/placement of processes on an SMP machine. */ extern int handle_scheduler(int childno, int benchproc, int nbenchprocs); extern int sched_pin(int cpu); #include "lib_mem.h" /* * Generated from msg.x which is included here: program XACT_PROG { version XACT_VERS { char RPC_XACT(char) = 1; } = 1; } = 3970; * Please do not edit this file. * It was generated using rpcgen. */ #define XACT_PROG ((u_long)404040) #define XACT_VERS ((u_long)1) #define RPC_XACT ((u_long)1) #define RPC_EXIT ((u_long)2) extern char *rpc_xact_1(); extern char *client_rpc_xact_1(); #endif /* _BENCH_H */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/busy.c���������������������������������������������������������������������������0000664�0000764�0000764�00000000130�07045412511�015073� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������volatile int i; main() { nice(10); for (;;) getppid(); //for (;;) i++; exit(i); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_file_rd.c���������������������������������������������������������������������0000664�0000764�0000764�00000007724�10620624535�016231� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_file_rd.c - time reading & summing of a file * * Usage: bw_file_rd [-C] [-P <parallelism] [-W <warmup>] [-N <repetitions>] size file * * The intent is that the file is in memory. * Disk benchmarking is done with lmdd. * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #define CHK(x) if ((int)(x) == -1) { perror(#x); exit(1); } #ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif #define TYPE int #define MINSZ (sizeof(TYPE) * 128) void *buf; /* do the I/O here */ size_t xfersize; /* do it in units of this */ size_t count; /* bytes to move (can't be modified) */ typedef struct _state { char filename[256]; int fd; int clone; } state_t; void doit(int fd) { size_t size, chunk; size = count; chunk = xfersize; while (size > 0) { if (size < chunk) chunk = size; if (read(fd, buf, MIN(size, chunk)) <= 0) { break; } bread(buf, MIN(size, xfersize)); size -= chunk; } } void initialize(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->fd = -1; if (state->clone) { char buf[128]; char* s; /* copy original file into a process-specific one */ sprintf(buf, "%d", (int)getpid()); s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); sprintf(s, "%s%d", state->filename, (int)getpid()); if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { perror("creating private tempfile"); unlink(s); exit(1); } strcpy(state->filename, s); } } void init_open(iter_t iterations, void * cookie) { state_t *state = (state_t *) cookie; int ofd; if (iterations) return; initialize(0, cookie); CHK(ofd = open(state->filename, O_RDONLY)); state->fd = ofd; } void time_with_open(iter_t iterations, void * cookie) { state_t *state = (state_t *) cookie; char *filename = state->filename; int fd; while (iterations-- > 0) { fd = open(filename, O_RDONLY); doit(fd); close(fd); } } void time_io_only(iter_t iterations,void * cookie) { state_t *state = (state_t *) cookie; int fd = state->fd; while (iterations-- > 0) { lseek(fd, 0, SEEK_SET); doit(fd); } } void cleanup(iter_t iterations, void * cookie) { state_t *state = (state_t *) cookie; if (iterations) return; if (state->fd >= 0) close(state->fd); if (state->clone) unlink(state->filename); } int main(int ac, char **av) { int fd; state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char usage[1024]; sprintf(usage,"[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|io_only <filename>" "\nmin size=%d\n",(int) (XFERSIZE>>10)) ; state.clone = 0; while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'C': state.clone = 1; break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 3 != ac) { /* should have three arguments left */ 
lmbench_usage(ac, av, usage); } strcpy(state.filename,av[optind+2]); count = bytes(av[optind]); if (count < MINSZ) { exit(1); /* I want this to be quiet */ } if (count < XFERSIZE) { xfersize = count; } else { xfersize = XFERSIZE; } buf = (void *)valloc(XFERSIZE); bzero(buf, XFERSIZE); if (!strcmp("open2close", av[optind+1])) { benchmp(initialize, time_with_open, cleanup, 0, parallel, warmup, repetitions, &state); } else if (!strcmp("io_only", av[optind+1])) { benchmp(init_open, time_io_only, cleanup, 0, parallel, warmup, repetitions, &state); } else lmbench_usage(ac, av, usage); bandwidth(count, get_n() * parallel, 0); return (0); } ��������������������������������������������lmbench-3.0-a9/src/bw_mem.c�������������������������������������������������������������������������0000664�0000764�0000764�00000032376�10620624535�015404� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_mem.c - simple memory write bandwidth benchmark * * Usage: bw_mem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size what * what: rd wr rdwr cp fwr frd fcp bzero bcopy * * Copyright (c) 1994-1996 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$"; #include "bench.h" #define TYPE int /* * rd - 4 byte read, 32 byte stride * wr - 4 byte write, 32 byte stride * rdwr - 4 byte read followed by 4 byte write to same place, 32 byte stride * cp - 4 byte read then 4 byte write to different place, 32 byte stride * fwr - write every 4 byte word * frd - read every 4 byte word * fcp - copy every 4 byte word * * All tests do 512 byte chunks in a loop. * * XXX - do a 64bit version of this. 
*/ void rd(iter_t iterations, void *cookie); void wr(iter_t iterations, void *cookie); void rdwr(iter_t iterations, void *cookie); void mcp(iter_t iterations, void *cookie); void fwr(iter_t iterations, void *cookie); void frd(iter_t iterations, void *cookie); void fcp(iter_t iterations, void *cookie); void loop_bzero(iter_t iterations, void *cookie); void loop_bcopy(iter_t iterations, void *cookie); void init_overhead(iter_t iterations, void *cookie); void init_loop(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); typedef struct _state { double overhead; size_t nbytes; int need_buf2; int aligned; TYPE *buf; TYPE *buf2; TYPE *buf2_orig; TYPE *lastone; size_t N; } state_t; void adjusted_bandwidth(uint64 t, uint64 b, uint64 iter, double ovrhd); int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; size_t nbytes; state_t state; int c; char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> what [conflict]\nwhat: rd wr rdwr cp fwr frd fcp bzero bcopy\n<size> must be larger than 512"; state.overhead = 0; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } /* should have two, possibly three [indicates align] arguments left */ state.aligned = state.need_buf2 = 0; if (optind + 3 == ac) { state.aligned = 1; } else if (optind + 2 != ac) { lmbench_usage(ac, av, usage); } nbytes = state.nbytes = bytes(av[optind]); if (state.nbytes < 512) { /* this is the number of bytes in the loop */ lmbench_usage(ac, av, usage); } if (streq(av[optind+1], "cp") || streq(av[optind+1], "fcp") || streq(av[optind+1], "bcopy")) { state.need_buf2 = 1; } if (streq(av[optind+1], "rd")) { benchmp(init_loop, rd, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "wr")) { benchmp(init_loop, wr, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "rdwr")) { benchmp(init_loop, rdwr, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "cp")) { benchmp(init_loop, mcp, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "frd")) { benchmp(init_loop, frd, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "fwr")) { benchmp(init_loop, fwr, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "fcp")) { benchmp(init_loop, fcp, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "bzero")) { benchmp(init_loop, loop_bzero, cleanup, 0, parallel, warmup, repetitions, &state); } else if (streq(av[optind+1], "bcopy")) { benchmp(init_loop, loop_bcopy, cleanup, 0, parallel, warmup, repetitions, &state); } else { lmbench_usage(ac, av, usage); } adjusted_bandwidth(gettime(), nbytes, get_n() * parallel, state.overhead); return(0); } void init_overhead(iter_t iterations, void *cookie) { } void init_loop(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->buf = (TYPE *)valloc(state->nbytes); state->buf2_orig = NULL; state->lastone = (TYPE*)state->buf - 1; state->lastone = (TYPE*)((char *)state->buf + state->nbytes - 512); state->N = state->nbytes; if (!state->buf) { perror("malloc"); exit(1); } bzero((void*)state->buf, state->nbytes); if (state->need_buf2 == 1) { 
state->buf2_orig = state->buf2 = (TYPE *)valloc(state->nbytes + 2048); if (!state->buf2) { perror("malloc"); exit(1); } /* default is to have stuff unaligned wrt each other */ /* XXX - this is not well tested or thought out */ if (state->aligned) { char *tmp = (char *)state->buf2; tmp += 2048 - 128; state->buf2 = (TYPE *)tmp; } } } void cleanup(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; if (iterations) return; free(state->buf); if (state->buf2_orig) free(state->buf2_orig); } void rd(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; register int sum = 0; while (iterations-- > 0) { register TYPE *p = state->buf; while (p <= lastone) { sum += #define DOIT(i) p[i]+ DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) p[124]; p += 128; } } use_int(sum); } #undef DOIT void wr(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; while (iterations-- > 0) { register TYPE *p = state->buf; while (p <= lastone) { #define DOIT(i) p[i] = 1; DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); p += 128; } } } #undef DOIT void rdwr(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; register int sum = 0; while (iterations-- > 0) { register TYPE *p = state->buf; while (p <= lastone) { #define DOIT(i) sum += p[i]; p[i] = 1; DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); p += 128; } } use_int(sum); } #undef DOIT void mcp(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; TYPE* p_save = NULL; while (iterations-- > 0) { register TYPE *p = state->buf; register TYPE *dst = state->buf2; while (p <= lastone) { #define DOIT(i) dst[i] = p[i]; DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24) DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52) DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76) DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100) DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124); p += 128; dst += 128; } p_save = p; } use_pointer(p_save); } #undef DOIT void fwr(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; TYPE* p_save = NULL; while (iterations-- > 0) { register TYPE *p = state->buf; while (p <= lastone) { #define DOIT(i) p[i]= DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) DOIT(43) DOIT(44) DOIT(45) 
DOIT(46) DOIT(47) DOIT(48) DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) 1; p += 128; } p_save = p; } use_pointer(p_save); } #undef DOIT void frd(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register int sum = 0; register TYPE *lastone = state->lastone; while (iterations-- > 0) { register TYPE *p = state->buf; while (p <= lastone) { sum += #define DOIT(i) p[i]+ DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) DOIT(108) DOIT(109) DOIT(110) DOIT(111) DOIT(112) DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) DOIT(123) DOIT(124) DOIT(125) DOIT(126) p[127]; p += 128; } } use_int(sum); } #undef DOIT void fcp(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *lastone = state->lastone; while (iterations-- > 0) { register TYPE *p = state->buf; register TYPE *dst = state->buf2; while (p <= lastone) { #define DOIT(i) dst[i]=p[i]; DOIT(0) DOIT(1) DOIT(2) DOIT(3) DOIT(4) DOIT(5) DOIT(6) DOIT(7) DOIT(8) DOIT(9) DOIT(10) DOIT(11) DOIT(12) DOIT(13) DOIT(14) DOIT(15) DOIT(16) DOIT(17) DOIT(18) DOIT(19) DOIT(20) DOIT(21) DOIT(22) DOIT(23) DOIT(24) DOIT(25) DOIT(26) DOIT(27) DOIT(28) DOIT(29) DOIT(30) DOIT(31) DOIT(32) DOIT(33) DOIT(34) DOIT(35) DOIT(36) DOIT(37) DOIT(38) DOIT(39) DOIT(40) DOIT(41) DOIT(42) DOIT(43) DOIT(44) DOIT(45) DOIT(46) DOIT(47) DOIT(48) DOIT(49) DOIT(50) DOIT(51) DOIT(52) DOIT(53) DOIT(54) DOIT(55) DOIT(56) DOIT(57) DOIT(58) DOIT(59) DOIT(60) DOIT(61) DOIT(62) DOIT(63) DOIT(64) DOIT(65) DOIT(66) DOIT(67) DOIT(68) DOIT(69) DOIT(70) DOIT(71) DOIT(72) DOIT(73) DOIT(74) DOIT(75) DOIT(76) DOIT(77) DOIT(78) DOIT(79) DOIT(80) DOIT(81) DOIT(82) DOIT(83) DOIT(84) DOIT(85) DOIT(86) DOIT(87) DOIT(88) DOIT(89) DOIT(90) DOIT(91) DOIT(92) DOIT(93) DOIT(94) DOIT(95) DOIT(96) DOIT(97) DOIT(98) DOIT(99) DOIT(100) DOIT(101) DOIT(102) DOIT(103) DOIT(104) DOIT(105) DOIT(106) DOIT(107) DOIT(108) DOIT(109) 
DOIT(110) DOIT(111) DOIT(112) DOIT(113) DOIT(114) DOIT(115) DOIT(116) DOIT(117) DOIT(118) DOIT(119) DOIT(120) DOIT(121) DOIT(122) DOIT(123) DOIT(124) DOIT(125) DOIT(126) DOIT(127) p += 128; dst += 128; } } } void loop_bzero(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *p = state->buf; register size_t N = state->N; while (iterations-- > 0) { bzero(p, N); } } void loop_bcopy(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register TYPE *p = state->buf; register TYPE *dst = state->buf2; register size_t N = state->N; while (iterations-- > 0) { bcopy(p,dst,N); } } /* * Almost like bandwidth() in lib_timing.c, but we need to adjust * bandwidth based upon loop overhead. */ void adjusted_bandwidth(uint64 time, uint64 bytes, uint64 iter, double overhd) { #define MB (1000. * 1000.) extern FILE *ftiming; double secs = ((double)time / (double)iter - overhd) / 1000000.0; double mb; mb = bytes / MB; if (secs <= 0.) return; if (!ftiming) ftiming = stderr; if (mb < 1.) { (void) fprintf(ftiming, "%.6f ", mb); } else { (void) fprintf(ftiming, "%.2f ", mb); } if (mb / secs < 1.) { (void) fprintf(ftiming, "%.6f\n", mb/secs); } else { (void) fprintf(ftiming, "%.2f\n", mb/secs); } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_mmap_rd.c���������������������������������������������������������������������0000664�0000764�0000764�00000010462�10620624535�016235� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_mmap_rd.c - time reading & summing of a file using mmap * * Usage: bw_mmap_rd [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file * * Sizes less than 2m are not recommended. Memory is read by summing it up * so the numbers include the cost of the adds. If you use sizes large * enough, you can compare to bw_mem_rd and get the cost of TLB fills * (very roughly). * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" #ifdef MAP_FILE # define MMAP_FLAGS MAP_FILE|MAP_SHARED #else # define MMAP_FLAGS MAP_SHARED #endif #define TYPE int #define MINSZ (sizeof(TYPE) * 128) #define CHK(x) if ((long)(x) == -1) { perror("x"); exit(1); } typedef struct _state { size_t nbytes; char filename[256]; int fd; int clone; void *buf; } state_t; void time_no_open(iter_t iterations, void * cookie); void time_with_open(iter_t iterations, void * cookie); void initialize(iter_t iterations, void *cookie); void init_open(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); int main(int ac, char **av) { struct stat sbuf; int parallel = 1; int warmup = 0; int repetitions = -1; size_t nbytes; state_t state; int c; char *usage = "[-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] <size> open2close|mmap_only <filename>"; state.clone = 0; while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'C': state.clone = 1; break; default: lmbench_usage(ac, av, usage); break; } } /* should have three arguments left (bytes type filename) */ if (optind + 3 != ac) { lmbench_usage(ac, av, usage); } nbytes = state.nbytes = bytes(av[optind]); strcpy(state.filename,av[optind+2]); CHK(stat(state.filename, &sbuf)); if ((S_ISREG(sbuf.st_mode) && nbytes > sbuf.st_size) || (nbytes < MINSZ)) { fprintf(stderr,"<size> out of range!\n"); exit(1); } if (!strcmp("open2close", av[optind+1])) { benchmp(initialize, time_with_open, cleanup, 0, parallel, warmup, repetitions, &state); } else if (!strcmp("mmap_only", av[optind+1])) { benchmp(init_open, time_no_open, cleanup, 0, parallel, warmup, repetitions, &state); } else { lmbench_usage(ac, av, usage); } bandwidth(nbytes, get_n() * parallel, 0); return (0); } void initialize(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->fd = -1; state->buf = NULL; if (state->clone) { char buf[8192]; char* s; /* copy original file into a process-specific one */ sprintf(buf, "%d", (int)getpid()); s = (char*)malloc(strlen(state->filename) + strlen(buf) + 1); sprintf(s, "%s%d", state->filename, (int)getpid()); if (cp(state->filename, s, S_IREAD|S_IWRITE) < 0) { perror("creating private tempfile"); unlink(s); exit(1); } strcpy(state->filename, s); } } void init_open(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; if (iterations) return; initialize(0, cookie); CHK(state->fd = open(state->filename, 0)); CHK(state->buf = mmap(0, state->nbytes, PROT_READ, MMAP_FLAGS, state->fd, 0)); } void cleanup(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; if (iterations) return; if (state->buf) munmap(state->buf, state->nbytes); if (state->fd >= 0) close(state->fd); if (state->clone) unlink(state->filename); } void time_no_open(iter_t iterations, void * cookie) { state_t *state = (state_t *) cookie; while (iterations-- > 0) { bread(state->buf, state->nbytes); } } void time_with_open(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; char *filename = state->filename; size_t nbytes = state->nbytes; int fd; void *p; while (iterations-- > 0) { CHK(fd = open(filename, 0)); CHK(p = mmap(0, nbytes, PROT_READ, MMAP_FLAGS, fd, 0)); bread(p, nbytes); close(fd); munmap(p, nbytes); } } 
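/*
 * The bw_* and lat_* programs above all share the benchmp() harness
 * declared in bench.h.  The following is a minimal sketch of that
 * pattern only: the getppid() example and the empty state struct are
 * made up here; benchmp(), get_n(), gettime(), and iter_t come from
 * bench.h and the timing library.
 */
#include "bench.h"

typedef struct _state {
        int     dummy;          /* per-child benchmark state would go here */
} state_t;

void
initialize(iter_t iterations, void *cookie)
{
        state_t *state = (state_t *) cookie;

        /* setup work only on the call with iterations == 0, as above */
        if (iterations) return;
        state->dummy = 0;
}

void
benchmark(iter_t iterations, void *cookie)
{
        /* the timed loop must perform exactly `iterations' operations */
        while (iterations-- > 0) {
                getppid();
        }
}

void
cleanup(iter_t iterations, void *cookie)
{
        if (iterations) return;
        /* release anything acquired in initialize() */
}

int
main(int ac, char **av)
{
        state_t state;
        int     parallel = 1;
        int     warmup = 0;
        int     repetitions = -1;

        benchmp(initialize, benchmark, cleanup, 0,
            parallel, warmup, repetitions, &state);
        if (gettime() > 0) {
                fprintf(stderr, "getppid: %.4f microseconds\n",
                    gettime() / (double)get_n());
        }
        return (0);
}

/*
 * The "if (iterations) return;" guards mirror the convention visible in
 * the benchmarks above: initialize() and cleanup() are also invoked
 * around timing runs, and only the setup/teardown calls (iterations == 0)
 * should do their work.
 */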
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_pipe.c������������������������������������������������������������������������0000664�0000764�0000764�00000007511�10715547567�015572� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_pipe.c - pipe bandwidth benchmark. * * Usage: bw_pipe [-m <message size>] [-M <total bytes>] \ * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 1994 Larry McVoy. * Copyright (c) 2002 Carl Staelin. * Distributed under the FSF GPL with additional restriction that results * may published only if: * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" void reader(iter_t iterations, void* cookie); void writer(int writefd, char* buf, size_t xfer); int XFER = 10*1024*1024; struct _state { int pid; size_t xfer; /* bytes to read/write per "packet" */ size_t bytes; /* bytes to read/write in one iteration */ char *buf; /* buffer memory space */ int readfd; }; void initialize(iter_t iterations, void *cookie) { int pipes[2]; struct _state* state = (struct _state*)cookie; if (iterations) return; if (pipe(pipes) == -1) { perror("pipe"); exit(1); } handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case 0: close(pipes[0]); handle_scheduler(benchmp_childid(), 1, 1); state->buf = valloc(state->xfer); if (state->buf == NULL) { perror("child: no memory"); exit(2); } touch(state->buf, state->xfer); writer(pipes[1], state->buf, state->xfer); return; /*NOTREACHED*/ case -1: perror("fork"); exit(3); /*NOTREACHED*/ default: break; } close(pipes[1]); state->readfd = pipes[0]; state->buf = valloc(state->xfer + getpagesize()); if (state->buf == NULL) { perror("parent: no memory"); exit(4); } touch(state->buf, state->xfer + getpagesize()); state->buf += 128; /* destroy page alignment */ } void cleanup(iter_t iterations, void * cookie) { struct _state* state = (struct _state*)cookie; if (iterations) return; close(state->readfd); if (state->pid > 0) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); } state->pid = 0; } void reader(iter_t iterations, void * cookie) { size_t done; ssize_t n; struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { for (done = 0; done < state->bytes; done += n) { if ((n = read(state->readfd, state->buf, state->xfer)) < 0) { perror("bw_pipe: reader: error in read"); exit(1); } } } } void writer(int writefd, char* buf, size_t xfer) { size_t done; ssize_t n; for ( ;; ) { #ifdef TOUCH touch(buf, xfer); #endif for (done = 0; done < xfer; done += n) { if ((n = write(writefd, buf, xfer - done)) < 0) { exit(0); } } } } int main(int ac, char *av[]) { struct _state state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; state.xfer = XFERSIZE; /* per-packet size */ state.bytes = XFER; /* total bytes per call */ while (( c = 
getopt(ac, av, "m:M:P:W:N:")) != EOF) { switch(c) { case 'm': state.xfer = bytes(optarg); break; case 'M': state.bytes = bytes(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } /* round up total byte count to a multiple of xfer */ if (state.bytes < state.xfer) { state.bytes = state.xfer; } else if (state.bytes % state.xfer) { state.bytes += state.bytes - state.bytes % state.xfer; } benchmp(initialize, reader, cleanup, MEDIUM, parallel, warmup, repetitions, &state); if (gettime() > 0) { fprintf(stderr, "Pipe bandwidth: "); mb(get_n() * parallel * state.bytes); } return(0); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_tcp.c�������������������������������������������������������������������������0000664�0000764�0000764�00000012210�10715530610�015370� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_tcp.c - simple TCP bandwidth test * * Three programs in one - * server usage: bw_tcp -s * client usage: bw_tcp [-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname * shutdown: bw_tcp -hostname * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" typedef struct _state { int sock; uint64 move; size_t msize; char *server; int fd; char *buf; } state_t; void server_main(); void client_main(int parallel, state_t *state); void source(int data); void initialize(iter_t iterations, void* cookie); void loop_transfer(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void* cookie); int main(int ac, char **av) { int parallel = 1; int warmup = LONGER; int repetitions = -1; int shutdown = 0; state_t state; char *usage = "-s\n OR [-m <message size>] [-M <bytes to move>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S serverhost\n"; int c; state.msize = 0; state.move = 0; /* Rest is client argument processing */ while (( c = getopt(ac, av, "sS:m:M:P:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); break; case 'S': /* shutdown serverhost */ { int conn; conn = tcp_connect(optarg, TCP_DATA, SOCKOPT_NONE); write(conn, "0", 1); exit(0); } case 'm': state.msize = bytes(optarg); break; case 'M': state.move = bytes(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac - 2 || optind >= ac) { lmbench_usage(ac, av, usage); } state.server = av[optind++]; if (state.msize == 0 && state.move == 0) { state.msize = state.move = XFERSIZE; } else if (state.msize == 0) { state.msize = state.move; } else if (state.move == 0) { state.move = state.msize; } /* make the number of bytes to move a multiple of the message size */ if (state.move % state.msize) { state.move += state.msize - state.move % state.msize; } /* * Default is to warmup the connection for seven seconds, * then measure performance over each timing interval. * This minimizes the effect of opening and initializing TCP * connections. */ benchmp(initialize, loop_transfer, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { fprintf(stderr, "%.6f ", state.msize / (1000. 
* 1000.)); mb(state.move * get_n() * parallel); } return(0); } void initialize(iter_t iterations, void *cookie) { char buf[100]; state_t *state = (state_t *) cookie; if (iterations) return; state->buf = valloc(state->msize); if (!state->buf) { perror("valloc"); exit(1); } touch(state->buf, state->msize); state->sock = tcp_connect(state->server, TCP_DATA, SOCKOPT_READ|SOCKOPT_WRITE|SOCKOPT_REUSE); if (state->sock < 0) { perror("socket connection"); exit(1); } sprintf(buf, "%lu", (unsigned long)state->msize); if (write(state->sock, buf, strlen(buf) + 1) != strlen(buf) + 1) { perror("control write"); exit(1); } } void loop_transfer(iter_t iterations, void *cookie) { int c; uint64 todo; state_t *state = (state_t *) cookie; while (iterations-- > 0) { for (todo = state->move; todo > 0; todo -= c) { if ((c = read(state->sock, state->buf, state->msize)) <= 0) { exit(1); } if (c > todo) c = todo; } } } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; /* close connection */ (void)close(state->sock); } void server_main() { int data, newdata; GO_AWAY; data = tcp_server(TCP_DATA, SOCKOPT_WRITE|SOCKOPT_REUSE); if (data < 0) { perror("server socket creation"); exit(1); } signal(SIGCHLD, sigchld_wait_handler); for ( ;; ) { newdata = tcp_accept(data, SOCKOPT_WRITE); switch (fork()) { case -1: perror("fork"); break; case 0: source(newdata); exit(0); default: close(newdata); break; } } } /* * Read the message size. Keep transferring * data in message-size sized packets until * the socket goes away. */ void source(int data) { size_t m; unsigned long nbytes; char *buf, scratch[100]; /* * read the message size */ bzero(scratch, 100); if (read(data, scratch, 100) <= 0) { perror("control nbytes"); exit(7); } sscanf(scratch, "%lu", &nbytes); m = nbytes; /* * A hack to allow turning off the absorb daemon. */ if (m == 0) { tcp_done(TCP_DATA); kill(getppid(), SIGTERM); exit(0); } buf = valloc(m); if (!buf) { perror("valloc"); exit(1); } bzero(buf, m); /* * Keep sending messages until the connection is closed */ while (write(data, buf, m) == m) { #ifdef TOUCH touch(buf, m); #endif } free(buf); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_udp.c�������������������������������������������������������������������������0000664�0000764�0000764�00000010621�10620624536�015404� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_udp.c - simple UDP bandwidth test * * Three programs in one - * server usage: bw_tcp -s * client usage: bw_tcp [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname [bytes] * shutdown: bw_tcp -S hostname * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. 
Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #define MAX_MSIZE (10 * 1024 * 1024) typedef struct _state { int sock; int seq; long move; long msize; char *server; int fd; char *buf; } state_t; void server_main(); void client_main(int parallel, state_t *state); void init(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void loop_transfer(iter_t iterations, void *cookie); int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int server = 0; state_t state; char *usage = "-s\n OR [-m <message size>] [-W <warmup>] [-N <repetitions>] server [size]\n OR -S serverhost\n"; int c; uint64 usecs; state.msize = 0; state.move = 10*1024*1024; /* Rest is client argument processing */ while (( c = getopt(ac, av, "sS:m:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); case 'S': /* shutdown serverhost */ { int seq, n; int sock = udp_connect(optarg, UDP_XACT, SOCKOPT_NONE); for (n = -1; n > -5; --n) { seq = htonl(n); (void) send(sock, &seq, sizeof(int), 0); } close(sock); exit (0); } case 'm': state.msize = atoi(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac - 2 || optind >= ac) { lmbench_usage(ac, av, usage); } state.server = av[optind++]; if (optind < ac) { state.move = bytes(av[optind]); } if (state.msize == 0) { state.msize = state.move; } /* make the number of bytes to move a multiple of the message size */ if (state.move % state.msize) { state.move += state.move - state.move % state.msize; } state.buf = valloc(state.msize); if (!state.buf) { perror("valloc"); exit(1); } touch(state.buf, state.msize); /* * Make one run take at least 5 seconds. * This minimizes the effect of connect & reopening TCP windows. 
*/ benchmp(init, loop_transfer, cleanup, LONGER, parallel, warmup, repetitions, &state ); out: (void)fprintf(stderr, "socket UDP bandwidth using %s: ", state.server); mb(state.move * get_n() * parallel); } void init(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); state->seq = 0; state->buf = (char*)malloc(state->msize); } void loop_transfer(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; char *server = state->server; int sock = state->sock; long control[2], nbytes; nbytes = state->move; control[0] = state->move; control[1] = state->msize; while (iterations-- > 0) { if (send(sock, control, 2 * sizeof(long), 0) != 2 * sizeof(long)) { perror("bw_udp client: send failed"); exit(5); } while (nbytes > 0) { if (recv(sock, state->buf, state->msize, 0) != state->msize) { perror("bw_udp client: recv failed"); exit(5); } nbytes -= state->msize; } } } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; close(state->sock); free(state->buf); } void server_main() { char *buf = (char*)valloc(MAX_MSIZE); int sock, namelen, seq = 0; long nbytes, msize; struct sockaddr_in it; GO_AWAY; sock = udp_server(UDP_XACT, SOCKOPT_NONE); while (1) { namelen = sizeof(it); if (recvfrom(sock, (void*)buf, 2 * sizeof(long), 0, (struct sockaddr*)&it, &namelen) < 0) { fprintf(stderr, "bw_udp server: recvfrom: got wrong size\n"); exit(9); } nbytes = ntohl(*(long*)buf); msize = ntohl(*((long*)buf + 1)); while (nbytes > 0) { if (sendto(sock, (void*)buf, msize, 0, (struct sockaddr*)&it, sizeof(it)) < 0) { perror("bw_udp sendto"); exit(9); } nbytes -= msize; } } } ���������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/bw_unix.c������������������������������������������������������������������������0000664�0000764�0000764�00000010122�10620624536�015573� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * bw_unix.c - simple Unix stream socket bandwidth test * * Usage: bw_unix [-m <message size>] [-M <total bytes>] \ * [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 1994 Larry McVoy. * Copyright (c) 2002 Carl Staelin. * Distributed under the FSF GPL with additional restriction that results * may published only if: * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" void reader(iter_t iterations, void * cookie); void writer(int controlfd, int writefd, char* buf, void* cookie); size_t XFER = 10*1024*1024; struct _state { int pid; size_t xfer; /* bytes to read/write per "packet" */ size_t bytes; /* bytes to read/write in one iteration */ char *buf; /* buffer memory space */ int pipes[2]; int control[2]; int initerr; }; void initialize(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; if (iterations) return; state->buf = valloc(XFERSIZE); touch(state->buf, XFERSIZE); state->initerr = 0; if (socketpair(AF_UNIX, SOCK_STREAM, 0, state->pipes) == -1) { perror("socketpair"); state->initerr = 1; return; } if (pipe(state->control) == -1) { perror("pipe"); state->initerr = 2; return; } handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case 0: handle_scheduler(benchmp_childid(), 1, 1); close(state->control[1]); close(state->pipes[0]); writer(state->control[0], state->pipes[1], state->buf, state); return; /*NOTREACHED*/ case -1: perror("fork"); state->initerr = 3; return; /*NOTREACHED*/ default: break; } close(state->control[0]); close(state->pipes[1]); } void cleanup(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; if (iterations) return; close(state->control[1]); close(state->pipes[0]); if (state->pid > 0) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); } state->pid = 0; } void reader(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; size_t done, n; size_t todo = state->bytes; while (iterations-- > 0) { write(state->control[1], &todo, sizeof(todo)); for (done = 0; done < todo; done += n) { if ((n = read(state->pipes[0], state->buf, state->xfer)) <= 0) { /* error! */ exit(1); } } } } void writer(int controlfd, int writefd, char* buf, void* cookie) { size_t todo, done; ssize_t n; struct _state* state = (struct _state*)cookie; for ( ;; ) { read(controlfd, &todo, sizeof(todo)); for (done = 0; done < todo; done += n) { #ifdef TOUCH touch(buf, XFERSIZE); #endif if ((n = write(writefd, buf, state->xfer)) < 0) { /* error! 
*/ exit(1); } } } } int main(int argc, char *argv[]) { struct _state state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-m <message size>] [-M <total bytes>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; state.xfer = XFERSIZE; /* per-packet size */ state.bytes = XFER; /* total bytes per call */ while (( c = getopt(argc,argv,"m:M:P:W:N:")) != EOF) { switch(c) { case 'm': state.xfer = bytes(optarg); break; case 'M': state.bytes = bytes(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(argc, argv, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(argc, argv, usage); break; } } if (optind == argc - 1) { state.bytes = bytes(argv[optind]); } else if (optind < argc - 1) { lmbench_usage(argc, argv, usage); } state.pid = 0; /* round up total byte count to a multiple of xfer */ if (state.bytes % state.xfer) { state.bytes += state.bytes - state.bytes % state.xfer; } benchmp(initialize, reader, cleanup, MEDIUM, parallel, warmup, repetitions, &state); if (gettime() > 0) { fprintf(stderr, "AF_UNIX sock stream bandwidth: "); mb(get_n() * parallel * XFER); } return(0); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/cache.c��������������������������������������������������������������������������0000664�0000764�0000764�00000046576�10716124212�015201� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * cache.c - guess the cache size(s) * * usage: cache [-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" struct cache_results { size_t len; size_t maxlen; size_t line; double latency; double variation; double ratio; double slope; }; int find_cache(int start, int n, double prev_lat, struct cache_results* p); int collect_data(size_t start, size_t line, size_t maxlen, int repetitions, struct cache_results** pdata); void search(int left, int right, int repetitions, struct mem_state* state, struct cache_results* p); int collect_sample(int repetitions, struct mem_state* state, struct cache_results* p); double measure(size_t size, int repetitions, double* variation, struct mem_state* state); double remove_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, int repetitions, struct mem_state* state); int test_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, double *baseline, double chunk_baseline, int repetitions, struct mem_state* state); int fixup_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, double *baseline, double chunk_baseline, int repetitions, struct mem_state* state); void check_memory(size_t size, struct mem_state* state); void pagesort(size_t n, size_t* pages, double* latencies); #ifdef ABS #undef ABS #endif #define ABS(a) ((a) < 0 ? -(a) : (a)) #define SWAP(a,b) {size_t _tmp = (a); (a) = (b); (b) = _tmp;} #define THRESHOLD 1.5 #define FIVE(m) m m m m m #define TEN(m) FIVE(m) FIVE(m) #define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) #define HUNDRED(m) FIFTY(m) FIFTY(m) #define DEREF p = (char**)*p; static char **addr_save = NULL; void mem_benchmark(iter_t iterations, void *cookie) { register char **p; struct mem_state* state = (struct mem_state*)cookie; p = addr_save ? addr_save : (char**)state->p[0]; while (iterations-- > 0) { HUNDRED(DEREF); } addr_save = p; } /* * Assumptions: * * 1) Cache lines are a multiple of pointer-size words * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) * 3) Pages are an even multiple of cache lines */ int main(int ac, char **av) { int c; int i, j, n, start, level, prev, min; int warmup = 0; int repetitions = (1000000 <= get_enough(0) ? 1 : TRIES); ssize_t line = 0; size_t maxlen = 32 * 1024 * 1024; int *levels; double par, maxpar, prev_lat; char *usage = "[-c] [-L <line size>] [-M len[K|M]] [-W <warmup>] [-N <repetitions>]\n"; struct cache_results* r; struct mem_state state; while (( c = getopt(ac, av, "L:M:W:N:")) != EOF) { switch(c) { case 'L': line = atoi(optarg); if (line < sizeof(char*)) line = sizeof(char*); break; case 'M': maxlen = bytes(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } sched_pin(0); state.width = 1; state.len = maxlen; state.maxlen = maxlen; state.pagesize = getpagesize(); if (line == 0) { line = line_find(maxlen, warmup, repetitions, &state); if (line == 0) line = getpagesize() / 16; state.line = line; } n = collect_data((size_t)512, line, maxlen, repetitions, &r); r[n-1].line = line; levels = (int*)malloc(n * sizeof(int)); if (!levels) { perror("malloc"); exit(1); } bzero(levels, n * sizeof(int)); for (start = 0, prev = 0, level = 0, prev_lat = -1.0; (i = find_cache(start, n, prev_lat, r)) >= 0; ++level, start = i + 1, prev = i) { /* * performance is not greatly improved over main memory, * so it is likely not a cache boundary */ if (r[i].latency / r[n-1].latency > 0.5) break; /* * is cache boundary "legal"? (e.g. 
2^N or 1.5*2^N) * cache sizes are "never" 1.25*2^N or 1.75*2^N */ for (c = r[i].len; c > 0x7; c >>= 1) ; if (c == 5 || c == 7) { i++; if (i >= n) break; } levels[level] = i; prev_lat = (r[start].latency > 0.0 ? r[start].latency : r[start - 1].latency); } for (i = 0; i < level; ++i) { prev = (i > 0 ? levels[i-1]: -1); /* locate most likely cache latency */ for (j = min = prev + 1; j < levels[i]; ++j) { if (r[j].latency <= 0.) continue; if (r[min].latency <= 0. || ABS(r[j].slope) < ABS(r[min].slope)) { min = j; } } /* Compute line size */ if (i == level - 1) { line = r[n-1].line; } else { j = (levels[i] + levels[i+1]) / 2; for (line = -1; line <= 0 && j < n; ++j) { r[j].line = line_find(r[j].len, warmup, repetitions, &state); line = r[j].line; } } /* Compute memory parallelism for cache */ maxpar = par_mem(r[levels[i]-1].len, warmup, repetitions, &state); fprintf(stderr, "L%d cache: %lu bytes %.2f nanoseconds %ld linesize %.2f parallelism\n", (int)(i+1), (unsigned long)r[levels[i]].len, r[min].latency, (long)line, maxpar); } /* Compute memory parallelism for main memory */ j = n - 1; for (i = n - 1; i >= 0; i--) { if (r[i].latency < 0.) continue; if (r[i].latency > 0.99 * r[n-1].latency) j = i; } par = par_mem(r[j].len, warmup, repetitions, &state); fprintf(stderr, "Memory latency: %.2f nanoseconds %.2f parallelism\n", r[n-1].latency, par); exit(0); } int find_cache(int start, int n, double prev_lat, struct cache_results* p) { int i, j, prev; double max = -1.; for (prev = (start == 0 ? start : start - 1); prev > 0; prev--) { if (p[prev].ratio > 0.0) break; } for (i = start, j = -1; i < n; ++i) { if (p[i].latency < 0.) continue; if (max < p[i].ratio) max = p[i].ratio; if (THRESHOLD < p[i].ratio) j = i; if (THRESHOLD < max && p[j].len * 2 <= p[i].len) return j; prev = i; } return -1; } int collect_data(size_t start, size_t line, size_t maxlen, int repetitions, struct cache_results** pdata) { int i; int samples; int idx; size_t len = start; size_t incr = start / 4; struct mem_state state; struct cache_results* p; state.width = 1; state.len = maxlen; state.maxlen = maxlen; state.line = line; state.pagesize = getpagesize(); state.addr = NULL; /* count the (maximum) number of samples to take */ for (len = start, incr = start / 4, samples = 0; len <= maxlen; incr<<=1) { for (i = 0; i < 4 && len <= maxlen; ++i, len += incr) samples++; } *pdata = (struct cache_results*) malloc(samples * sizeof(struct cache_results)); if (!*pdata) { perror("malloc"); exit(2); } p = *pdata; /* initialize the data */ for (len = start, incr = start / 4, idx = 0; len <= maxlen; incr<<=1) { for (i = 0; i < 4 && len <= maxlen; ++i, ++idx, len += incr) { p[idx].len = len; p[idx].line = line; p[idx].latency = -1.; p[idx].ratio = -1.; p[idx].slope = -1.; } } /* make sure we have enough memory for the scratch data */ while (state.addr == NULL) { mem_initialize(0, &state); if (state.addr == NULL) { maxlen /= 2; state.len = state.maxlen = maxlen; while (p[samples-1].len > maxlen) samples--; } } for (i = 0; i < samples; ++i) p[i].maxlen = maxlen; /* in case the system has laid out the pages well, don't scramble */ for (i = 0; i < state.npages; ++i) state.pages[i] = i * state.pagesize; p[samples-1].latency = measure(p[samples-1].len, repetitions, &p[samples-1].variation, &state); while (p[samples-1].latency <= 0.0) { p[samples-1].latency = measure(p[samples-1].len, repetitions, &p[samples-1].variation, &state); --samples; } p[0].latency = measure(p[0].len, repetitions, &p[0].variation, &state); search(0, samples - 1, repetitions, 
&state, p); /* fprintf(stderr, "%10.10s %8.8s %8.8s %8.8s %8.8s %5.5s\n", "mem size", "latency", "variation", "ratio", "slope", "line"); for (idx = 0; idx < samples; ++idx) { if (p[idx].latency < 0.) continue; fprintf(stderr, "%10.6f %8.3f %8.3f %8.3f %8.3f %4lu\n", p[idx].len / (1000. * 1000.), p[idx].latency, p[idx].variation, p[idx].ratio, p[idx].slope, (unsigned long)p[idx].line); } /**/ mem_cleanup(0, &state); return samples; } void search(int left, int right, int repetitions, struct mem_state* state, struct cache_results* p) { int middle = left + (right - left) / 2; /* fprintf(stderr, "search(%d, %d, ...): [%lu/%G, %lu, %lu/%G] entering\n", left, right, (unsigned long)p[left].len, p[left].latency, (unsigned long)p[middle].len, (unsigned long)p[right].len, p[right].latency); /**/ if (p[left].latency > 0.0) { p[left].ratio = p[right].latency / p[left].latency; p[left].slope = (p[left].ratio - 1.) / (double)(right - left); /* we probably have a bad data point, so ignore it */ if (p[left].ratio < 0.98) { p[left].latency = p[right].latency; p[left].ratio = 1.; p[left].slope = 0.; } } if (middle == left || middle == right) return; if (p[left].ratio > 1.35 || p[left].ratio < 0.97) { collect_sample(repetitions, state, &p[middle]); search(middle, right, repetitions, state, p); search(left, middle, repetitions, state, p); } return; } int collect_sample(int repetitions, struct mem_state* state, struct cache_results* p) { int i, modified, npages; double baseline; npages = (p->len + getpagesize() - 1) / getpagesize(); baseline = measure(p->len, repetitions, &p->variation, state); if (npages > 1) { for (i = 0, modified = 1; i < 8 && modified; ++i) { modified = test_chunk(0, npages, npages, state->pages, p->len, &baseline, 0.0, repetitions, state); } } p->latency = baseline; /* fprintf(stderr, "collect_sample: len=%lu, latency=%G\n", (unsigned long)p->len, p->latency); /**/ return (p->latency > 0); } double measure(size_t size, int repetitions, double* variation, struct mem_state* state) { size_t i, j, npages, nlines; double time, median; char *p; result_t *r, *r_save; size_t *pages; pages = state->pages; npages = (size + getpagesize() - 1) / getpagesize(); nlines = state->nlines; if (size % getpagesize()) nlines = (size % getpagesize()) / state->line; r_save = get_results(); r = (result_t*)malloc(sizeof_result(repetitions)); if (!r) { perror("malloc"); exit(3); } insertinit(r); /* * assumes that you have used mem_initialize() to setup the memory */ p = state->base; for (i = 0; i < npages - 1; ++i) { for (j = 0; j < state->nwords; ++j) { *(char**)(p + pages[i] + state->lines[state->nlines - 1] + state->words[j]) = p + pages[i+1] + state->lines[0] + state->words[j]; } } for (j = 0; j < state->nwords; ++j) { *(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) = p + pages[0] + state->lines[0] + state->words[(j+1)%state->nwords]; } /* check_memory(size, state); /**/ addr_save = NULL; state->p[0] = p + pages[0] + state->lines[0] + state->words[0]; /* now, run through the chain once to clear the cache */ mem_benchmark((size / sizeof(char*) + 100) / 100, state); for (i = 0; i < repetitions; ++i) { BENCH1(mem_benchmark(__n, state); __n = 1;, 0) insertsort(gettime(), get_n(), r); } set_results(r); median = (1000. * (double)gettime()) / (100. * (double)get_n()); save_minimum(); time = (1000. * (double)gettime()) / (100. * (double)get_n()); /* Are the results stable, or do they vary? */ if (time != 0.) 
*variation = median / time; else *variation = -1.0; set_results(r_save); free(r); if (nlines < state->nlines) { for (j = 0; j < state->nwords; ++j) { *(char**)(p + pages[npages - 1] + state->lines[nlines - 1] + state->words[j]) = p + pages[npages - 1] + state->lines[nlines] + state->words[j]; } } /* fprintf(stderr, "%.6f %.2f\n", size / (1000. * 1000.), median); /**/ return median; } double remove_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, int repetitions, struct mem_state* state) { size_t n, j; double t, var; if (i + chunk < npages) { for (j = 0; j < chunk; ++j) { n = pages[i+j]; pages[i+j] = pages[npages-1-j]; pages[npages-1-j] = n; } } t = measure(len - chunk * getpagesize(), repetitions, &var, state); if (i + chunk < npages) { for (j = 0; j < chunk; ++j) { n = pages[i+j]; pages[i+j] = pages[npages-1-j]; pages[npages-1-j] = n; } } return t; } int test_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, double *baseline, double chunk_baseline, int repetitions, struct mem_state* state) { int modified = 0; int changed; size_t j, k, subchunk; double t, tt, nodiff_chunk_baseline; if (chunk <= 20 && chunk < npages) { return fixup_chunk(i, chunk, npages, pages, len, baseline, chunk_baseline, repetitions, state); } nodiff_chunk_baseline = *baseline; subchunk = (chunk + 19) / 20; for (j = i, k = 0; j < i + chunk; j+=subchunk, k++) { if (j + subchunk > i + chunk) subchunk = i + chunk - j; t = remove_chunk(j, subchunk, npages, pages, len, repetitions, state); /* fprintf(stderr, "test_chunk(...): baseline=%G, t=%G, len=%d, chunk=%d, i=%d\n", *baseline, t, len, subchunk, j); /**/ if (t >= 0.99 * *baseline) continue; if (t >= 0.999 * nodiff_chunk_baseline) continue; tt = remove_chunk(j, subchunk, npages, pages, len, repetitions, state); if (tt > t) t = tt; if (t >= 0.99 * *baseline) continue; if (t >= 0.999 * nodiff_chunk_baseline) continue; changed = test_chunk(j, subchunk, npages, pages, len, baseline, t, repetitions, state); if (changed) { modified = 1; } else { nodiff_chunk_baseline = t; } } return modified; } /* * This routine is called once we have identified a chunk * that has pages that are suspected of colliding with other * pages. * * The algorithm is to remove all the pages, and then * slowly add back pages; attempting to add pages with * minimal cost. 
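 *
 * Roughly: pages whose individual latencies stay near the chunk
 * baseline are kept; the remaining pages are sorted by latency and
 * swapped, one at a time, against spare pages from beyond the active
 * set, keeping a substitution only when it pulls the measured latency
 * back toward the baseline.  If the final measurement shows no
 * improvement, all of the changes are backed out.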
*/ int fixup_chunk(size_t i, size_t chunk, size_t npages, size_t* pages, size_t len, double *baseline, double chunk_baseline, int repetitions, struct mem_state* state) { int swapped = 0; size_t j, k; size_t page, substitute, original; size_t ntotalpages, nsparepages; size_t subset_len; size_t *pageset; size_t *saved_pages; static size_t available_index = 0; double t, var, new_baseline; double latencies[20]; ntotalpages = (state->maxlen + getpagesize() - 1)/ getpagesize(); nsparepages = ntotalpages - npages; pageset = state->pages + npages; new_baseline = *baseline; saved_pages = (size_t*)malloc(sizeof(size_t) * ntotalpages); if (!saved_pages) { perror("malloc"); exit(4); } bcopy(pages, saved_pages, sizeof(int) * ntotalpages); /* move everything to the end of the page list */ if (i + chunk < npages) { for (j = 0; j < chunk; ++j) { page = pages[i+j]; pages[i+j] = pages[npages-chunk+j]; pages[npages-chunk+j] = page; } } if (available_index >= nsparepages) available_index = 0; /* * first try to identify which pages we can definitely keep */ for (j = 0, k = chunk; j < k; ) { t = measure((npages - chunk + j + 1) * getpagesize(), repetitions, &var, state); if (0.995 * t <= chunk_baseline) { latencies[j] = t; ++j; /* keep this page */ } else { --k; /* this page is probably no good */ latencies[k] = t; SWAP(pages[npages - chunk + j], pages[npages - chunk + k]); } } /* * sort the "bad" pages by increasing latency */ pagesort(chunk - j, &pages[npages - chunk + j], &latencies[j]); /* fprintf(stderr, "fixup_chunk: len=%d, chunk=%d, j=%d, baseline=%G, lat[%d]=%G..%G\n", len, chunk, j, *baseline, j, (j < chunk ? latencies[j] : -1.0), latencies[chunk - 1]); /**/ if (chunk >= npages && j < chunk / 2) { j = chunk / 2; t = measure((npages - chunk + j + 1) * getpagesize(), repetitions, &var, state); chunk_baseline = t; } for (k = 0; j < chunk && k < 2 * npages; ++k) { original = npages - chunk + j; substitute = nsparepages - 1; substitute -= (k + available_index) % (nsparepages - 1); subset_len = (original + 1) * getpagesize(); if (j == chunk - 1 && len % getpagesize()) { subset_len = len; } SWAP(pages[original], pageset[substitute]); t = measure(subset_len, repetitions, &var, state); SWAP(pages[original], pageset[substitute]); /* * try to keep pages ordered by increasing latency */ if (t < latencies[chunk - 1]) { latencies[chunk - 1] = t; SWAP(pages[npages - 1], pageset[substitute]); pagesort(chunk - j, &pages[npages - chunk + j], &latencies[j]); } if (0.995 * latencies[j] <= chunk_baseline) { ++j; /* keep this page */ ++swapped; } } available_index = (k + available_index) % (nsparepages - 1); /* measure new baseline, in case we didn't manage to optimally * replace every page */ if (swapped) { new_baseline = measure(len, repetitions, &var, state); /* fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d, baseline=%G, newbase=%G\n", len, swapped, k, *baseline, new_baseline); /**/ if (new_baseline >= 0.999 * *baseline) { /* no benefit to these changes; back them out */ swapped = 0; bcopy(saved_pages, pages, sizeof(int) * ntotalpages); } else { /* we sped up, so keep these changes */ *baseline = new_baseline; /* move back to the middle of the pagelist */ if (i + chunk < npages) { for (j = 0; j < chunk; ++j) { page = pages[i+j]; pages[i+j] = pages[npages-chunk+j]; pages[npages-chunk+j] = page; } } } /* } else { fprintf(stderr, "fixup_chunk: len=%d, swapped=%d, k=%d\n", len, swapped, k); /**/ } free(saved_pages); return swapped; } void check_memory(size_t size, struct mem_state* state) { size_t i, j, first_page, 
npages, nwords; size_t page, word_count, pagesize; off_t offset; char **p, **q; char **start; pagesize = getpagesize(); npages = (size + pagesize - 1) / pagesize; nwords = size / sizeof(char*); /* fprintf(stderr, "check_memory(%d, ...): entering, %d words\n", size, nwords); /**/ word_count = 1; first_page = 0; start = (char**)(state->base + state->pages[0] + state->lines[0] + state->words[0]); for (q = p = (char**)*start; p != start; ) { word_count++; offset = (unsigned long)p - (unsigned long)state->base; page = offset - offset % pagesize; for (j = first_page; j < npages; ++j) { if (page == state->pages[j]) break; } if (j == npages) { for (j = 0; j < first_page; ++j) { if (page == state->pages[j]) break; } if (j == first_page) { fprintf(stderr, "check_memory: bad memory reference for size %lu\n", (unsigned long)size); } } first_page = j % npages; p = (char**)*p; if (word_count & 0x1) q = (char**)*q; if (*p == *q) { fprintf(stderr, "check_memory: unwanted memory cycle! page=%lu\n", (unsigned long)j); return; } } if (word_count != nwords) { fprintf(stderr, "check_memory: wrong word count, expected %lu, got %lu\n", (unsigned long)nwords, (unsigned long)word_count); } /* fprintf(stderr, "check_memory(%lu, ...): exiting\n", (unsigned long)size); /**/ } void pagesort(size_t n, size_t* pages, double* latencies) { int i, j; double t; for (i = 0; i < n - 1; ++i) { for (j = i + 1; j < n; ++j) { if (latencies[i] > latencies[j]) { t = latencies[i]; latencies[i] = latencies[j]; latencies[j] = t; SWAP(pages[i], pages[j]); } } } } ����������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/clock.c��������������������������������������������������������������������������0000664�0000764�0000764�00000001065�07045412511�015214� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * clock.c * * calculate the minimum timing loop length that gives us significant results */ #include "bench.h" char *id = "$Id$"; char *revision = "$Revision$"; main() { uint64 enough; double t_overhead, l_overhead; enough = compute_enough(15); printf("ENOUGH=%lu\n", (unsigned long)enough); fflush(stdout); t_overhead = timing_overhead(enough); printf("TIMING_OVERHEAD=%f\n", t_overhead); fflush(stdout); l_overhead = loop_overhead(enough, t_overhead); printf("LOOP_OVERHEAD=%f\n", l_overhead); printf("# version [%s]\n", revision); exit(0); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/disk.c���������������������������������������������������������������������������0000664�0000764�0000764�00000013660�10715547567�015101� 0����������������������������������������������������������������������������������������������������ustar 
�staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * disk - calculate zone bandwidths and seek times * * Usage: disk device * * Copyright (c) 1994-1997 Larry McVoy. All rights reserved. * Bits of this are derived from work by Ethan Solomita. */ #include <stdio.h> #include <sys/types.h> #include <unistd.h> #include <stdlib.h> #include "bench.h" #include "flushdisk.c" #define SEEKPOINTS 2000 #define ZONEPOINTS 150 uint64 disksize(char *); int zone(char *disk, int oflag, int bsize); int seek(char *disk, int oflag); int main(int ac, char **av) { fprintf(stderr, "\"Seek times for %s\n", av[1]); seek(av[1], 0); fprintf(stderr, "\n"); fprintf(stderr, "\"Zone bandwidth for %s\n", av[1]); zone(av[1], 0, (1<<20)); return (0); } int zone(char *disk, int oflag, int bsize) { char *buf; int usecs; int fd; uint64 off; int stride; if ((fd = open(disk, oflag)) == -1) { perror(disk); exit(1); } buf = valloc(bsize); if (!buf) { perror("valloc"); exit(1); } bzero(buf, bsize); #ifdef linux flushdisk(fd); #endif /* * We want ZONEPOINTS data points * but the stride has to be at least 512 and a 512 multiple. * Weird code below for precision. */ off = disksize(disk); off /= ZONEPOINTS; stride = off; if (stride < 512) stride = 512; stride += 511; stride >>= 9; stride <<= 9; /* * Very small disks such as ZIP drives get a 256K blocksize. * As measured on my SCSI ZIP, there seems to be no * difference between 256K and 1MB for sequential reads. * XXX - there is a rotational delay difference but that's tough. */ if (bsize > stride) bsize = 256<<10; if (bsize > stride) stride = bsize; off *= ZONEPOINTS; debug((stdout, "stride=%d bs=%d size=%dM points=%d\n", stride, bsize, (int)(off >> 20), (int)(off/stride))); /* * Read buf's worth of data every stride and time it. * Don't include the rotational delay. * This first I/O outside the loop is to catch read/write permissions. */ #define IO(a,b,c) (oflag == 0 ? read(a,b,c) : write(a,b,c)) if (IO(fd, buf, 512) != 512) { perror(disk); exit(1); } off = 512; for ( ;; ) { if (IO(fd, buf, 1024) != 1024) { exit(0); } off += 1024; start(0); if (IO(fd, buf, bsize) != bsize) { break; } usecs = stop(0, 0); off += bsize; fprintf(stderr, "%.01f %.2f\n", off/1000000.0, (double)bsize/usecs); off += stride; if (seekto(fd, off, SEEK_SET) != off) { break; } } return (0); } /* * Seek - calculate seeks as a function of distance. */ #undef IO #define IO(a,b,c) error = (oflag == 0 ? read(a,b,c) : write(a,b,c)); \ if (error == -1) { perror("io"); exit(1); } #define IOSIZE 512 #define TOOSMALL 1000 /* seeks this small are cached */ #define TOOBIG 1000000 /* seeks this big are remapped or weirdos */ /* zip drives have seeks this long */ int seek(char *disk, int oflag) { char *buf; int fd; off64_t size; off64_t begin, end; int usecs; int error; int tot_msec = 0, tot_io = 0; int stride; if ((fd = open(disk, oflag)) == -1) { perror(disk); return (-1); } #ifdef linux flushdisk(fd); #endif size = disksize(disk); buf = valloc(IOSIZE); if (!buf) { perror("valloc"); exit(1); } bzero(buf, IOSIZE); /* * We flip back and forth, in strides of 1MB (typically). * If we have a 100MB fd, that means we do * 1, 99, 2, 98, etc. * * We want around SEEK POINTS data points * but the stride has to be at least 512 and a 512 multiple. 
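 * (The rounding below adds 511 and then clears the low nine bits, so,
 * for example, a raw stride of 1000 bytes becomes 1024.)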
*/ stride = size / SEEKPOINTS; if (stride < 512) stride = 512; stride += 511; stride >>= 9; stride <<= 9; debug((stdout, "stride=%d size=%dM points=%d\n", stride, (int)(size >> 20), (int)(size/stride))); end = size; begin = 0; seekto(fd, begin, SEEK_SET); IO(fd, buf, IOSIZE); while (end >= begin + stride*2) { end -= stride; start(0); seekto(fd, end, SEEK_SET); IO(fd, buf, IOSIZE); usecs = stop(0, 0); if (usecs > TOOSMALL && usecs < TOOBIG) { tot_io++; tot_msec += usecs/1000; fprintf(stderr, "%.01f %.02f\n", (end - begin - stride) / 1000000., usecs/1000.); } begin += stride; start(0); seekto(fd, begin, SEEK_SET); IO(fd, buf, IOSIZE); usecs = stop(0, 0); if (usecs > TOOSMALL && usecs < TOOBIG) { tot_io++; tot_msec += usecs/1000; fprintf(stderr, "%.01f %.02f\n", (end + stride - begin) / 1000000., usecs/1000.); } } /* * This is wrong, it should take the 1/3 stroke seek average. avg_msec = (double)tot_msec/tot_io; fprintf(stderr, "Average time == %.04f\n", avg_msec); */ return (0); } /* * Calculate how big a device is. * * To avoid 32 bit problems, our units are MB. */ #define FORWARD (512<<20) #define FORWARD1 (64<<20) #define FORWARD2 (1<<20) /* * Go forward in 1GB chunks until you can't. * Go backwards in 128MB chunks until you can. * Go forwards in 1MB chunks until you can't and return that -1. */ uint64 disksize(char *disk) { int fd = open(disk, 0); char buf[512]; uint64 off = 0; if (fd == -1) { perror("usage: disksize device"); return(0); } /* * Go forward until it doesn't work. */ for ( ;; ) { off += FORWARD; if (seekto(fd, off, SEEK_SET) != off) { debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); off -= FORWARD; break; } if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { debug((stdout, "read @ %dM failed\n", (int)(off>>20))); off -= FORWARD; break; } } for ( ;; ) { off += FORWARD1; if (seekto(fd, off, SEEK_SET) != off) { debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); off -= FORWARD1; break; } if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { debug((stdout, "read @ %dM failed\n", (int)(off>>20))); off -= FORWARD1; break; } } for ( ;; ) { off += FORWARD2; if (seekto(fd, off, SEEK_SET) != off) { debug((stdout, "seekto(%dM) failed\n", (int)(off>>20))); off -= FORWARD2; break; } if ((read(fd, buf, sizeof(buf)) != sizeof(buf))) { debug((stdout, "read @ %dM failed\n", (int)(off>>20))); off -= FORWARD2; break; } } debug((stdout, "disksize(%s) = %d MB\n", disk, (int)(off >> 20))); return (off); } ��������������������������������������������������������������������������������lmbench-3.0-a9/src/enough.c�������������������������������������������������������������������������0000664�0000764�0000764�00000000261�07045412511�015403� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include <stdio.h> #include <stdlib.h> extern int get_enough(int); int main() { putenv("LOOP_O=0.0"); putenv("TIMING_O=0.0"); printf("%u\n", get_enough(0)); return (0); } 
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/flushdisk.c����������������������������������������������������������������������0000664�0000764�0000764�00000001013�10450256147�016113� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#ifdef linux /* * flushdisk() - linux block cache clearing */ #include <stdio.h> #include <sys/types.h> #include <fcntl.h> #include <unistd.h> #include <stdlib.h> #include <sys/ioctl.h> #include <sys/mount.h> int flushdisk(int fd) { int ret = ioctl(fd, BLKFLSBUF, 0); usleep(100000); return (ret); } #endif #ifdef MAIN int main(int ac, char **av) { #ifdef linux int fd; int i; for (i = 1; i < ac; ++i) { fd = open(av[i], 0); if (flushdisk(fd)) { exit(1); } close(fd); } #endif return(0); } #endif ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/getopt.c�������������������������������������������������������������������������0000664�0000764�0000764�00000006311�07056570124�015431� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * Copyright (c) 1997 L.W.McVoy * * SGI's fucking getopt doesn't follow GNU's reset policy. Isn't having * N versions of Unix a great thing for the world? I'm gonna move to NT * if these assholes don't get their act together. * * This version handles * * - (leaves it and returns) * -a * -abcd * -r <arg> * -r<arg> * -abcr <arg> * -abcr<arg> * -r<arg> -R<arg>, etc. * * A special form is "d|" instead of "d:". This means the arg has to be * right next to the option. * Another special form is "d;". This means the option must be right next * to the option letter and can not be blank. */ #include "bench.h" static char *id = "%@%"; int optopt; /* option that is in error, if we return an error */ int optind; /* next arg in argv we process */ char *optarg; /* argument to an option */ static int n; int getopt(int ac, char **av, char *opts) { char *t; if (!optind) { optind = 1; n = 1; } debug((stderr, "GETOPT ind=%d n=%d arg=%s av[%d]='%s'\n", optind, n, optarg ? 
optarg : "", optind, av[optind])); if ((optind >= ac) || (av[optind][0] != '-') || !av[optind][1]) { return (EOF); } assert(av[optind][n]); for (t = (char *)opts; *t; t++) { if (*t == av[optind][n]) { break; } } if (!*t) { optopt = av[optind][n]; debug((stderr, "\tran out of option letters\n")); return ('?'); } /* OK, we found a legit option, let's see what to do with it. * If it isn't one that takes an option, just advance and return. */ if (t[1] != ':' && t[1] != '|' && t[1] != ';') { if (!av[optind][n+1]) { optind++; n = 1; } else { n++; } debug((stderr, "\tLegit singleton %c\n", *t)); return (*t); } /* got one with an option, see if it is cozied up to the flag */ if (av[optind][n+1]) { if (av[optind][n+1]) { optarg = &av[optind][n+1]; } else { optarg = 0; } optind++; n = 1; debug((stderr, "\t%c with %s\n", *t, optarg)); return (*t); } /* If it was not there, and it is optional, OK */ if (t[1] == '|') { optarg = 0; optind++; n = 1; debug((stderr, "\t%c without arg\n", *t)); return (*t); } /* was it supposed to be there? */ if (t[1] == ';') { optarg = 0; optind++; optopt = *t; debug((stderr, "\twanted another word\n")); return ('?'); } /* Nope, there had better be another word. */ if ((optind + 1 == ac) || (av[optind+1][0] == '-')) { optopt = av[optind][n]; debug((stderr, "\twanted another word\n")); return ('?'); } optarg = av[optind+1]; optind += 2; n = 1; debug((stderr, "\t%c with arg %s\n", *t, optarg)); return (*t); } #ifdef TEST /* XXX a.out -y file */ main(int ac, char **av) { extern char *optarg; extern int optind; char *comment = 0; int c; while ((c = getopt(ac, av, "fnpsx:y|")) != -1) { switch (c) { case 'f': case 'n': case 'p': case 's': printf("Got option %c\n", c); break; case 'x': case 'y': comment = optarg; printf("Got optarg %s with -%c\n", comment, c); break; case '?': fprintf(stderr, "bad option %c\n", optopt); break; default: fprintf(stderr, "unknown ret %c\n", c); break; } } while (av[optind]) { printf("av[%d] = %s\n", optind, av[optind++]); } exit(0); } #endif �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/hello.c��������������������������������������������������������������������������0000664�0000764�0000764�00000000120�07045412511�015213� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include "bench.h" int main() { write(1, "Hello world\n", 12); return (0); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_cmd.c������������������������������������������������������������������������0000664�0000764�0000764�00000004142�10715547567�015545� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_cmd.c - time to complete a given command line * * usage: lat_cmd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmd... * * Copyright (c) 2004 Carl Staelin. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" void bench(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); typedef struct _state { char** argv; pid_t pid; } state_t; int main(int ac, char **av) { int c; int i; int parallel = 1; int warmup = 0; int repetitions = -1; state_t state; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] cmdline...\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind >= ac) { lmbench_usage(ac, av, usage); } state.argv = (char**)malloc((ac - optind + 1) * sizeof(char*)); if (!state.argv) { perror("malloc"); exit(1); } state.pid = 0; for (i = 0; i < ac - optind; ++i) { state.argv[i] = av[optind + i]; } state.argv[i] = NULL; benchmp(NULL, bench, NULL, 0, parallel, warmup, repetitions, &state); micro("lat_cmd", get_n()); return (0); } void cleanup(iter_t iterations, void* cookie) { state_t* state = (state_t*)cookie; if (iterations) return; if (state->pid) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); state->pid = 0; } } void bench(register iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; signal(SIGCHLD, SIG_DFL); while (iterations-- > 0) { switch (state->pid = fork()) { case '0': execvp(state->argv[0], state->argv); /*NOTREACHED*/ default: break; } waitpid(state->pid, NULL, 0); state->pid = 0; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_connect.c��������������������������������������������������������������������0000664�0000764�0000764�00000005074�10620624536�016423� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_connect.c - simple TCP connection latency test * * Three programs in one - * server usage: lat_connect -s * client usage: lat_connect [-N <repetitions>] hostname * shutdown: lat_connect -hostname * * lat_connect may not be parallelized because of 
idiosyncracies * with TCP connection creation. Basically, if the client tries * to create too many connections too quickly, the system fills * up the set of available connections with TIME_WAIT connections. * We can only measure the TCP connection cost accurately if we * do just a few connections. Since the parallel harness needs * each child to run for a second, this guarantees that the * parallel version will generate inaccurate results. * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" typedef struct _state { char *server; } state_t; void doclient(iter_t iterations, void * cookie); void server_main(); int main(int ac, char **av) { state_t state; int repetitions = -1; int c; char buf[256]; char *usage = "-s\n OR [-S] [-N <repetitions>] server\n"; while (( c = getopt(ac, av, "sSP:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); case 'S': /* shutdown serverhost */ { int sock = tcp_connect(av[optind], TCP_CONNECT, SOCKOPT_NONE); write(sock, "0", 1); close(sock); exit(0); } case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 1 != ac) { lmbench_usage(ac, av, usage); } state.server = av[optind]; benchmp(NULL, doclient, NULL, 0, 1, 0, repetitions, &state); sprintf(buf, "TCP/IP connection cost to %s", state.server); micro(buf, get_n()); exit(0); } void doclient(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register char *server = state->server; register int sock; while (iterations-- > 0) { sock = tcp_connect(server, TCP_CONNECT, SOCKOPT_REUSE); close(sock); } } void server_main() { int newsock, sock; char c ='1'; GO_AWAY; sock = tcp_server(TCP_CONNECT, SOCKOPT_NONE|SOCKOPT_REUSE); for (;;) { newsock = tcp_accept(sock, SOCKOPT_NONE); if (read(newsock, &c, 1) > 0) { tcp_done(TCP_CONNECT); exit(0); } close(newsock); } /* NOTREACHED */ } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_ctx.c������������������������������������������������������������������������0000664�0000764�0000764�00000017345�10715547567�015611� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_ctx.c - context switch timer * * usage: lat_ctx [-P parallelism] [-W <warmup>] [-N <repetitions>] [-s size] #procs [#procs....] * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
* Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #define MAXPROC 2048 #define CHUNK (4<<10) #define TRIPS 5 #ifndef max #define max(a, b) ((a) > (b) ? (a) : (b)) #endif void doit(int rd, int wr, int process_size); int create_pipes(int **p, int procs); int create_daemons(int **p, pid_t *pids, int procs, int process_size); void initialize_overhead(iter_t iterations, void* cookie); void cleanup_overhead(iter_t iterations, void* cookie); void benchmark_overhead(iter_t iterations, void* cookie); void initialize(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); void benchmark(iter_t iterations, void* cookie); struct _state { int process_size; double overhead; int procs; pid_t* pids; int **p; void* data; }; int main(int ac, char **av) { int i, maxprocs; int c; int parallel = 1; int warmup = 0; int repetitions = -1; struct _state state; char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-s kbytes] processes [processes ...]\n"; double time; /* * Need 4 byte ints. */ if (sizeof(int) != 4) { fprintf(stderr, "Fix sumit() in ctx.c.\n"); exit(1); } state.process_size = 0; state.overhead = 0.0; state.pids = NULL; /* * If they specified a context size, or parallelism level, get them. */ while (( c = getopt(ac, av, "s:P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 's': state.process_size = atoi(optarg) * 1024; break; default: lmbench_usage(ac, av, usage); break; } } if (optind > ac - 1) lmbench_usage(ac, av, usage); /* compute pipe + sumit overhead */ maxprocs = atoi(av[optind]); for (i = optind; i < ac; ++i) { state.procs = atoi(av[i]); if (state.procs > maxprocs) maxprocs = state.procs; } state.procs = maxprocs; benchmp(initialize_overhead, benchmark_overhead, cleanup_overhead, 0, 1, warmup, repetitions, &state); if (gettime() == 0) return(0); state.overhead = gettime(); state.overhead /= get_n(); fprintf(stderr, "\n\"size=%dk ovr=%.2f\n", state.process_size/1024, state.overhead); /* compute the context switch cost for N processes */ for (i = optind; i < ac; ++i) { state.procs = atoi(av[i]); benchmp(initialize, benchmark, cleanup, 0, parallel, warmup, repetitions, &state); time = gettime(); time /= get_n(); time /= state.procs; time -= state.overhead; if (time > 0.0) fprintf(stderr, "%d %.2f\n", state.procs, time); } return (0); } void initialize_overhead(iter_t iterations, void* cookie) { int i; int procs; int* p; struct _state* pState = (struct _state*)cookie; if (iterations) return; pState->pids = NULL; pState->p = (int**)malloc(pState->procs * (sizeof(int*) + 2 * sizeof(int))); pState->data = (pState->process_size > 0) ? 
malloc(pState->process_size) : NULL; if (!pState->p || (pState->process_size > 0 && !pState->data)) { perror("malloc"); exit(1); } p = (int*)&pState->p[pState->procs]; for (i = 0; i < pState->procs; ++i) { pState->p[i] = p; p += 2; } if (pState->data) bzero(pState->data, pState->process_size); procs = create_pipes(pState->p, pState->procs); if (procs < pState->procs) { cleanup_overhead(0, cookie); exit(1); } } void cleanup_overhead(iter_t iterations, void* cookie) { int i; struct _state* pState = (struct _state*)cookie; if (iterations) return; for (i = 0; i < pState->procs; ++i) { close(pState->p[i][0]); close(pState->p[i][1]); } free(pState->p); if (pState->data) free(pState->data); } void benchmark_overhead(iter_t iterations, void* cookie) { struct _state* pState = (struct _state*)cookie; int i = 0; int msg = 1; while (iterations-- > 0) { if (write(pState->p[i][1], &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ exit(1); } if (read(pState->p[i][0], &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ exit(1); } if (++i == pState->procs) { i = 0; } bread(pState->data, pState->process_size); } } void initialize(iter_t iterations, void* cookie) { int procs; struct _state* pState = (struct _state*)cookie; if (iterations) return; initialize_overhead(iterations, cookie); pState->pids = (pid_t*)malloc(pState->procs * sizeof(pid_t)); if (pState->pids == NULL) exit(1); bzero((void*)pState->pids, pState->procs * sizeof(pid_t)); procs = create_daemons(pState->p, pState->pids, pState->procs, pState->process_size); if (procs < pState->procs) { cleanup(0, cookie); exit(1); } } void cleanup(iter_t iterations, void* cookie) { int i; struct _state* pState = (struct _state*)cookie; if (iterations) return; /* * Close the pipes and kill the children. */ cleanup_overhead(iterations, cookie); for (i = 1; pState->pids && i < pState->procs; ++i) { if (pState->pids[i] > 0) { kill(pState->pids[i], SIGKILL); waitpid(pState->pids[i], NULL, 0); } } if (pState->pids) free(pState->pids); pState->pids = NULL; } void benchmark(iter_t iterations, void* cookie) { struct _state* pState = (struct _state*)cookie; int msg; /* * Main process - all others should be ready to roll, time the * loop. */ while (iterations-- > 0) { if (write(pState->p[0][1], &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ exit(1); } if (read(pState->p[pState->procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ exit(1); } bread(pState->data, pState->process_size); } } void doit(int rd, int wr, int process_size) { int msg; void* data = NULL; if (process_size) { data = malloc(process_size); if (!data) { perror("malloc"); exit(3); } bzero(data, process_size); } for ( ;; ) { if (read(rd, &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ break; } if (process_size) bread(data, process_size); if (write(wr, &msg, sizeof(msg)) != sizeof(msg)) { /* perror("read/write on pipe"); */ break; } } exit(1); } int create_daemons(int **p, pid_t *pids, int procs, int process_size) { int i, j; int msg; /* * Use the pipes as a ring, and fork off a bunch of processes * to pass the byte through their part of the ring. * * Do the sum in each process and get that time before moving on. */ handle_scheduler(benchmp_childid(), 0, procs-1); for (i = 1; i < procs; ++i) { switch (pids[i] = fork()) { case -1: /* could not fork, out of processes? 
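				 * (we return the partial count; the caller cleans up and exits)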
*/ return i; case 0: /* child */ handle_scheduler(benchmp_childid(), i, procs-1); for (j = 0; j < procs; ++j) { if (j != i - 1) close(p[j][0]); if (j != i) close(p[j][1]); } doit(p[i-1][0], p[i][1], process_size); /* NOTREACHED */ default: /* parent */ ; } } /* * Go once around the loop to make sure that everyone is ready and * to get the token in the pipeline. */ if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg) || read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) { /* perror("write/read/write on pipe"); */ exit(1); } return procs; } int create_pipes(int **p, int procs) { int i; /* * Get a bunch of pipes. */ morefds(); for (i = 0; i < procs; ++i) { if (pipe(p[i]) == -1) { return i; } } return procs; } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_dram_page.c������������������������������������������������������������������0000664�0000764�0000764�00000010761�10620624536�016710� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_dram_page.c - guess the DRAM page latency * * usage: lat_dram_page * * Copyright (c) 2002 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
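 *
 * The idea: time one pointer chain laid out by mem_initialize(), where
 * consecutive references tend to stay within a page, and a second
 * chain regrouped by dram_page_initialize() so that consecutive
 * references hop between the pages of a group (16 pages by default,
 * -T changes it).  If the regrouped chain is measurably slower, the
 * difference is reported as the DRAM page-miss cost; otherwise 0.0.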
*/ char *id = "$Id$\n"; #include "bench.h" void dram_page_initialize(iter_t iterations, void* cookie); void benchmark_loads(iter_t iterations, void *cookie); double loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie); struct dram_page_state { struct mem_state mstate; int group; }; int main(int ac, char **av) { int maxlen = 64 * 1024 * 1024; int warmup = 0; int repetitions = -1; int c; struct dram_page_state state; double dram_hit, dram_miss; char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; state.mstate.width = 1; state.mstate.line = sizeof(char*); state.mstate.pagesize = getpagesize(); state.group = 16; while (( c = getopt(ac, av, "aL:T:M:W:N:")) != EOF) { switch(c) { case 'L': state.mstate.line = bytes(optarg); break; case 'T': state.group = bytes(optarg); break; case 'M': maxlen = bytes(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } dram_hit = loads(mem_initialize, maxlen, warmup, repetitions, &state); dram_miss = loads(dram_page_initialize, maxlen, warmup, repetitions, &state); if (dram_hit < 0.95 * dram_miss) { fprintf(stderr, "%f\n", dram_miss - dram_hit); } else { fprintf(stderr, "0.0\n"); } return (0); } #define ONE p = (char **)*p; #define FIVE ONE ONE ONE ONE ONE #define TEN FIVE FIVE #define FIFTY TEN TEN TEN TEN TEN #define HUNDRED FIFTY FIFTY void benchmark_loads(iter_t iterations, void *cookie) { struct mem_state* state = (struct mem_state*)cookie; register char **p = (char**)state->base; register int i; register int count = state->len / (state->line * 100) + 1; while (iterations-- > 0) { for (i = 0; i < count; ++i) { HUNDRED; } } use_pointer((void *)p); } void regroup(size_t* pages, int groupsize, void* cookie) { register int i, j; register char* ptr; register char *page; register char *page_end; register char *p = 0 /* lint */; struct mem_state* state = (struct mem_state*)cookie; if (groupsize <= 1) return; p = state->base; /* * for all but the last page in the group, * point to the same line in the next page */ for (i = 0; i < groupsize - 1; ++i) { for (j = 0; j < state->pagesize; j += sizeof(char*)) { *(char**)(p + pages[i] + j) = p + pages[i+1] + j; } } /* * for the last page, point to the next line * in the first page of the group, except for * the last line in the page which points to * the first line in the next group * * since the pointers are all set up for the * last line, only modify the pointers for * the other lines */ page = p + pages[groupsize-1]; page_end = page + state->pagesize; for (i = 0; i < state->pagesize; i += sizeof(char*)) { ptr = *(char**)(page + i); if (page <= ptr && ptr < page_end) { int offset = (int)(ptr - page); *(char**)(page + i) = p + pages[0] + offset; } } } /* * This is like mem_initialize */ void dram_page_initialize(iter_t iterations, void* cookie) { int i; struct mem_state* state = (struct mem_state*)cookie; struct dram_page_state* dstate = (struct dram_page_state*)cookie; if (iterations) return; mem_initialize(iterations, cookie); for (i = 0; i < state->npages; i += dstate->group) { int groupsize = dstate->group; if (groupsize > state->npages - i) { groupsize = state->npages - i; } regroup(state->pages + i, groupsize, cookie); } benchmark_loads(1, cookie); } double loads(benchmp_f initialize, int len, int warmup, int repetitions, void* cookie) { double result; int count; int parallel = 1; struct mem_state* state = (struct mem_state*)cookie; state->len = len; state->maxlen = len; count 
= 100 * (state->len / (state->line * 100) + 1); /* * Now walk them and time it. */ benchmp(initialize, benchmark_loads, mem_cleanup, 0, parallel, warmup, repetitions, cookie); /* We want to get to nanoseconds / load. */ result = (1000. * (double)gettime()) / (double)(count * get_n()); /* fprintf(stderr, "%.5f %.3f\n", len / (1024. * 1024.), result); /**/ return result; } ���������������lmbench-3.0-a9/src/lat_fcntl.c����������������������������������������������������������������������0000664�0000764�0000764�00000010462�10620624536�016075� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include "bench.h" /* * lat_fcntl.c - file locking test * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id: lat_pipe.c,v 1.8 1997/06/16 05:38:58 lm Exp $\n"; #include "bench.h" struct flock lock, unlock; struct flock s1, s2; /* * Create two files, use them as a ping pong test. * Process A: * lock(1) * unlock(2) * Process B: * unlock(1) * lock(2) * Initial state: * lock is locked * lock2 is locked */ #define waiton(fd) fcntl(fd, F_SETLKW, &lock) #define release(fd) fcntl(fd, F_SETLK, &unlock) struct _state { char filename1[2048]; char filename2[2048]; int pid; int fd1; int fd2; }; void initialize(iter_t iterations, void* cookie); void benchmark(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); void procA(struct _state *state) { if (waiton(state->fd1) == -1) { perror("lock of fd1 failed\n"); cleanup(0, state); exit(1); } if (release(state->fd2) == -1) { perror("unlock of fd2 failed\n"); cleanup(0, state); exit(1); } if (waiton(state->fd2) == -1) { perror("lock of fd2 failed\n"); cleanup(0, state); exit(1); } if (release(state->fd1) == -1) { perror("unlock of fd1 failed\n"); cleanup(0, state); exit(1); } } void procB(struct _state *state) { if (release(state->fd1) == -1) { perror("unlock of fd1 failed\n"); cleanup(0, state); exit(1); } if (waiton(state->fd2) == -1) { perror("lock of fd2 failed\n"); cleanup(0, state); exit(1); } if (release(state->fd2) == -1) { perror("unlock of fd2 failed\n"); cleanup(0, state); exit(1); } if (waiton(state->fd1) == -1) { perror("lock of fd1 failed\n"); cleanup(0, state); exit(1); } } void initialize(iter_t iterations, void* cookie) { char buf[10000]; struct _state* state = (struct _state*)cookie; if (iterations) return; sprintf(state->filename1, "/tmp/lmbench-fcntl%d.1", getpid()); sprintf(state->filename2, "/tmp/lmbench-fcntl%d.2", getpid()); state->pid = 0; state->fd1 = -1; state->fd2 = -1; unlink(state->filename1); unlink(state->filename2); if ((state->fd1 = open(state->filename1, O_CREAT|O_RDWR, 0666)) == -1) { perror("create"); exit(1); } if ((state->fd2 = open(state->filename2, O_CREAT|O_RDWR, 0666)) == -1) { perror("create"); exit(1); } unlink(state->filename1); unlink(state->filename2); write(state->fd1, buf, sizeof(buf)); write(state->fd2, buf, sizeof(buf)); lock.l_type = F_WRLCK; lock.l_whence = 0; lock.l_start = 0; lock.l_len = 1; unlock = lock; unlock.l_type 
= F_UNLCK; if (waiton(state->fd1) == -1) { perror("lock1"); exit(1); } if (waiton(state->fd2) == -1) { perror("lock2"); exit(1); } handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case -1: perror("fork"); exit(1); case 0: handle_scheduler(benchmp_childid(), 1, 1); for ( ;; ) { procB(state); } default: break; } } void benchmark(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { procA(state); } } void cleanup(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; if (iterations) return; if (state->fd1 >= 0) close(state->fd1); if (state->fd2 >= 0) close(state->fd2); state->fd1 = -1; state->fd2 = -1; if (state->pid) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); } state->pid = 0; } int main(int ac, char **av) { int c; int parallel = 1; int warmup = 0; int repetitions = -1; struct _state state; char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; /* * If they specified a parallelism level, get it. */ while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } state.pid = 0; benchmp(initialize, benchmark, cleanup, 0, parallel, warmup, repetitions, &state); micro("Fcntl lock latency", 2 * get_n()); return (0); } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_fifo.c�����������������������������������������������������������������������0000664�0000764�0000764�00000006455�10620624536�015721� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_fifo.c - named pipe transaction test * * usage: lat_fifo [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
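 *
 * The benchmark creates two FIFOs in /tmp and forks a child; the
 * parent writes one byte down the first FIFO and reads one byte back
 * from the second, while the child does the mirror-image read and
 * write on the other ends.  The reported "Fifo latency" is therefore
 * the time for one such round trip.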
*/ char *id = "$Id$\n"; #include "bench.h" #define F1 "/tmp/lmbench_f1.%d" #define F2 "/tmp/lmbench_f2.%d" void initialize(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void doit(iter_t iterations, void *cookie); void writer(int wr, int rd); typedef struct _state { char filename1[256]; char filename2[256]; int pid; int wr; int rd; } state_t; int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } state.pid = 0; benchmp(initialize, doit, cleanup, SHORT, parallel, warmup, repetitions, &state); micro("Fifo latency", get_n()); return (0); } void initialize(iter_t iterations, void *cookie) { char c; state_t * state = (state_t *)cookie; if (iterations) return; state->pid = 0; sprintf(state->filename1,F1,getpid()); sprintf(state->filename2,F2,getpid()); unlink(state->filename1); unlink(state->filename2); if (mknod(state->filename1, S_IFIFO|0664, 0) || mknod(state->filename2, S_IFIFO|0664, 0)) { perror("mknod"); exit(1); } handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case 0: handle_scheduler(benchmp_childid(), 1, 1); state->rd = open(state->filename1, O_RDONLY); state->wr = open(state->filename2, O_WRONLY); writer(state->wr, state->rd); return; case -1: perror("fork"); return; default: state->wr = open(state->filename1, O_WRONLY); state->rd = open(state->filename2, O_RDONLY); break; } /* * One time around to make sure both processes are started. */ if (write(state->wr, &c, 1) != 1 || read(state->rd, &c, 1) != 1) { perror("(i) read/write on pipe"); exit(1); } } void cleanup(iter_t iterations, void * cookie) { state_t * state = (state_t *)cookie; if (iterations) return; unlink(state->filename1); unlink(state->filename2); close(state->wr); close(state->rd); if (state->pid > 0) { kill(state->pid, 15); waitpid(state->pid, NULL, 0); state->pid = 0; } } void doit(register iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; char c; register int w = state->wr; register int r = state->rd; register char *cptr = &c; while (iterations-- > 0) { if (write(w, cptr, 1) != 1 || read(r, cptr, 1) != 1) { perror("(r) read/write on pipe"); exit(1); } } } void writer(register int w, register int r) { char c; register char *cptr = &c; for ( ;; ) { if (read(r, cptr, 1) != 1 || write(w, cptr, 1) != 1) { perror("(w) read/write on pipe"); } } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_fs.c�������������������������������������������������������������������������0000664�0000764�0000764�00000014151�10715547567�015413� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * Benchmark creates & deletes. 
*/ char *id = "$Id$\n"; #include "bench.h" struct _state { char *tmpdir; long max; long n; char** names; long ndirs; char** dirs; size_t size; }; void measure(size_t size, int parallel, int warmup, int repetitions, void* cookie); void mkfile(char* s, size_t size); void setup_names(iter_t iterations, void* cookie); void cleanup_names(iter_t iterations, void* cookie); void setup_rm(iter_t iterations, void* cookie); void cleanup_mk(iter_t iterations, void* cookie); void benchmark_mk(iter_t iterations, void* cookie); void benchmark_rm(iter_t iterations, void* cookie); int main(int ac, char **av) { int i; int parallel = 1; int warmup = 0; int repetitions = -1; static int sizes[] = { 0, 1024, 4096, 10*1024 }; struct _state state; int c; char* usage = "[-s <file size>] [-n <max files per dir>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [<dir>]\n"; state.size = 0; state.max = 100; state.tmpdir = NULL; while (( c = getopt(ac, av, "s:n:P:W:N:")) != EOF) { switch(c) { case 's': state.size = bytes(optarg); break; case 'n': state.max = bytes(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac - 1) { lmbench_usage(ac, av, usage); } if (optind == ac - 1) { state.tmpdir = av[1]; } if (state.size) { measure(state.size, parallel, warmup, repetitions, &state); } else { for (i = 0; i < sizeof(sizes)/sizeof(int); ++i) { state.size = sizes[i]; measure(state.size, parallel, warmup, repetitions, &state); } } return(0); } void measure(size_t size, int parallel, int warmup, int repetitions, void* cookie) { fprintf(stderr, "%luk", size>>10); benchmp(setup_names, benchmark_mk, cleanup_mk, 0, parallel, warmup, repetitions, cookie); if (gettime()) { fprintf(stderr, "\t%lu\t%.0f", (unsigned long)get_n(), (double)(1000000. * get_n() / (double)gettime())); } else { fprintf(stderr, "\t-1\t-1"); } benchmp(setup_rm, benchmark_rm, cleanup_names, 0, parallel, warmup, repetitions, cookie); if (gettime()) { fprintf(stderr, "\t%.0f", (double)(1000000. * get_n() / (double)gettime())); } else { fprintf(stderr, "\t-1"); } fprintf(stderr, "\n"); } void mkfile(char *name, size_t size) { size_t chunk; int fd = creat(name, 0666); char buf[128*1024]; /* XXX - track sizes */ while (size > 0) { chunk = ((size > (128*1024)) ? 
(128*1024) : size); write(fd, buf, chunk); size -= chunk; } close(fd); } void setup_names_recurse(iter_t* foff, iter_t* doff, int depth, struct _state* state) { long i, ndirs, count; char* basename = state->dirs[*doff]; char name[L_tmpnam + 8192]; if (depth > 0) { for (count = state->max, i = 1; i < depth; ++i) { count *= state->max; } ndirs = (state->n - *foff) / count + 1; for (i = 0; i < state->max && i < ndirs && *foff < state->n; ++i) { sprintf(name, "%s/%ld", basename, i); state->dirs[++(*doff)] = strdup(name); mkdir(name, 0777); setup_names_recurse(foff, doff, depth-1, state); } } else { for (i = 0; i < state->max && *foff < state->n; ++i) { sprintf(name, "%s/%ld", basename, i); state->names[(*foff)++] = strdup(name); } } } void setup_names(iter_t iterations, void* cookie) { long i, ndirs, depth; iter_t foff; iter_t doff; char dirname_tmpl[L_tmpnam + 256]; char* dirname; struct _state* state = (struct _state*)cookie; if (!iterations) return; depth = 0; state->n = iterations; state->ndirs = iterations / state->max; if (iterations % state->max) state->ndirs++; for (ndirs = state->ndirs; ndirs > 1; ) { ndirs = ndirs / state->max + ((ndirs % state->max) ? 1 : 0); state->ndirs += ndirs; depth++; } state->names = (char**)malloc(iterations * sizeof(char*)); state->dirs = (char**)malloc(state->ndirs * sizeof(char*)); if (iterations && !state->names || state->ndirs && !state->dirs) { perror("malloc"); exit(1); } for (i = 0; i < iterations; ++i) { state->names[i] = NULL; } for (i = 0; i < state->ndirs; ++i) { state->dirs[i] = NULL; } sprintf(dirname_tmpl, "lat_fs_%d_XXXXXX", getpid()); dirname = tempnam(state->tmpdir, dirname_tmpl); if (!dirname) { perror("tempnam failed"); exit(1); } if (mkdir(dirname, S_IRUSR|S_IWUSR|S_IXUSR)) { perror("mkdir failed"); exit(1); } state->dirs[0] = dirname; foff = 0; doff = 0; setup_names_recurse(&foff, &doff, depth, state); if (foff != iterations || doff != state->ndirs - 1) { fprintf(stderr, "setup_names: ERROR: foff=%lu, iterations=%lu, doff=%lu, ndirs=%lu, depth=%ld\n", (unsigned long)foff, (unsigned long)iterations, (unsigned long)doff, (unsigned long)state->ndirs, depth); } } void cleanup_names(iter_t iterations, void* cookie) { long i; struct _state* state = (struct _state*)cookie; if (!iterations) return; for (i = 0; i < state->n; ++i) { if (state->names[i]) free(state->names[i]); } free(state->names); state->n = 0; for (i = state->ndirs - 1; i >= 0; --i) { if (state->dirs[i]) { rmdir(state->dirs[i]); free(state->dirs[i]); } } free(state->dirs); state->ndirs = 0; } void setup_rm(iter_t iterations, void* cookie) { if (!iterations) return; setup_names(iterations, cookie); benchmark_mk(iterations, cookie); } void cleanup_mk(iter_t iterations, void* cookie) { if (!iterations) return; benchmark_rm(iterations, cookie); cleanup_names(iterations, cookie); } void benchmark_mk(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { if (!state->names[iterations]) { fprintf(stderr, "benchmark_mk: null filename at %lu of %lu\n", iterations, state->n); continue; } mkfile(state->names[iterations], state->size); } } void benchmark_rm(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { if (!state->names[iterations]) { fprintf(stderr, "benchmark_rm: null filename at %lu of %lu\n", iterations, state->n); continue; } unlink(state->names[iterations]); } } 
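/*
 * Illustrative sketch (not part of lmbench): the core of the
 * create/delete measurement above, stripped of the benchmp() harness
 * and the directory-tree bookkeeping.  The name sketch_file_latency()
 * is hypothetical, and the code assumes the usual <stdio.h>, <fcntl.h>,
 * <unistd.h> and <sys/time.h> declarations are visible.  Unlike
 * lat_fs.c it takes a single timing sample, so it only shows the shape
 * of the measurement; it returns the average number of microseconds
 * needed to create one empty file.
 */
static double
sketch_file_latency(const char *dir, long n)
{
	char		name[1024];
	long		i;
	int		fd;
	struct timeval	start, stop;

	gettimeofday(&start, (struct timezone *)0);
	for (i = 0; i < n; ++i) {
		/* create an empty file, as mkfile() does for size 0 */
		sprintf(name, "%s/sketch%ld", dir, i);
		if ((fd = creat(name, 0666)) >= 0) close(fd);
	}
	gettimeofday(&stop, (struct timezone *)0);

	/* remove what we created so the sketch leaves no litter */
	for (i = 0; i < n; ++i) {
		sprintf(name, "%s/sketch%ld", dir, i);
		unlink(name);
	}
	return (((stop.tv_sec - start.tv_sec) * 1000000. +
		 (stop.tv_usec - start.tv_usec)) / (double)n);
}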
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_http.c�����������������������������������������������������������������������0000664�0000764�0000764�00000004522�10715547567�015763� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_http.c - simple HTTP transaction latency test * * usage: lat_http hostname [port] < filelist * * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" char *buf; int debug; int echo; int http(char *server, char *file, int prog) { int sock; int n; int b = 0; sock = tcp_connect(server, prog, SOCKOPT_REUSE); sprintf(buf, "GET /%s HTTP/1.0\r\n\r\n\n", file); if (debug) { printf(buf); } write(sock, buf, strlen(buf)); while ((n = read(sock, buf, XFERSIZE)) > 0) { b += n; if (echo) { write(1, buf, n); } } close(sock); if (debug) { printf("Got %d\n", b); } return (b); } void killhttp(char *server, int prog) { int sock; sock = tcp_connect(server, prog, SOCKOPT_REUSE); write(sock, "EXIT", 4); close(sock); } void chop(register char *s) { while (*s && *s != '\n') s++; *s = 0; } int main(int ac, char **av) { char *server; int i, prog; int c; int shutdown = 0; uint64 total = 0; uint64 usecs = 0; double avg; char *name = av[0]; char file[1024]; char *usage = "[-d] [-e] [-S] serverhost [port] < list\n"; while (( c = getopt(ac, av, "deS")) != EOF) { switch(c) { case 'd': debug++; break; case 'e': echo++; break; case 'S': /* shutdown serverhost */ shutdown = 1; break; default: lmbench_usage(ac, av, usage); break; } } if (optind >= ac || optind < ac - 2) { lmbench_usage(ac, av, usage); exit(0); } server = av[optind++]; if (optind < ac && atoi(av[optind]) != 0) { prog = -atoi(av[optind]); } else { prog = -80; } if (shutdown) { killhttp(server, prog); exit(0); } i = 0; buf = valloc(XFERSIZE); if (!buf) { perror("valloc"); exit(1); } bzero(buf, XFERSIZE); while (fgets(file, sizeof(file), stdin)) { chop(file); start(0); total += http(server, file, prog); usecs += stop(0,0); i++; } avg = total; avg /= (i - 1); if (avg > 1000) { avg /= 1000; fprintf(stderr, "Avg xfer: %.1fKB, ", avg); } else { fprintf(stderr, "Avg xfer %d, ", (int)avg); } settime(usecs); latency((uint64)1, total); exit(0); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_mem_rd.c���������������������������������������������������������������������0000664�0000764�0000764�00000007371�10715547567�016254� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_mem_rd.c - measure memory load latency * * usage: lat_mem_rd [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] size-in-MB [stride ...] * * Copyright (c) 1994 Larry McVoy. * Copyright (c) 2003, 2004 Carl Staelin. * * Distributed under the FSF GPL with additional restriction that results * may published only if: * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id: s.lat_mem_rd.c 1.13 98/06/30 16:13:49-07:00 lm@lm.bitmover.com $\n"; #include "bench.h" #define STRIDE (512/sizeof(char *)) #define LOWER 512 void loads(size_t len, size_t range, size_t stride, int parallel, int warmup, int repetitions); size_t step(size_t k); void initialize(iter_t iterations, void* cookie); benchmp_f fpInit = stride_initialize; int main(int ac, char **av) { int i; int c; int parallel = 1; int warmup = 0; int repetitions = -1; size_t len; size_t range; size_t stride; char *usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] [-t] len [stride...]\n"; while (( c = getopt(ac, av, "tP:W:N:")) != EOF) { switch(c) { case 't': fpInit = thrash_initialize; break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind == ac) { lmbench_usage(ac, av, usage); } len = atoi(av[optind]); len *= 1024 * 1024; if (optind == ac - 1) { fprintf(stderr, "\"stride=%d\n", (int)STRIDE); for (range = LOWER; range <= len; range = step(range)) { loads(len, range, STRIDE, parallel, warmup, repetitions); } } else { for (i = optind + 1; i < ac; ++i) { stride = bytes(av[i]); fprintf(stderr, "\"stride=%d\n", (int)stride); for (range = LOWER; range <= len; range = step(range)) { loads(len, range, stride, parallel, warmup, repetitions); } fprintf(stderr, "\n"); } } return(0); } #define ONE p = (char **)*p; #define FIVE ONE ONE ONE ONE ONE #define TEN FIVE FIVE #define FIFTY TEN TEN TEN TEN TEN #define HUNDRED FIFTY FIFTY void benchmark_loads(iter_t iterations, void *cookie) { struct mem_state* state = (struct mem_state*)cookie; register char **p = (char**)state->p[0]; register size_t i; register size_t count = state->len / (state->line * 100) + 1; while (iterations-- > 0) { for (i = 0; i < count; ++i) { HUNDRED; } } use_pointer((void *)p); state->p[0] = (char*)p; } void loads(size_t len, size_t range, size_t stride, int parallel, int warmup, int repetitions) { double result; size_t count; struct mem_state state; if (range < stride) return; state.width = 1; state.len = range; state.maxlen = len; state.line = stride; state.pagesize = getpagesize(); count = 100 * (state.len / (state.line * 100) + 1); #if 0 (*fpInit)(0, &state); fprintf(stderr, "loads: after init\n"); (*benchmark_loads)(2, &state); fprintf(stderr, "loads: after benchmark\n"); mem_cleanup(0, &state); fprintf(stderr, "loads: after cleanup\n"); settime(1); save_n(1); #else /* * Now walk them and time it. 
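	 * benchmark_loads() repeatedly follows the p = *p pointer chain,
	 * one hundred dependent loads per unrolled pass and "count"
	 * passes per iteration, so each load must wait for the previous
	 * one to complete.  save_minimum() keeps the fastest timing run,
	 * and the result is converted to nanoseconds per load below.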
*/ benchmp(fpInit, benchmark_loads, mem_cleanup, 100000, parallel, warmup, repetitions, &state); #endif /* We want to get to nanoseconds / load. */ save_minimum(); result = (1000. * (double)gettime()) / (double)(count * get_n()); fprintf(stderr, "%.5f %.3f\n", range / (1024. * 1024.), result); } size_t step(size_t k) { if (k < 1024) { k = k * 2; } else if (k < 4*1024) { k += 1024; } else { size_t s; for (s = 4 * 1024; s <= k; s *= 2) ; k += s / 4; } return (k); } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_mmap.c�����������������������������������������������������������������������0000664�0000764�0000764�00000007311�10715547567�015735� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_mmap.c - time how fast a mapping can be made and broken down * * Usage: mmap [-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file * * XXX - If an implementation did lazy address space mapping, this test * will make that system look very good. I haven't heard of such a system. * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" #define PSIZE (16<<10) #define N 10 #define STRIDE (10*PSIZE) #define MINSIZE (STRIDE*2) #define CHK(x) if ((x) == -1) { perror("x"); exit(1); } typedef struct _state { size_t size; int fd; int random; int clone; char *name; } state_t; void init(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void domapping(iter_t iterations, void * cookie); int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char *usage = "[-r] [-C] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] size file\n"; state.random = 0; state.clone = 0; while (( c = getopt(ac, av, "rP:W:N:C")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'r': state.random = 1; break; case 'C': state.clone = 1; break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 2 != ac) { lmbench_usage(ac, av, usage); } state.size = bytes(av[optind]); if (state.size < MINSIZE) { return (1); } state.name = av[optind+1]; benchmp(init, domapping, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { micromb(state.size, get_n()); } return (0); } void init(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; if (state->clone) { char buf[128]; char* s; /* copy original file into a process-specific one */ sprintf(buf, "%d", (int)getpid()); s = (char*)malloc(strlen(state->name) + strlen(buf) + 1); if (!s) { perror("malloc"); exit(1); } sprintf(s, "%s%d", state->name, (int)getpid()); if (cp(state->name, s, S_IREAD|S_IWRITE) < 0) { perror("Could not copy file"); unlink(s); exit(1); } state->name = s; } CHK(state->fd = open(state->name, O_RDWR)); if (state->clone) unlink(state->name); if (seekto(state->fd, 0, SEEK_END) < state->size) { fprintf(stderr, "Input file too small\n"); exit(1); } } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; close(state->fd); } /* * This alg due to Linus. The goal is to have both sparse and full * mappings reported. 
*/ void domapping(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; register int fd = state->fd; register size_t size = state->size; register int random = state->random; register char *p, *where, *end; register char c = size & 0xff; while (iterations-- > 0) { #ifdef MAP_FILE where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_SHARED, fd, 0); #else where = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); #endif if ((long)where == -1) { perror("mmap"); exit(1); } if (random) { end = where + size; for (p = where; p < end; p += STRIDE) { *p = c; } } else { end = where + (size / N); for (p = where; p < end; p += PSIZE) { *p = c; } } munmap(where, size); } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_ops.c������������������������������������������������������������������������0000664�0000764�0000764�00000030323�10715547567�015603� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_ops.c - benchmark of simple operations * * Copyright (c) 1996-2004 Carl Staelin and Larry McVoy. * * This benchmark is meant to benchmark raw arithmetic operation * latency for various operations on various datatypes. Obviously, * not all operations make sense for all datatypes (e.g., modulus * on float). The benchmarks are configured to use interlocking * operations, so we measure the time of an individual operation. * * The exception to the interlocking operation guidelines are the * vector operations, muladd and bogomflops, for both float and * double data types. In this case we are trying to determine * how well the CPU can schedule the various arithmetic units * and overlap adjacent operations to get the maximal throughput * from the system. In addition, we are using relatively short * vectors so these operations should be going to/from L1 (or * possibly L2) cache, rather than main memory, which should * reduce or eliminate the memory overheads. * * The vector operations use a slightly unrolled loop because * this is common in scientific codes that do these sorts of * operations. 
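 *
 * "Interlocking" here means that each operation consumes the result of
 * the previous one (e.g. r = s / r), so the measured time is the
 * latency of the dependency chain rather than the issue rate.  The
 * muladd and bogomflops vector loops intentionally drop that chain,
 * letting independent operations overlap in the pipeline.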
*/ char *id = "$Id$\n"; #include "bench.h" struct _state { int N; int M; int K; double* data; }; #define FIVE(a) a a a a a #define TEN(a) a a a a a a a a a a #define HUNDRED(a) TEN(TEN(a)) void float_initialize(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int i; register float* x; if (iterations) return; x = (float*)malloc(pState->M * sizeof(float)); pState->data = (double*)x; if (!pState->data) { perror("malloc"); exit(1); } for (i = 0; i < pState->M; ++i) { x[i] = 3.14159265; } } void double_initialize(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int i; if (iterations) return; pState->data = (double*)malloc(pState->M * sizeof(double)); if (!pState->data) { perror("malloc"); exit(1); } for (i = 0; i < pState->M; ++i) { pState->data[i] = 3.14159265; } } void cleanup(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; if (iterations) return; if (pState->data) free(pState->data); } void do_integer_bitwise(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int r = pState->N; register int s = (int)iterations; while (iterations-- > 0) { HUNDRED(r ^= iterations; s ^= r; r |= s;) } use_int(r); } void do_integer_add(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int i; register int a = pState->N + 57; while (iterations-- > 0) { for (i = 1; i < 1001; ++i) { #ifndef __GNUC__ /* required because of an HP ANSI/C compiler bug */ HUNDRED(a=(a+i)^a;) #else TEN(a=a+a+i;) #endif } } use_int(a); } void do_integer_mul(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int r = pState->N + 37431; register int s = pState->N + 4; register int t = r * s * s * s * s * s * s * s * s * s * s - r; while (iterations-- > 0) { TEN(r *= s;); r -= t; TEN(r *= s;); r -= t; } use_int(r); } void do_integer_div(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int r = pState->N + 36; register int s = (r + 1) << 20; while (iterations-- > 0) { HUNDRED(r = s / r;) } use_int(r); } void do_integer_mod(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int r = pState->N + iterations; register int s = pState->N + 62; while (iterations-- > 0) { HUNDRED(r %= s; r |= s;) } use_int(r); } void do_int64_bitwise(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int64 r = (int64)pState->N | ((int64)pState->N << 32); register int64 s = (int64)iterations | ((int64)iterations << 32); register int64 i = ((int64)iterations << 34) - 1; while (iterations-- > 0) { HUNDRED(r ^= i; s ^= r; r |= s;) i--; } use_int((int)r); } void do_int64_add(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int64 i; register int64 a = (int64)pState->N + 37420; register int64 b = (int64)pState->N + 21698324; a += (int64)(0xFE + pState->N)<<30; b += (int64)(0xFFFE + pState->N)<<29; while (iterations-- > 0) { for (i = 1; i < 1001; ++i) { #ifndef __GNUC__ /* required because of an HP ANSI/C compiler bug */ HUNDRED(a=(a+i)^a;) #else TEN(a=a+a+i;) #endif } } use_int((int)a+(int)b); } void do_int64_mul(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int64 r = (int64)pState->N + 37420; register int64 s = (int64)pState->N + 4; register int64 t; r += (int64)(pState->N + 6)<<32; t = r * s * s * s * s * s * s * s * s * s * s 
- r; while (iterations-- > 0) { TEN(r *= s;); r -= t; TEN(r *= s;); r -= t; } use_int((int)r); } void do_int64_div(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int64 r = (int64)pState->N + 36; register int64 s; r += r << 33; s = (r + 17) << 13; while (iterations-- > 0) { HUNDRED(r = s / r;) } use_int((int)r); } void do_int64_mod(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int64 r = iterations + ((int64)iterations << 32); register int64 s = (int64)pState->N + ((int64)pState->N << 56); while (iterations-- > 0) { HUNDRED(r %= s; r |= s;); } use_int((int)r); } void do_float_add(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register float f = (float)pState->N; register float g = (float)pState->K; while (iterations-- > 0) { TEN(f += (float)f;) f += (float)g; TEN(f += (float)f;) f += (float)g; } use_int((int)f); use_int((int)g); } void do_float_mul(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register float f = 8.0f * (float)pState->N; register float g = 0.125f * (float)pState->M / 1000.0; while (iterations-- > 0) { TEN(f *= f; f *= g;); TEN(f *= f; f *= g;); } use_int((int)f); use_int((int)g); } void do_float_div(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register float f = 1.41421356f * (float)pState->N; register float g = 3.14159265f * (float)pState->M / 1000.0; while (iterations-- > 0) { FIVE(TEN(f = g / f;) TEN(g = f / g;)) } use_int((int)f); use_int((int)g); } void do_double_add(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register double f = (double)pState->N; register double g = (double)pState->K; while (iterations-- > 0) { TEN(f += (double)f;) f += (double)g; TEN(f += (double)f;) f += (double)g; } use_int((int)f); use_int((int)g); } void do_double_mul(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register double f = 8.0 * (double)pState->N; register double g = 0.125 * (double)pState->M / 1000.0; while (iterations-- > 0) { TEN(f *= f; f *= g;) TEN(f *= f; f *= g;) } use_int((int)f); use_int((int)g); } void do_double_div(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register double f = 1.41421356 * (double)pState->N; register double g = 3.14159265 * (double)pState->M / 1000.0; while (iterations-- > 0) { FIVE(TEN(f = g / f;) TEN(g = f / g;)) } use_int((int)f); use_int((int)g); } void do_float_bogomflops(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int i; register int M = pState->M / 10; while (iterations-- > 0) { register float *x = (float*)pState->data; for (i = 0; i < M; ++i) { x[0] = (1.0f + x[0]) * (1.5f - x[0]) / x[0]; x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; x += 10; } } } void do_double_bogomflops(iter_t iterations, void* cookie) { struct _state *pState = (struct _state*)cookie; register int i; register int M = pState->M / 10; while (iterations-- > 0) { register double *x = (double*)pState->data; for (i = 0; i < M; ++i) { x[0] = (1.0f + x[0]) * (1.5f - x[0]) / 
x[0]; x[1] = (1.0f + x[1]) * (1.5f - x[1]) / x[1]; x[2] = (1.0f + x[2]) * (1.5f - x[2]) / x[2]; x[3] = (1.0f + x[3]) * (1.5f - x[3]) / x[3]; x[4] = (1.0f + x[4]) * (1.5f - x[4]) / x[4]; x[5] = (1.0f + x[5]) * (1.5f - x[5]) / x[5]; x[6] = (1.0f + x[6]) * (1.5f - x[6]) / x[6]; x[7] = (1.0f + x[7]) * (1.5f - x[7]) / x[7]; x[8] = (1.0f + x[8]) * (1.5f - x[8]) / x[8]; x[9] = (1.0f + x[9]) * (1.5f - x[9]) / x[9]; x += 10; } } } int main(int ac, char **av) { int c; int warmup = 0; int parallel = 1; int repetitions = -1; uint64 iop_time; uint64 iop_N; struct _state state; char *usage = "[-W <warmup>] [-N <repetitions>] [-P <parallel>] \n"; state.N = 1; state.M = 1000; state.K = -1023; state.data = NULL; while (( c = getopt(ac, av, "W:N:P:")) != EOF) { switch(c) { case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; default: lmbench_usage(ac, av, usage); break; } } benchmp(NULL, do_integer_bitwise, NULL, 0, 1, warmup, repetitions, &state); nano("integer bit", get_n() * 100 * 3); iop_time = gettime(); iop_N = get_n() * 100 * 3; benchmp(NULL, do_integer_add, NULL, 0, 1, warmup, repetitions, &state); #ifndef __GNUC__ settime(gettime() - (get_n() * 100000 * iop_time) / iop_N); nano("integer add", get_n() * 100000); #else nano("integer add", get_n() * 10000 * 2); #endif benchmp(NULL, do_integer_mul, NULL, 0, 1, warmup, repetitions, &state); settime(gettime() - (get_n() * 2 * iop_time) / iop_N); nano("integer mul", get_n() * 10 * 2); benchmp(NULL, do_integer_div, NULL, 0, 1, warmup, repetitions, &state); nano("integer div", get_n() * 100); benchmp(NULL, do_integer_mod, NULL, 0, 1, warmup, repetitions, &state); settime(gettime() - (get_n() * 100 * iop_time) / iop_N); nano("integer mod", get_n() * 100); benchmp(NULL, do_int64_bitwise, NULL, 0, 1, warmup, repetitions, &state); nano("int64 bit", get_n() * 100 * 3); iop_time = gettime(); iop_N = get_n() * 100 * 3; benchmp(NULL, do_int64_add, NULL, 0, 1, warmup, repetitions, &state); #ifndef __GNUC__ settime(gettime() - (get_n() * 100000 * iop_time) / iop_N); nano("uint64 add", get_n() * 100000); #else nano("uint64 add", get_n() * 10000 * 2); #endif benchmp(NULL, do_int64_mul, NULL, 0, 1, warmup, repetitions, &state); settime(gettime() - (get_n() * 2 * iop_time) / iop_N); nano("int64 mul", get_n() * 10 * 2); benchmp(NULL, do_int64_div, NULL, 0, 1, warmup, repetitions, &state); nano("int64 div", get_n() * 100); benchmp(NULL, do_int64_mod, NULL, 0, 1, warmup, repetitions, &state); settime(gettime() - (get_n() * 100 * iop_time) / iop_N); nano("int64 mod", get_n() * 100); benchmp(NULL, do_float_add, NULL, 0, 1, warmup, repetitions, &state); nano("float add", get_n() * (10 + 1) * 2); benchmp(NULL, do_float_mul, NULL, 0, 1, warmup, repetitions, &state); nano("float mul", get_n() * 10 * 2 * 2); benchmp(NULL, do_float_div, NULL, 0, 1, warmup, repetitions, &state); nano("float div", get_n() * 100); benchmp(NULL, do_double_add, NULL, 0, 1, warmup, repetitions, &state); nano("double add", get_n() * (10 + 1) * 2); benchmp(NULL, do_double_mul, NULL, 0, 1, warmup, repetitions, &state); nano("double mul", get_n() * 10 * 2 * 2); benchmp(NULL, do_double_div, NULL, 0, 1, warmup, repetitions, &state); nano("double div", get_n() * 100); benchmp(float_initialize, do_float_bogomflops, cleanup, 0, parallel, warmup, repetitions, &state); nano("float bogomflops", get_n() * state.M); fflush(stdout); fflush(stderr); benchmp(double_initialize, do_double_bogomflops, 
cleanup, 0, parallel, warmup, repetitions, &state); nano("double bogomflops", get_n() * state.M); fflush(stdout); fflush(stderr); return(0); } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_pagefault.c������������������������������������������������������������������0000664�0000764�0000764�00000010664�10715547567�016760� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_pagefault.c - time a page fault in * * Usage: lat_pagefault [-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #define CHK(x) if ((x) == -1) { perror("x"); exit(1); } typedef struct _state { int fd; int size; int npages; int clone; char* file; char* where; size_t* pages; } state_t; void initialize(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void benchmark(iter_t iterations, void * cookie); void benchmark_mmap(iter_t iterations, void * cookie); int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int c; double t_mmap; double t_combined; struct stat st; struct _state state; char buf[2048]; char* usage = "[-C] [-P <parallel>] [-W <warmup>] [-N <repetitions>] file\n"; state.clone = 0; while (( c = getopt(ac, av, "P:W:N:C")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'C': state.clone = 1; break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1 ) { lmbench_usage(ac, av, usage); } state.file = av[optind]; CHK(stat(state.file, &st)); state.npages = st.st_size / (size_t)getpagesize(); #ifdef MS_INVALIDATE benchmp(initialize, benchmark_mmap, cleanup, 0, parallel, warmup, repetitions, &state); t_mmap = gettime() / (double)get_n(); benchmp(initialize, benchmark, cleanup, 0, parallel, warmup, repetitions, &state); t_combined = gettime() / (double)get_n(); settime(get_n() * (t_combined - t_mmap)); sprintf(buf, "Pagefaults on %s", state.file); micro(buf, state.npages * get_n()); #endif return(0); } void initialize(iter_t iterations, void* cookie) { int pagesize; struct stat sbuf; state_t *state = (state_t *) cookie; if (iterations) return; if (state->clone) { char buf[128]; char* s; /* copy original file into a process-specific one */ sprintf(buf, "%d", (int)getpid()); s = (char*)malloc(strlen(state->file) + strlen(buf) + 1); if (!s) { perror("malloc"); exit(1); } sprintf(s, "%s%d", state->file, (int)getpid()); if (cp(state->file, s, S_IREAD|S_IWRITE) < 0) { perror("Could not copy file"); unlink(s); exit(1); } 
state->file = s; } CHK(state->fd = open(state->file, 0)); if (state->clone) unlink(state->file); CHK(fstat(state->fd, &sbuf)); srand(getpid()); pagesize = getpagesize(); state->size = sbuf.st_size; state->size -= state->size % pagesize; state->npages = state->size / pagesize; state->pages = permutation(state->npages, pagesize); if (state->size < 1024*1024) { fprintf(stderr, "lat_pagefault: %s too small\n", state->file); exit(1); } state->where = mmap(0, state->size, PROT_READ, MAP_SHARED, state->fd, 0); #ifdef MS_INVALIDATE if (msync(state->where, state->size, MS_INVALIDATE) != 0) { perror("msync"); exit(1); } #endif } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; munmap(state->where, state->size); if (state->fd >= 0) close(state->fd); free(state->pages); } void benchmark(iter_t iterations, void* cookie) { int i; int sum = 0; state_t *state = (state_t *) cookie; while (iterations-- > 0) { for (i = 0; i < state->npages; ++i) { sum += *(state->where + state->pages[i]); } munmap(state->where, state->size); state->where = mmap(0, state->size, PROT_READ, MAP_SHARED, state->fd, 0); #ifdef MS_INVALIDATE if (msync(state->where, state->size, MS_INVALIDATE) != 0) { perror("msync"); exit(1); } #endif } use_int(sum); } void benchmark_mmap(iter_t iterations, void* cookie) { int sum = 0; state_t *state = (state_t *) cookie; while (iterations-- > 0) { munmap(state->where, state->size); state->where = mmap(0, state->size, PROT_READ, MAP_SHARED, state->fd, 0); #ifdef MS_INVALIDATE if (msync(state->where, state->size, MS_INVALIDATE) != 0) { perror("msync"); exit(1); } #endif } use_int(sum); } ����������������������������������������������������������������������������lmbench-3.0-a9/src/lat_pipe.c�����������������������������������������������������������������������0000664�0000764�0000764�00000005730�10620624536�015726� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_pipe.c - pipe transaction test * * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
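 *
 * Two pipes connect the parent and a forked child; each iteration the
 * parent writes one byte down one pipe and reads one byte back from
 * the other while the child echoes it, so the reported "Pipe latency"
 * is the time for one complete round trip.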
*/ char *id = "$Id$\n"; #include "bench.h" void initialize(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void doit(iter_t iterations, void *cookie); void writer(int w, int r); typedef struct _state { int pid; int p1[2]; int p2[2]; } state_t; int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } state.pid = 0; benchmp(initialize, doit, cleanup, SHORT, parallel, warmup, repetitions, &state); micro("Pipe latency", get_n()); return (0); } void initialize(iter_t iterations, void* cookie) { char c; state_t * state = (state_t *)cookie; if (iterations) return; if (pipe(state->p1) == -1) { perror("pipe"); exit(1); } if (pipe(state->p2) == -1) { perror("pipe"); exit(1); } handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case 0: handle_scheduler(benchmp_childid(), 1, 1); signal(SIGTERM, exit); close(state->p1[1]); close(state->p2[0]); writer(state->p2[1], state->p1[0]); return; case -1: perror("fork"); return; default: close(state->p1[0]); close(state->p2[1]); break; } /* * One time around to make sure both processes are started. */ if (write(state->p1[1], &c, 1) != 1 || read(state->p2[0], &c, 1) != 1){ perror("(i) read/write on pipe"); exit(1); } } void cleanup(iter_t iterations, void* cookie) { state_t * state = (state_t *)cookie; if (iterations) return; if (state->pid) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); state->pid = 0; } } void doit(register iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; char c; register int w = state->p1[1]; register int r = state->p2[0]; register char *cptr = &c; while (iterations-- > 0) { if (write(w, cptr, 1) != 1 || read(r, cptr, 1) != 1) { perror("(r) read/write on pipe"); exit(1); } } } void writer(register int w, register int r) { char c; register char *cptr = &c; for ( ;; ) { if (read(r, cptr, 1) != 1 || write(w, cptr, 1) != 1) { perror("(w) read/write on pipe"); } } } ����������������������������������������lmbench-3.0-a9/src/lat_pmake.c����������������������������������������������������������������������0000664�0000764�0000764�00000007341�10715547567�016103� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_pmake.c - time to complete N jobs which each do usecs worth of work * * usage: lat_pipe [-P <parallelism>] [-W <warmup>] [-N <repetitions>] jobs usecs * * Copyright (c) 1994 Larry McVoy. * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" void setup(iter_t iterations, void* cookie); void bench(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void work(iter_t iterations, void *cookie); typedef struct _state { int jobs; /* number of jobs to create */ iter_t iterations; /* how long each job should work */ long* x; /* used by work() */ long** p; pid_t* pids; } state_t; int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; double time; uint64 usecs; char buf[1024]; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] Njobs usecs...\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (ac < optind + 2) { lmbench_usage(ac, av, usage); } state.jobs = atoi(av[optind]); state.pids = NULL; fprintf(stderr, "\"pmake jobs=%d\n", state.jobs); while (++optind < ac) { usecs = bytes(av[optind]); benchmp(setup, work, NULL, 0, 1, 0, TRIES, &state); if (gettime() == 0) exit(1); state.iterations = (iter_t)((usecs * get_n()) / gettime()); benchmp(setup, bench, NULL, 0, parallel, warmup, repetitions, &state); time = gettime(); time /= get_n(); if (time > 0.0) fprintf(stderr, "%llu %.2f\n", usecs, time); } return (0); } void setup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->x = (long*)malloc(sizeof(long*)); if (!state->x) { perror("malloc"); exit(1); } *(long**)state->x = state->x; state->p = (long**)state->x; handle_scheduler(benchmp_childid(), 0, state->jobs); } void bench(register iter_t iterations, void *cookie) { int i; int status; state_t *state = (state_t *) cookie; state->pids = (pid_t*)malloc(state->jobs * sizeof(pid_t)); if (!state->pids) { perror("malloc"); exit(2); } /* * This design has one buglet --- we cannot detect if the * worker process died prematurely. I.e., we don't have * a handshake step to collect "I finished correctly" * messages. 
*/ while (iterations-- > 0) { for (i = 0; i < state->jobs; ++i) { if ((state->pids[i] = fork()) == 0) { handle_scheduler(benchmp_childid(), i+1, state->jobs); work(state->iterations, state); exit(0); } } for (i = 0; i < state->jobs; ++i) { waitpid(state->pids[i], &status, 0); state->pids[i] = -1; /* child died badly */ if (!WIFEXITED(status)) { cleanup(0, cookie); exit(1); } } } } void cleanup(register iter_t iterations, void *cookie) { int i; state_t *state = (state_t *) cookie; for (i = 0; i < state->jobs; ++i) { if (state->pids[i] > 0) { kill(state->pids[i], SIGKILL); waitpid(state->pids[i], NULL, 0); state->pids[i] = -1; } } } void work(register iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; register long** p = state->p; #define WORK_TEN(one) one one one one one one one one one one while (iterations-- > 0) { WORK_TEN(p = (long**) *p;); } state->p = p; } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_proc.c�����������������������������������������������������������������������0000664�0000764�0000764�00000007516�10620624537�015741� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_proc.c - process creation tests * * Usage: lat_proc [-P <parallelism] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell * * TODO - linux clone, plan9 rfork, IRIX sproc(). * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
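 *
 * The four variants measure: "procedure" - a trivial call through the
 * benchmark harness, as a baseline; "fork" - fork() of a child that
 * exits immediately, plus the wait; "exec" - fork() followed by
 * execve() of the small hello program; and "shell" - fork() of
 * /bin/sh -c running that same program.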
*/ char *id = "$Id$\n"; #include "bench.h" #ifdef STATIC #define PROG "/tmp/hello-s" #define STATIC_PREFIX "Static " #else #define PROG "/tmp/hello" #define STATIC_PREFIX "" #endif void do_shell(iter_t iterations, void* cookie); void do_forkexec(iter_t iterations,void* cookie); void do_fork(iter_t iterations, void* cookie); void do_procedure(iter_t iterations, void* cookie); pid_t child_pid; void cleanup(iter_t iterations, void* cookie) { if (iterations) return; if (child_pid) { kill(child_pid, SIGKILL); waitpid(child_pid, NULL, 0); child_pid = 0; } } int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] procedure|fork|exec|shell\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 1 != ac) { /* should have one argument left */ lmbench_usage(ac, av, usage); } if (!strcmp("procedure", av[optind])) { benchmp(NULL, do_procedure, cleanup, 0, parallel, warmup, repetitions, &ac); micro("Procedure call", get_n()); } else if (!strcmp("fork", av[optind])) { benchmp(NULL, do_fork, cleanup, 0, parallel, warmup, repetitions, NULL); micro(STATIC_PREFIX "Process fork+exit", get_n()); } else if (!strcmp("exec", av[optind])) { benchmp(NULL, do_forkexec, cleanup, 0, parallel, warmup, repetitions, NULL); micro(STATIC_PREFIX "Process fork+execve", get_n()); } else if (!strcmp("shell", av[optind])) { benchmp(NULL, do_shell, cleanup, 0, parallel, warmup, repetitions, NULL); micro(STATIC_PREFIX "Process fork+/bin/sh -c", get_n()); } else { lmbench_usage(ac, av, usage); } return(0); } void do_shell(iter_t iterations, void* cookie) { signal(SIGCHLD, SIG_DFL); handle_scheduler(benchmp_childid(), 0, 1); while (iterations-- > 0) { switch (child_pid = fork()) { case -1: perror("fork"); exit(1); case 0: /* child */ handle_scheduler(benchmp_childid(), 1, 1); close(1); execlp("/bin/sh", "sh", "-c", PROG, 0); exit(1); default: waitpid(child_pid, NULL,0); } child_pid = 0; } } void do_forkexec(iter_t iterations, void* cookie) { char *nav[2]; signal(SIGCHLD, SIG_DFL); handle_scheduler(benchmp_childid(), 0, 1); while (iterations-- > 0) { nav[0] = PROG; nav[1] = 0; switch (child_pid = fork()) { case -1: perror("fork"); exit(1); case 0: /* child */ handle_scheduler(benchmp_childid(), 1, 1); close(1); execve(PROG, nav, 0); exit(1); default: waitpid(child_pid, NULL,0); } child_pid = 0; } } void do_fork(iter_t iterations, void* cookie) { signal(SIGCHLD, SIG_DFL); handle_scheduler(benchmp_childid(), 0, 1); while (iterations-- > 0) { switch (child_pid = fork()) { case -1: perror("fork"); exit(1); case 0: /* child */ handle_scheduler(benchmp_childid(), 1, 1); exit(1); default: waitpid(child_pid, NULL,0); } child_pid = 0; } } void do_procedure(iter_t iterations, void* cookie) { int r = *(int *) cookie; handle_scheduler(benchmp_childid(), 0, 1); while (iterations-- > 0) { use_int(r); } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_rand.c�����������������������������������������������������������������������0000664�0000764�0000764�00000005072�10620624537�015715� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_rand.c - random number generation * * usage: lat_rand [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Hewlett-Packard is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #ifdef HAVE_DRAND48 void bench_drand48(iter_t iterations, void *cookie); void bench_lrand48(iter_t iterations, void *cookie); #endif #ifdef HAVE_RAND void bench_rand(iter_t iterations, void *cookie); #endif #ifdef HAVE_RANDOM void bench_random(iter_t iterations, void *cookie); #endif int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } #ifdef HAVE_DRAND48 benchmp(NULL, bench_drand48, NULL, 0, parallel, warmup, repetitions, NULL); nano("drand48 latency", get_n()); benchmp(NULL, bench_lrand48, NULL, 0, parallel, warmup, repetitions, NULL); nano("lrand48 latency", get_n()); #endif #ifdef HAVE_RAND benchmp(NULL, bench_rand, NULL, 0, parallel, warmup, repetitions, NULL); nano("rand latency", get_n()); #endif #ifdef HAVE_RANDOM benchmp(NULL, bench_random, NULL, 0, parallel, warmup, repetitions, NULL); nano("random latency", get_n()); #endif return (0); } #ifdef HAVE_DRAND48 void bench_drand48(register iter_t iterations, void *cookie) { register double v = 0.0; while (iterations-- > 0) { v += drand48(); } use_int((int)v); } void bench_lrand48(register iter_t iterations, void *cookie) { register long v = 0.0; while (iterations-- > 0) { v += lrand48(); } use_int((int)v); } #endif /* HAVE_DRAND48 */ #ifdef HAVE_RAND void bench_rand(register iter_t iterations, void *cookie) { register int v = 0.0; while (iterations-- > 0) { v += rand(); } use_int((int)v); } #endif /* HAVE_RAND */ #ifdef HAVE_RANDOM void bench_random(register iter_t iterations, void *cookie) { register int v = 0.0; while (iterations-- > 0) { v += random(); } use_int((int)v); } #endif /* HAVE_RANDOM */ ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_rpc.c������������������������������������������������������������������������0000664�0000764�0000764�00000013655�10620624537�015563� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_rpc.c - simple RPC transaction latency test * * Four programs in one - * server usage: lat_rpc -s * client usage: lat_rpc hostname * client usage: lat_rpc -p tcp hostname * client usage: lat_rpc -p udp hostname * shutdown: lat_rpc -S hostname * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" void client_main(int ac, char **av); void server_main(); void benchmark(iter_t iterations, void* _state); char *client_rpc_xact_1(char *argp, CLIENT *clnt); void doit(CLIENT *cl, char *server) { char c = 1; char *resp; resp = client_rpc_xact_1(&c, cl); if (!resp) { clnt_perror(cl, server); exit(1); } if (*resp != 123) { fprintf(stderr, "lat_rpc: got bad data\n"); exit(1); } } /* Default timeout can be changed using clnt_control() */ static struct timeval TIMEOUT = { 0, 25000 }; char *proto[] = { "tcp", "udp", 0 }; typedef struct state_ { int msize; char *server; char *protocol; CLIENT *cl; } state_t; void initialize(iter_t iterations, void* cookie) { struct timeval tv; state_t *state = (state_t*)cookie; if (iterations) return; state->cl = clnt_create(state->server, XACT_PROG, XACT_VERS, state->protocol); if (!state->cl) { clnt_pcreateerror(state->server); exit(1); } if (strcasecmp(state->protocol, proto[1]) == 0) { tv.tv_sec = 0; tv.tv_usec = 2500; if (!clnt_control(state->cl, CLSET_RETRY_TIMEOUT, (char *)&tv)) { clnt_perror(state->cl, "setting timeout"); exit(1); } } } void benchmark(iter_t iterations, void* _state) { state_t* state = (state_t*)_state; while (iterations-- > 0) { doit(state->cl, state->server); } } int main(int ac, char **av) { int c; int parallel = 1; int warmup = 0; int repetitions = -1; state_t state; CLIENT *cl; char buf[1024]; char *protocol = NULL; char *usage = "-s\n OR [-p <tcp|udp>] [-P parallel] [-W <warmup>] [-N <repetitions>] serverhost\n OR -S serverhost\n"; state.msize = 1; while (( c = getopt(ac, av, "sS:m:p:P:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); case 'S': /* shutdown serverhost */ { cl = clnt_create(optarg, XACT_PROG, XACT_VERS, "udp"); if (!cl) { clnt_pcreateerror(state.server); exit(1); } clnt_call(cl, RPC_EXIT, (xdrproc_t)xdr_void, 0, (xdrproc_t)xdr_void, 0, TIMEOUT); exit(0); } case 'm': state.msize = atoi(optarg); break; case 'p': protocol = optarg; break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1) { lmbench_usage(ac, av, usage); } state.server = av[optind++]; if (protocol == NULL || !strcasecmp(protocol, proto[0])) { state.protocol = proto[0]; benchmp(initialize, benchmark, NULL, MEDIUM, parallel, warmup, repetitions, &state); sprintf(buf, "RPC/%s latency using %s", proto[0], state.server); micro(buf, get_n()); 
} if (protocol == NULL || !strcasecmp(protocol, proto[1])) { state.protocol = proto[1]; benchmp(initialize, benchmark, NULL, MEDIUM, parallel, warmup, repetitions, &state); sprintf(buf, "RPC/%s latency using %s", proto[1], state.server); micro(buf, get_n()); } exit(0); } char * client_rpc_xact_1(char *argp, CLIENT *clnt) { static char res; bzero((void*)&res, sizeof(res)); if (clnt_call(clnt, RPC_XACT, (xdrproc_t)xdr_char, argp, (xdrproc_t)xdr_char, &res, TIMEOUT) != RPC_SUCCESS) { return (NULL); } return (&res); } /* * The remote procedure[s] that will be called */ /* ARGSUSED */ char * rpc_xact_1(msg, transp) char *msg; register SVCXPRT *transp; { static char r = 123; return &r; } static void xact_prog_1(); void server_main() { register SVCXPRT *transp; GO_AWAY; (void) pmap_unset(XACT_PROG, XACT_VERS); transp = svcudp_create(RPC_ANYSOCK); if (transp == NULL) { fprintf(stderr, "cannot create udp service.\n"); exit(1); } if (!svc_register(transp, XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_UDP)) { fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, udp).\n"); exit(1); } transp = svctcp_create(RPC_ANYSOCK, 0, 0); if (transp == NULL) { fprintf(stderr, "cannot create tcp service.\n"); exit(1); } if (!svc_register(transp, XACT_PROG, XACT_VERS, xact_prog_1, IPPROTO_TCP)) { fprintf(stderr, "unable to register (XACT_PROG, XACT_VERS, tcp).\n"); exit(1); } svc_run(); fprintf(stderr, "svc_run returned\n"); exit(1); /* NOTREACHED */ } static void xact_prog_1(rqstp, transp) struct svc_req *rqstp; register SVCXPRT *transp; { union { char rpc_xact_1_arg; } argument; char *result; bool_t (*xdr_argument)(), (*xdr_result)(); char *(*local)(); switch (rqstp->rq_proc) { case NULLPROC: (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); return; case RPC_XACT: xdr_argument = xdr_char; xdr_result = xdr_char; local = (char *(*)()) rpc_xact_1; break; case RPC_EXIT: (void) svc_sendreply(transp, (xdrproc_t)xdr_void, (char *)NULL); (void) pmap_unset(XACT_PROG, XACT_VERS); exit(0); default: svcerr_noproc(transp); return; } bzero((char *)&argument, sizeof(argument)); if (!svc_getargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { svcerr_decode(transp); return; } result = (*local)(&argument, rqstp); if (result != NULL && !svc_sendreply(transp, (xdrproc_t)xdr_result, result)) { svcerr_systemerr(transp); } if (!svc_freeargs(transp, (xdrproc_t)xdr_argument, (char*)&argument)) { fprintf(stderr, "unable to free arguments\n"); exit(1); } return; } �����������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_select.c���������������������������������������������������������������������0000664�0000764�0000764�00000010566�10620624537�016254� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_select.c - time select system call * * usage: lat_select [-P <parallelism>] [-W <warmup>] [-N <repetitions>] [n] * * Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
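 *
 * The benchmark opens one descriptor (a temporary file for "file", a
 * TCP connection to a local helper server for "tcp"), dup()s it until
 * n descriptors sit in an fd_set, and then times select() over that
 * set with a zero timeout.  Illustrative invocations:
 *
 *	lat_select -n 100 file
 *	lat_select -n 100 tcp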
*/ char *id = "$Id$\n"; #include "bench.h" void initialize(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void doit(iter_t iterations, void *cookie); void writer(int w, int r); void server(void* cookie); typedef int (*open_f)(void* cookie); int open_file(void* cookie); int open_socket(void* cookie); typedef struct _state { char fname[L_tmpnam]; open_f fid_f; pid_t pid; int sock; int fid; int num; int max; fd_set set; } state_t; int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-n <#descriptors>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] file|tcp\n"; char buf[256]; morefds(); /* bump fd_cur to fd_max */ state.num = 200; while (( c = getopt(ac, av, "P:W:N:n:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; case 'n': state.num = bytes(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 1 != ac) { lmbench_usage(ac, av, usage); } if (streq("tcp", av[optind])) { state.fid_f = open_socket; server(&state); benchmp(initialize, doit, cleanup, 0, parallel, warmup, repetitions, &state); sprintf(buf, "Select on %d tcp fd's", state.num); kill(state.pid, SIGKILL); waitpid(state.pid, NULL, 0); micro(buf, get_n()); } else if (streq("file", av[optind])) { state.fid_f = open_file; server(&state); benchmp(initialize, doit, cleanup, 0, parallel, warmup, repetitions, &state); unlink(state.fname); sprintf(buf, "Select on %d fd's", state.num); micro(buf, get_n()); } else { lmbench_usage(ac, av, usage); } exit(0); } void server(void* cookie) { int pid; state_t* state = (state_t*)cookie; pid = getpid(); state->pid = 0; if (state->fid_f == open_file) { /* Create a temporary file for clients to open */ sprintf(state->fname, "lat_selectXXXXXX"); state->fid = mkstemp(state->fname); if (state->fid <= 0) { char buf[L_tmpnam+128]; sprintf(buf, "lat_select: Could not create temp file %s", state->fname); perror(buf); exit(1); } close(state->fid); return; } /* Create a socket for clients to connect to */ state->sock = tcp_server(TCP_SELECT, SOCKOPT_REUSE); if (state->sock <= 0) { perror("lat_select: Could not open tcp server socket"); exit(1); } /* Start a server process to accept client connections */ switch(state->pid = fork()) { case 0: /* child server process */ while (pid == getppid()) { int newsock = tcp_accept(state->sock, SOCKOPT_NONE); read(newsock, &state->fid, 1); close(newsock); } exit(0); case -1: /* error */ perror("lat_select::server(): fork() failed"); exit(1); default: break; } } int open_socket(void* cookie) { return tcp_connect("localhost", TCP_SELECT, SOCKOPT_NONE); } int open_file(void* cookie) { state_t* state = (state_t*)cookie; return open(state->fname, O_RDONLY); } void doit(iter_t iterations, void * cookie) { state_t * state = (state_t *)cookie; fd_set nosave; static struct timeval tv; tv.tv_sec = 0; tv.tv_usec = 0; while (iterations-- > 0) { nosave = state->set; select(state->num, 0, &nosave, 0, &tv); } } void initialize(iter_t iterations, void *cookie) { state_t * state = (state_t *)cookie; int n; int N = state->num, fid, fd; if (iterations) return; fid = (*state->fid_f)(cookie); if (fid <= 0) { perror("Could not open device"); exit(1); } state->max = 0; FD_ZERO(&(state->set)); for (n = 0; n < N; n++) { fd = dup(fid); if (fd == -1) break; if (fd > state->max) state->max = fd; FD_SET(fd, &(state->set)); } 
state->max++; close(fid); if (n != N) exit(1); } void cleanup(iter_t iterations, void *cookie) { int i; state_t * state = (state_t *)cookie; if (iterations) return; for (i = 0; i <= state->max; ++i) { if (FD_ISSET(i, &(state->set))) close(i); } FD_ZERO(&(state->set)); } ������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_sem.c������������������������������������������������������������������������0000664�0000764�0000764�00000006052�10620624537�015554� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_sem.c - semaphore test * * usage: lat_sem [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #include <sys/sem.h> void initialize(iter_t iterations, void *cookie); void cleanup(iter_t iterations, void *cookie); void doit(iter_t iterations, void *cookie); void writer(int sid); typedef struct _state { int pid; int semid; } state_t; int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } state.pid = 0; benchmp(initialize, doit, cleanup, SHORT, parallel, warmup, repetitions, &state); micro("Semaphore latency", get_n() * 2); return (0); } void initialize(iter_t iterations, void* cookie) { char c; state_t * state = (state_t *)cookie; if (iterations) return; state->semid = semget(IPC_PRIVATE, 2, IPC_CREAT | IPC_EXCL | 0600); semctl(state->semid, 0, SETVAL, 0); semctl(state->semid, 1, SETVAL, 0); handle_scheduler(benchmp_childid(), 0, 1); switch (state->pid = fork()) { case 0: signal(SIGTERM, exit); handle_scheduler(benchmp_childid(), 1, 1); writer(state->semid); return; case -1: perror("fork"); return; default: break; } } void cleanup(iter_t iterations, void* cookie) { state_t * state = (state_t *)cookie; if (iterations) return; if (state->pid) { kill(state->pid, SIGKILL); waitpid(state->pid, NULL, 0); state->pid = 0; } /* free the semaphores */ semctl(state->semid, 0, IPC_RMID); } void doit(register iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; struct sembuf sop[2]; sop[0].sem_num = 1; sop[0].sem_op = -1; sop[0].sem_flg = 0; sop[1].sem_num = 0; sop[1].sem_op = 1; sop[1].sem_flg = 0; while (iterations-- > 0) { if (semop(state->semid, sop, 2) < 0) { perror("(r) error on semaphore"); exit(1); } } } void writer(register int sid) { struct sembuf sop[2]; sop[0].sem_num = 1; sop[0].sem_op = 1; sop[0].sem_flg = 0; if (semop(sid, 
sop, 1) < 0) { perror("(w) error on initial semaphore"); exit(1); } sop[0].sem_num = 0; sop[0].sem_op = -1; sop[0].sem_flg = 0; sop[1].sem_num = 1; sop[1].sem_op = 1; sop[1].sem_flg = 0; for ( ;; ) { if (semop(sid, sop, 2) < 0) { perror("(w) error on semaphore"); exit(1); } } } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_sig.c������������������������������������������������������������������������0000664�0000764�0000764�00000010624�10620624540�015544� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_sig.c - signal handler test * * XXX - this benchmark requires the POSIX sigaction interface. The reason * for that is that the signal handler stays installed with that interface. * The more portable signal() interface may or may not stay installed and * reinstalling it each time is expensive. * * XXX - should really do a two process version. * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
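 *
 * Three measurements are provided: "install" times sigaction() itself,
 * "catch" times delivering SIGUSR1 to ourselves with a handler
 * installed (less the cost of a bare kill(pid, 0)), and "prot" times
 * taking a write fault on a read-only mmap()ed page (less the "catch"
 * cost).  Illustrative invocations:
 *
 *	lat_sig install
 *	lat_sig catch
 *	lat_sig prot <some readable file>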
*/ char *id = "$Id$\n"; #include "bench.h" #include <setjmp.h> uint64 caught, n; double adj; void handler(int s) { } jmp_buf prot_env; void do_send(iter_t iterations, void* cookie) { int me = getpid(); while (--iterations > 0) { kill(me, 0); } } void do_install(iter_t iterations, void* cookie) { struct sigaction sa, old; while (iterations-- > 0) { sa.sa_handler = handler; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGUSR1, &sa, &old); } } void do_catch(iter_t iterations, void* cookie) { int me = getpid(); struct sigaction sa, old; sa.sa_handler = handler; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGUSR1, &sa, &old); while (--iterations > 0) { kill(me, SIGUSR1); } } struct _state { char* fname; char* where; }; void prot(int s) { if (++caught == n) { caught = 0; n = benchmp_interval(benchmp_getstate()); } } void initialize(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; int fd; struct sigaction sa; if (iterations) return; fd = open(state->fname, 0); state->where = mmap(0, 4096, PROT_READ, MAP_SHARED, fd, 0); if ((long)state->where == -1) { perror("mmap"); exit(1); } sa.sa_handler = prot; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGSEGV, &sa, 0); sigaction(SIGBUS, &sa, 0); } void do_prot(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; n = iterations; caught = 0; /* start the first timing interval */ start(0); /* trigger the page fault, causing us to jump to prot() */ *state->where = 1; } /* * Cost of catching the signal less the cost of sending it */ void bench_catch(int parallel, int warmup, int repetitions) { uint64 send_usecs, send_n; /* measure cost of sending signal */ benchmp(NULL, do_send, NULL, 0, parallel, warmup, repetitions, NULL); send_usecs = gettime(); send_n = get_n(); /* measure cost of sending & catching signal */ benchmp(NULL, do_catch, NULL, 0, parallel, warmup, repetitions, NULL); /* subtract cost of sending signal */ if (gettime() > (send_usecs * get_n()) / send_n) { settime(gettime() - (send_usecs * get_n()) / send_n); } else { settime(0); } } void bench_prot(char* fname, int parallel, int warmup, int repetitions) { uint64 catch_usecs, catch_n; struct _state state; state.fname = fname; /* * Catch protection faults. * Assume that they will cost the same as a normal catch. 
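	 * The reported figure is therefore the measured fault-handling
	 * time minus the signal-catch estimate scaled to the same
	 * iteration count, clamped to zero if the subtraction would go
	 * negative.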
*/ bench_catch(parallel, warmup, repetitions); catch_usecs = gettime(); catch_n = get_n(); benchmp(initialize, do_prot, NULL, 0, parallel, warmup, repetitions, &state); if (gettime() > (catch_usecs * get_n()) / catch_n) { settime(gettime() - (catch_usecs * get_n()) / catch_n); } else { settime(0); } } int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int c; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] install|catch|prot [file]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1 && optind != ac - 2) { lmbench_usage(ac, av, usage); } if (!strcmp("install", av[optind])) { benchmp(NULL, do_install, NULL, 0, parallel, warmup, repetitions, NULL); micro("Signal handler installation", get_n()); } else if (!strcmp("catch", av[optind])) { bench_catch(parallel, warmup, repetitions); micro("Signal handler overhead", get_n()); } else if (!strcmp("prot", av[optind]) && optind == ac - 2) { bench_prot(av[optind+1], parallel, warmup, repetitions); micro("Protection fault", get_n()); } else { lmbench_usage(ac, av, usage); } return(0); } ������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_syscall.c��������������������������������������������������������������������0000664�0000764�0000764�00000007127�10620624540�016440� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_syscall.c - time simple system calls * * Copyright (c) 1996 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
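 *
 * usage: lat_syscall [-P <parallelism>] [-W <warmup>] [-N <repetitions>]
 *	  null|read|write|stat|fstat|open [file]
 *
 * "null" times getppid(), "read" reads one byte from /dev/zero,
 * "write" writes one byte to /dev/null, and stat/fstat/open operate on
 * the named file (default /usr/include/sys/types.h).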
*/ char *id = "$Id: s.lat_syscall.c 1.11 97/06/15 22:38:58-07:00 lm $\n"; #include "bench.h" #define FNAME "/usr/include/sys/types.h" struct _state { int fd; char* file; }; void do_getppid(iter_t iterations, void *cookie) { while (iterations-- > 0) { getppid(); } } void do_write(iter_t iterations, void *cookie) { struct _state *pState = (struct _state*)cookie; char c; while (iterations-- > 0) { if (write(pState->fd, &c, 1) != 1) { perror("/dev/null"); return; } } } void do_read(iter_t iterations, void *cookie) { struct _state *pState = (struct _state*)cookie; char c; while (iterations-- > 0) { if (read(pState->fd, &c, 1) != 1) { perror("/dev/zero"); return; } } } void do_stat(iter_t iterations, void *cookie) { struct _state *pState = (struct _state*)cookie; struct stat sbuf; while (iterations-- > 0) { if (stat(pState->file, &sbuf) == -1) { perror(pState->file); return; } } } void do_fstat(iter_t iterations, void *cookie) { struct _state *pState = (struct _state*)cookie; struct stat sbuf; while (iterations-- > 0) { if (fstat(pState->fd, &sbuf) == -1) { perror("fstat"); return; } } } void do_openclose(iter_t iterations, void *cookie) { struct _state *pState = (struct _state*)cookie; int fd; while (iterations-- > 0) { fd = open(pState->file, 0); if (fd == -1) { perror(pState->file); return; } close(fd); } } int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; int c; struct _state state; char* usage = "[-P <parallelism>] [-W <warmup>] [-N <repetitions>] null|read|write|stat|fstat|open [file]\n"; while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1 && optind != ac - 2 ) { lmbench_usage(ac, av, usage); } state.file = FNAME; if (optind == ac - 2) state.file = av[optind + 1]; if (!strcmp("null", av[optind])) { benchmp(NULL, do_getppid, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple syscall", get_n()); } else if (!strcmp("write", av[optind])) { state.fd = open("/dev/null", 1); benchmp(NULL, do_write, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple write", get_n()); close(state.fd); } else if (!strcmp("read", av[optind])) { state.fd = open("/dev/zero", 0); if (state.fd == -1) { fprintf(stderr, "Simple read: -1\n"); return(1); } benchmp(NULL, do_read, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple read", get_n()); close(state.fd); } else if (!strcmp("stat", av[optind])) { benchmp(NULL, do_stat, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple stat", get_n()); } else if (!strcmp("fstat", av[optind])) { state.fd = open(state.file, 0); benchmp(NULL, do_fstat, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple fstat", get_n()); close(state.fd); } else if (!strcmp("open", av[optind])) { benchmp(NULL, do_openclose, NULL, 0, parallel, warmup, repetitions, &state); micro("Simple open/close", get_n()); } else { lmbench_usage(ac, av, usage); } return(0); } 
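/*
 * Illustrative sketch, not part of the benchmark proper: one more
 * trivial case written in the same style as do_getppid() above.  Note
 * that some C libraries cache getpid(), so on such systems this would
 * not measure a real kernel entry.  Wiring it up would mirror the
 * "null" branch of main():
 *
 *	benchmp(NULL, do_getpid, NULL, 0, parallel, warmup,
 *		repetitions, &state);
 *	micro("Simple getpid", get_n());
 */
void
do_getpid(iter_t iterations, void *cookie)
{
	while (iterations-- > 0) {
		getpid();
	}
}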
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_tcp.c������������������������������������������������������������������������0000664�0000764�0000764�00000007012�10715547567�015567� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_tcp.c - simple TCP transaction latency test * * Three programs in one - * server usage: tcp_xact -s * client usage: tcp_xact [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname * shutdown: tcp_xact -S hostname * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" typedef struct _state { int msize; int sock; char *server; char *buf; } state_t; void init(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); void doclient(iter_t iterations, void* cookie); void server_main(); void doserver(int sock); int main(int ac, char **av) { state_t state; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char buf[256]; char *usage = "-s\n OR [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n OR -S server\n"; state.msize = 1; while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); case 'S': /* shutdown serverhost */ state.sock = tcp_connect(optarg, TCP_XACT, SOCKOPT_NONE); close(state.sock); exit(0); case 'm': state.msize = atoi(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1) { lmbench_usage(ac, av, usage); } state.server = av[optind]; benchmp(init, doclient, cleanup, MEDIUM, parallel, warmup, repetitions, &state); sprintf(buf, "TCP latency using %s", state.server); micro(buf, get_n()); exit(0); } void init(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; int msize = htonl(state->msize); if (iterations) return; state->sock = tcp_connect(state->server, TCP_XACT, SOCKOPT_NONE); state->buf = malloc(state->msize); if (!state->buf) { perror("malloc"); exit(1); } write(state->sock, &msize, sizeof(int)); } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; close(state->sock); free(state->buf); } void doclient(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; int sock = state->sock; while (iterations-- > 0) { write(sock, state->buf, state->msize); read(sock, state->buf, state->msize); } } void 
server_main() { int newsock, sock; GO_AWAY; signal(SIGCHLD, sigchld_wait_handler); sock = tcp_server(TCP_XACT, SOCKOPT_REUSE); for (;;) { newsock = tcp_accept(sock, SOCKOPT_NONE); switch (fork()) { case -1: perror("fork"); break; case 0: doserver(newsock); exit(0); default: close(newsock); break; } } /* NOTREACHED */ } void doserver(int sock) { int n; if (read(sock, &n, sizeof(int)) == sizeof(int)) { int msize = ntohl(n); char* buf = (char*)malloc(msize); if (!buf) { perror("malloc"); exit(4); } for (n = 0; read(sock, buf, msize) > 0; n++) { write(sock, buf, msize); } free(buf); } else { /* * A connection with no data means shut down. */ tcp_done(TCP_XACT); kill(getppid(), SIGTERM); exit(0); } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_udp.c������������������������������������������������������������������������0000664�0000764�0000764�00000010363�10715547567�015574� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * udp_xact.c - simple UDP transaction latency test * * Three programs in one - * server usage: lat_udp -s * client usage: lat_udp [-P <parallelism>] [-W <warmup>] [-N <repetitions>] hostname * shutdown: lat_udp -S hostname * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
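 *
 * Each transaction sends an msize-byte datagram (default and minimum
 * 4 bytes) whose first word is a sequence number in network byte
 * order; the server echoes a datagram of the same size back.  A
 * negative sequence number tells the server to exit, which is what
 * the "-S" shutdown path sends.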
*/ char *id = "$Id$\n"; #include "bench.h" #define MAX_MSIZE (10 * 1024 * 1024) void client_main(int ac, char **av); void server_main(); void timeout(); void init(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); void doit(iter_t iterations, void* cookie); typedef struct _state { int sock; int seq; int msize; char *server; char *buf; } state_t; int main(int ac, char **av) { state_t state; int c; int parallel = 1; int warmup = 0; int repetitions = -1; int msize = 4; char buf[256]; char *usage = "-s\n OR [-S] [-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] server\n NOTE: message size must be >= 4\n"; if (sizeof(int) != 4) { fprintf(stderr, "lat_udp: Wrong sequence size\n"); return(1); } while (( c = getopt(ac, av, "sS:m:P:W:N:")) != EOF) { switch(c) { case 's': /* Server */ if (fork() == 0) { server_main(); } exit(0); case 'S': /* shutdown serverhost */ { int seq, n; int sock = udp_connect(optarg, UDP_XACT, SOCKOPT_NONE); for (n = -1; n > -5; --n) { seq = htonl(n); (void) send(sock, &seq, sizeof(int), 0); } close(sock); exit (0); } case 'm': msize = atoi(optarg); if (msize < sizeof(int)) { lmbench_usage(ac, av, usage); msize = 4; } if (msize > MAX_MSIZE) { lmbench_usage(ac, av, usage); msize = MAX_MSIZE; } break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind + 1 != ac) { lmbench_usage(ac, av, usage); } state.server = av[optind]; state.msize = msize; benchmp(init, doit, cleanup, SHORT, parallel, warmup, repetitions, &state); sprintf(buf, "UDP latency using %s", state.server); micro(buf, get_n()); exit(0); } void init(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; state->sock = udp_connect(state->server, UDP_XACT, SOCKOPT_NONE); state->seq = 0; state->buf = (char*)malloc(state->msize); if (!state->buf) { perror("malloc"); exit(1); } signal(SIGALRM, timeout); alarm(15); } void doit(iter_t iterations, void *cookie) { state_t *state = (state_t *) cookie; int seq = state->seq; int sock = state->sock; alarm(15); while (iterations-- > 0) { *(int*)state->buf = htonl(seq++); if (send(sock, state->buf, state->msize, 0) != state->msize) { perror("lat_udp client: send failed"); exit(5); } if (recv(sock, state->buf, state->msize, 0) != state->msize) { perror("lat_udp client: recv failed"); exit(5); } } state->seq = seq; } void cleanup(iter_t iterations, void* cookie) { state_t *state = (state_t *) cookie; if (iterations) return; close(state->sock); free(state->buf); } void timeout() { fprintf(stderr, "Recv timed out\n"); exit(1); } void server_main() { char *buf = (char*)valloc(MAX_MSIZE); int sock, sent, seq = 0; socklen_t namelen; struct sockaddr_in it; if (!buf) { perror("malloc"); exit(4); } GO_AWAY; sock = udp_server(UDP_XACT, SOCKOPT_REUSE); while (1) { int nbytes; namelen = sizeof(it); if ((nbytes = recvfrom(sock, (void*)buf, MAX_MSIZE, 0, (struct sockaddr*)&it, &namelen)) < 0) { fprintf(stderr, "lat_udp server: recvfrom: got wrong size\n"); exit(9); } sent = ntohl(*(int*)buf); if (sent < 0) { udp_done(UDP_XACT); exit(0); } if (sent != ++seq) { seq = sent; } *(int*)buf = htonl(seq); if (sendto(sock, (void*)buf, nbytes, 0, (struct sockaddr*)&it, sizeof(it)) < 0) { perror("lat_udp sendto"); exit(9); } } } 
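/*
 * Rough standalone sketch (illustration only; the real client above
 * lets the benchmp() harness pick the iteration count and handle
 * timing, warmup and parallelism): hand-timing n round trips against
 * a server started elsewhere with "lat_udp -s".  udp_connect(),
 * UDP_XACT and SOCKOPT_NONE are the same lmbench library helpers used
 * by init(); the function name is made up for this example.
 */
void
example_udp_roundtrips(char *server, int n)
{
	struct timeval before, after;
	double usecs;
	int sock = udp_connect(server, UDP_XACT, SOCKOPT_NONE);
	int i, msg;

	gettimeofday(&before, (struct timezone *)0);
	for (i = 1; i <= n; ++i) {
		msg = htonl(i);		/* sequence number, network order */
		send(sock, &msg, sizeof(int), 0);
		recv(sock, &msg, sizeof(int), 0);
	}
	gettimeofday(&after, (struct timezone *)0);
	usecs = 1000000. * (after.tv_sec - before.tv_sec) +
		(after.tv_usec - before.tv_usec);
	fprintf(stderr, "%.2f microseconds per UDP round trip\n", usecs / n);
	close(sock);
}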
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_unix.c�����������������������������������������������������������������������0000664�0000764�0000764�00000005464�10620624541�015754� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * tcp_unix.c - simple UNIX socket transaction latency test * * lat_unix [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * * Copyright (c) 1994-2000 Carl Staelin and Larry McVoy. * Distributed under the FSF GPL with additional restriction that * results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" struct _state { int sv[2]; int pid; int msize; char* buf; }; void initialize(iter_t iterations, void* cookie); void benchmark(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; struct _state state; int c; char* usage = "[-m <message size>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n"; state.msize = 1; state.pid = 0; while (( c = getopt(ac, av, "m:P:W:N:")) != EOF) { switch(c) { case 'm': state.msize = atoi(optarg); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind < ac) { lmbench_usage(ac, av, usage); } benchmp(initialize, benchmark, cleanup, 0, parallel, warmup, repetitions, &state); micro("AF_UNIX sock stream latency", get_n()); return(0); } void initialize(iter_t iterations, void* cookie) { struct _state* pState = (struct _state*)cookie; if (iterations) return; if (socketpair(AF_UNIX, SOCK_STREAM, 0, pState->sv) == -1) { perror("socketpair"); } pState->buf = malloc(pState->msize); if (pState->buf == NULL) { fprintf(stderr, "buffer allocation\n"); exit(1); } handle_scheduler(benchmp_childid(), 0, 1); if (pState->pid = fork()) return; handle_scheduler(benchmp_childid(), 1, 1); /* Child sits and ping-pongs packets back to parent */ signal(SIGTERM, exit); while (read(pState->sv[0], pState->buf, pState->msize) == pState->msize) { write(pState->sv[0], pState->buf, pState->msize); } exit(0); } void benchmark(iter_t iterations, void* cookie) { struct _state* pState = (struct _state*)cookie; while (iterations-- > 0) { if (write(pState->sv[1], pState->buf, pState->msize) != pState->msize || read(pState->sv[1], pState->buf, pState->msize) != pState->msize) { /* error handling: how do we signal failure? 
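			 * For now we simply tear the child down via
			 * cleanup() and exit(0), so the harness just sees
			 * this worker finish early.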
*/ cleanup(0, cookie); exit(0); } } } void cleanup(iter_t iterations, void* cookie) { struct _state* pState = (struct _state*)cookie; if (iterations) return; if (pState->pid) { kill(pState->pid, SIGKILL); waitpid(pState->pid, NULL, 0); pState->pid = 0; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_unix_connect.c���������������������������������������������������������������0000664�0000764�0000764�00000004127�10620624541�017460� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_unix_connect.c - simple UNIX connection latency test * * Three programs in one - * server usage: lat_connect -s * client usage: lat_connect [-P <parallelism>] [-W <warmup>] [-N <repetitions>] * shutdown: lat_connect -q * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #define CONNAME "/tmp/af_unix" void server_main(void); void benchmark(iter_t iterations, void* cookie) { while (iterations-- > 0) { int sock = unix_connect(CONNAME); if (sock <= 0) printf("error on iteration %lu\n",iterations); close(sock); } } int main(int ac, char **av) { int parallel = 1; int warmup = 0; int repetitions = -1; char *usage = "-s\n OR [-P <parallelism>] [-W <warmup>] [-N <repetitions>]\n OR -S\n"; int c; /* Start the server "-s" or Shut down the server "-S" */ if (ac == 2) { if (!strcmp(av[1], "-s")) { if (fork() == 0) { server_main(); } exit(0); } if (!strcmp(av[1], "-S")) { int sock = unix_connect(CONNAME); write(sock, "0", 1); close(sock); exit(0); } } /* * Rest is client */ while (( c = getopt(ac, av, "P:W:N:")) != EOF) { switch(c) { case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac) { lmbench_usage(ac, av, usage); } benchmp(NULL, benchmark, NULL, 0, parallel, warmup, repetitions, NULL); micro("UNIX connection cost", get_n()); } void server_main(void) { int newsock, sock; char c; GO_AWAY; sock = unix_server(CONNAME); for (;;) { newsock = unix_accept(sock); c = 0; read(newsock, &c, 1); if (c && c == '0') { unix_done(sock, CONNAME); exit(0); } close(newsock); } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lat_usleep.c���������������������������������������������������������������������0000664�0000764�0000764�00000014160�10620624541�016257� 
0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lat_usleep.c - usleep duration/latency * * The APIs for usleep(3), nanosleep(2), select(2), pselect(2), * getitimer(2) and setitimer(2) support resolutions down to * a micro-second. However, many implementations do not support * such resolution. Most current implementations (as of Fall * 2002) simply put the process back on the run queue and the * process may get run on the next scheduler time slice (10-20 * milli-second resolution). * * This benchmark measures the true latency from the timer/sleep * call to the resumption of program execution. If the timers * actually worked properly, then the latency would be identical * to the requested duration, or a little longer, so the input * and output figures would be nearly identical. In most current * implementations the value is rounded up to the next scheduler * timeslice (e.g., a resolution of 20 milli-seconds, with all * values rounded up). * * usage: lat_usleep [-u | -i] [-P <parallelism>] [-W <warmup>] \ * [-N <repetitions>] usecs * * Copyright (c) 2002 Carl Staelin. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" #include <sched.h> typedef enum {USLEEP, NANOSLEEP, SELECT, PSELECT, ITIMER} timer_e; uint64 caught, n; struct itimerval value; typedef struct _state { unsigned long usecs; } state_t; void bench_usleep(iter_t iterations, void *cookie) { state_t *state = (state_t*)cookie; while (iterations-- > 0) { usleep(state->usecs); } } void bench_nanosleep(iter_t iterations, void *cookie) { state_t *state = (state_t*)cookie; struct timespec req; struct timespec rem; req.tv_sec = 0; req.tv_nsec = state->usecs * 1000; while (iterations-- > 0) { if (nanosleep(&req, &rem) < 0) { while (nanosleep(&rem, &rem) < 0) ; } } } void bench_select(iter_t iterations, void *cookie) { state_t *state = (state_t*)cookie; struct timeval tv; while (iterations-- > 0) { tv.tv_sec = 0; tv.tv_usec = state->usecs; select(0, NULL, NULL, NULL, &tv); } } #ifdef _POSIX_SELECT void bench_pselect(iter_t iterations, void *cookie) { state_t *state = (state_t*)cookie; struct timespec ts; while (iterations-- > 0) { ts.tv_sec = 0; ts.tv_nsec = state->usecs * 1000; pselect(0, NULL, NULL, NULL, &ts, NULL); } } #endif /* _POSIX_SELECT */ void interval(int x) { if (++caught == n) { caught = 0; n = benchmp_interval(benchmp_getstate()); } setitimer(ITIMER_REAL, &value, NULL); } void initialize(iter_t iterations, void *cookie) { state_t *state = (state_t*)cookie; struct sigaction sa; if (iterations) return; value.it_interval.tv_sec = 0; value.it_interval.tv_usec = state->usecs; value.it_value.tv_sec = 0; value.it_value.tv_usec = state->usecs; sa.sa_handler = interval; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGALRM, &sa, 0); } void bench_itimer(iter_t iterations, void *cookie) { n = iterations; caught = 0; /* * start the first timing interval */ start(0); /* * create the first timer, causing us to jump to interval() */ setitimer(ITIMER_REAL, &value, 
NULL); while (1) { sleep(100000); } } int set_realtime() { struct sched_param sp; sp.sched_priority = sched_get_priority_max(SCHED_RR); if (sched_setscheduler(0, SCHED_RR, &sp) >= 0) return TRUE; perror("sched_setscheduler"); return FALSE; } int main(int ac, char **av) { int realtime = 0; int parallel = 1; int warmup = 0; int repetitions = -1; int c; char buf[512]; timer_e what = USLEEP; state_t state; char *scheduler = ""; char *mechanism = "usleep"; char *usage = "[-r] [-u <method>] [-P <parallelism>] [-W <warmup>] [-N <repetitions>] usecs\nmethod=usleep|nanosleep|select|pselect|itimer\n"; realtime = 0; while ((c = getopt(ac, av, "ru:W:N:")) != EOF) { switch (c) { case 'r': realtime = 1; break; case 'u': if (strcmp(optarg, "usleep") == 0) { what = USLEEP; mechanism = "usleep"; } else if (strcmp(optarg, "nanosleep") == 0) { what = NANOSLEEP; mechanism = "nanosleep"; } else if (strcmp(optarg, "select") == 0) { what = SELECT; mechanism = "select"; #ifdef _POSIX_SELECT } else if (strcmp(optarg, "pselect") == 0) { what = PSELECT; mechanism = "pselect"; #endif /* _POSIX_SELECT */ } else if (strcmp(optarg, "itimer") == 0) { what = ITIMER; mechanism = "itimer"; } else { lmbench_usage(ac, av, usage); } break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if (optind != ac - 1) { lmbench_usage(ac, av, usage); } state.usecs = bytes(av[optind]); if (realtime && set_realtime()) scheduler = "realtime "; switch (what) { case USLEEP: benchmp(NULL, bench_usleep, NULL, 0, parallel, warmup, repetitions, &state); break; case NANOSLEEP: benchmp(NULL, bench_nanosleep, NULL, 0, parallel, warmup, repetitions, &state); break; case SELECT: benchmp(NULL, bench_select, NULL, 0, parallel, warmup, repetitions, &state); break; #ifdef _POSIX_SELECT case PSELECT: benchmp(NULL, bench_pselect, NULL, 0, parallel, warmup, repetitions, &state); break; #endif /* _POSIX_SELECT */ case ITIMER: benchmp(initialize, bench_itimer, NULL, 0, parallel, warmup, repetitions, &state); break; default: lmbench_usage(ac, av, usage); break; } sprintf(buf, "%s%s %lu microseconds", scheduler, mechanism, state.usecs); micro(buf, get_n()); return (0); } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_debug.c����������������������������������������������������������������������0000664�0000764�0000764�00000006310�10450256147�016040� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include "bench.h" #include "lib_debug.h" /* * return micro-seconds / iteration at the the fraction point. 
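 * (The examples below show a different calling convention than the
 * current code; the signature is simply percent_point(fraction) and
 * it operates on the results returned by get_results().)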
* * some examples: * min = percent_point(values, size, 0.0) * 1st quartile = percent_point(values, size, 0.25) * median = percent_point(values, size, 0.5) * 3rd quartile = percent_point(values, size, 0.75) * max = percent_point(values, size, 1.0) * * the data points in the results structure are sorted from * largest to smallest, so we adjust the fraction accordingly. */ double percent_point(double fraction) { double t, r; result_t* results = get_results(); t = (1.0 - fraction) * (results->N - 1); if (t == floor(t)) { /* no interpolation */ r = results->v[(int)t].u / (double)results->v[(int)t].n; } else { /* percent point falls between two points, interpolate */ r = results->v[(int)t].u / (double)results->v[(int)t].n; r += results->v[(int)t+1].u / (double)results->v[(int)t+1].n; r /= 2.0; } return r; } void print_results(int details) { int i; result_t* results = get_results(); fprintf(stderr, "N=%d, t={", results->N); for (i = 0; i < results->N; ++i) { fprintf(stderr, "%.2f", (double)results->v[i].u/results->v[i].n); if (i < results->N - 1) fprintf(stderr, ", "); } fprintf(stderr, "}\n"); if (details) { fprintf(stderr, "\t/* %d {", results->N); for (i = 0; i < results->N; ++i) { fprintf(stderr, "%llu/%llu", results->v[i].u, results->v[i].n); if (i < results->N - 1) fprintf(stderr, ", "); } fprintf(stderr, "} */\n"); } } /* * Prints bandwidth (MB/s) quartile information * * bytes - bytes per iteration */ void bw_quartile(uint64 bytes) { fprintf(stderr, "%lu\t%e\t%e\t%e\t%e\t%e\n", (unsigned long)get_n(), (double)bytes / (1000000. * percent_point(0.00)), (double)bytes / (1000000. * percent_point(0.25)), (double)bytes / (1000000. * percent_point(0.50)), (double)bytes / (1000000. * percent_point(0.75)), (double)bytes / (1000000. * percent_point(1.00))); } /* * Prints latency (nano-seconds) quartile information * * n - number of operations per iteration */ void nano_quartile(uint64 n) { fprintf(stderr, "%lu\t%e\t%e\t%e\t%e\t%e\n", (unsigned long)get_n(), percent_point(0.00) * 1000. / (double)n, percent_point(0.25) * 1000. / (double)n, percent_point(0.50) * 1000. / (double)n, percent_point(0.75) * 1000. / (double)n, percent_point(1.00) * 1000. / (double)n); } /* * print the page|line|word offset for each link in the pointer chain. 
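 * Each output line is tab-separated as
 * "<page> <line-within-page> <word-within-line>", computed from the
 * link's offset relative to the start of the chain.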
*/ void print_mem(char* addr, size_t size, size_t line) { char* p; uint64 base, off; size_t pagesize = getpagesize(); base = (uint64)addr; for (p = addr; *(char**)p != addr; p = *(char**)p) { off = (uint64)p - base; fprintf(stderr, "\t%lu\t%lu\t%lu\n", (unsigned long)(off / pagesize), (unsigned long)((off % pagesize) / line), (unsigned long)((off % line) / sizeof(char*))); } } void check_mem(char* addr, size_t size) { char* p; size_t i; size_t max = size / sizeof(char*) + 1; for (p=addr, i=0; *(char**)p != addr && i < max; p = *(char**)p, i++) { if (p < addr || addr + size <= p) { fprintf(stderr, "check_mem: pointer out of range!\n"); } } if (*(char**)p != addr) { fprintf(stderr, "check_mem: pointer chain doesn't loop\n"); } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_debug.h����������������������������������������������������������������������0000664�0000764�0000764�00000000404�07546242516�016052� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#ifndef _LIB_DEBUG_H #define _LIB_DEBUG_H void print_results(int details); void bw_quartile(uint64 bytes); void nano_quartile(uint64 n); void print_mem(char* addr, size_t size, size_t line); void check_mem(char* addr, size_t size); #endif /* _LIB_DEBUG_H */ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_mem.c������������������������������������������������������������������������0000664�0000764�0000764�00000044374�10715547567�015561� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * lib_mem.c - library of routines used to analyze the memory hierarchy * * %W% %@% * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. * Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ #include "bench.h" #define FIVE(m) m m m m m #define TEN(m) FIVE(m) FIVE(m) #define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) #define HUNDRED(m) FIFTY(m) FIFTY(m) #define DEREF(N) p##N = (char**)*p##N; #define DECLARE(N) static char **sp##N; register char **p##N; #define INIT(N) p##N = (mem_benchmark_rerun && addr_save==state->addr) ? 
sp##N : (char**)state->p[N]; #define SAVE(N) sp##N = p##N; #define MEM_BENCHMARK_F(N) mem_benchmark_##N, benchmp_f mem_benchmarks[] = {REPEAT_15(MEM_BENCHMARK_F)}; static int mem_benchmark_rerun = 0; #define MEM_BENCHMARK_DEF(N,repeat,body) \ void \ mem_benchmark_##N(iter_t iterations, void *cookie) \ { \ struct mem_state* state = (struct mem_state*)cookie; \ static char *addr_save = NULL; \ repeat(DECLARE); \ \ repeat(INIT); \ while (iterations-- > 0) { \ HUNDRED(repeat(body)); \ } \ \ repeat(SAVE); \ addr_save = state->addr; \ mem_benchmark_rerun = 1; \ } MEM_BENCHMARK_DEF(0, REPEAT_0, DEREF) MEM_BENCHMARK_DEF(1, REPEAT_1, DEREF) MEM_BENCHMARK_DEF(2, REPEAT_2, DEREF) MEM_BENCHMARK_DEF(3, REPEAT_3, DEREF) MEM_BENCHMARK_DEF(4, REPEAT_4, DEREF) MEM_BENCHMARK_DEF(5, REPEAT_5, DEREF) MEM_BENCHMARK_DEF(6, REPEAT_6, DEREF) MEM_BENCHMARK_DEF(7, REPEAT_7, DEREF) MEM_BENCHMARK_DEF(8, REPEAT_8, DEREF) MEM_BENCHMARK_DEF(9, REPEAT_9, DEREF) MEM_BENCHMARK_DEF(10, REPEAT_10, DEREF) MEM_BENCHMARK_DEF(11, REPEAT_11, DEREF) MEM_BENCHMARK_DEF(12, REPEAT_12, DEREF) MEM_BENCHMARK_DEF(13, REPEAT_13, DEREF) MEM_BENCHMARK_DEF(14, REPEAT_14, DEREF) MEM_BENCHMARK_DEF(15, REPEAT_15, DEREF) size_t* words_initialize(size_t max, int scale); void mem_reset() { mem_benchmark_rerun = 0; } void mem_cleanup(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; if (iterations) return; if (state->addr) { free(state->addr); state->addr = NULL; } if (state->lines) { free(state->lines); state->lines = NULL; } if (state->pages) { free(state->pages); state->pages = NULL; } if (state->words) { free(state->words); state->words = NULL; } } void tlb_cleanup(iter_t iterations, void* cookie) { size_t i; struct mem_state* state = (struct mem_state*)cookie; char **addr = (char**)state->addr; if (iterations) return; if (addr) { for (i = 0; i < state->npages; ++i) { if (addr[i]) free(addr[i]); } free(addr); state->addr = NULL; } if (state->pages) { free(state->pages); state->pages = NULL; } if (state->lines) { free(state->lines); state->lines = NULL; } } void base_initialize(iter_t iterations, void* cookie) { size_t nwords, nlines, nbytes, npages, nmpages; size_t *pages; size_t *lines; size_t *words; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; state->initialized = 0; nbytes = state->len; nwords = state->line / sizeof(char*); nlines = state->pagesize / state->line; npages = (nbytes + state->pagesize - 1) / state->pagesize; nmpages= (state->maxlen + state->pagesize - 1) / state->pagesize; srand(getpid()); words = NULL; lines = NULL; pages = permutation(nmpages, state->pagesize); p = state->addr = (char*)malloc(state->maxlen + 2 * state->pagesize); if (!p) { perror("base_initialize: malloc"); exit(1); } state->nwords = nwords; state->nlines = nlines; state->npages = npages; state->lines = lines; state->pages = pages; state->words = words; if (state->addr == NULL || pages == NULL) return; if ((unsigned long)p % state->pagesize) { p += state->pagesize - (unsigned long)p % state->pagesize; } state->base = p; state->initialized = 1; mem_reset(); } /* * Create a circular list of pointers using a simple striding * algorithm. * * This access pattern corresponds to many array/matrix * algorithms. It should be easily and correctly predicted * by any decent hardware prefetch algorithm. 
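 *
 * For example (illustration), with a range of 4 * stride bytes the
 * loop below builds the chain
 *
 *	addr[0] -> addr[stride] -> addr[2*stride] -> addr[3*stride] -> addr[0]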
*/ void stride_initialize(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; size_t i; size_t range = state->len; size_t stride = state->line; char* addr; base_initialize(iterations, cookie); if (!state->initialized) return; addr = state->base; for (i = stride; i < range; i += stride) { *(char **)&addr[i - stride] = (char*)&addr[i]; } *(char **)&addr[i - stride] = (char*)&addr[0]; state->p[0] = addr; mem_reset(); } void thrash_initialize(iter_t iterations, void* cookie) { struct mem_state* state = (struct mem_state*)cookie; size_t i; size_t j; size_t cur; size_t next; size_t cpage; size_t npage; char* addr; base_initialize(iterations, cookie); if (!state->initialized) return; addr = state->base; /* * Create a circular list of pointers with a random access * pattern. * * This stream corresponds more closely to linked list * memory access patterns. For large data structures each * access will likely cause both a cache miss and a TLB miss. * * Access a different page each time. This will eventually * cause a tlb miss each page. It will also cause maximal * thrashing in the cache between the user data stream and * the page table entries. */ if (state->len % state->pagesize) { state->nwords = state->len / state->line; state->words = words_initialize(state->nwords, state->line); if (!state->words) { perror("thrash_initialize: malloc"); exit(1); } for (i = 0; i < state->nwords - 1; ++i) { *(char **)&addr[state->words[i]] = (char*)&addr[state->words[i+1]]; } *(char **)&addr[state->words[i]] = addr; state->p[0] = addr; } else { state->nwords = state->pagesize / state->line; state->words = words_initialize(state->nwords, state->line); if (!state->words) { perror("thrash_initialize: malloc"); exit(2); } for (i = 0; i < state->npages - 1; ++i) { cpage = state->pages[i]; npage = state->pages[i + 1]; for (j = 0; j < state->nwords; ++j) { cur = cpage + state->words[(i + j) % state->nwords]; next = npage + state->words[(i + j + 1) % state->nwords]; *(char **)&addr[cur] = (char*)&addr[next]; } } cpage = state->pages[i]; npage = state->pages[0]; for (j = 0; j < state->nwords; ++j) { cur = cpage + state->words[(i + j) % state->nwords]; next = npage + state->words[(j + 1) % state->nwords]; *(char **)&addr[cur] = (char*)&addr[next]; } state->p[0] = (char*)&addr[state->pages[0]]; } mem_reset(); } /* * mem_initialize * * Create a circular pointer chain that runs through memory. * * The chain threads through each cache line on a page before * moving to the next page. The order of cache line accesses * is randomized to defeat cache prefetching algorithms. In * addition, the order of page accesses is randomized. Finally, * to reduce the impact of incorrect line-size estimates on * machines with direct-mapped caches, we randomize which * word in the cache line is used to hold the pointer. * * It initializes state->width pointers to elements evenly * spaced through the chain. 
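 *
 * Concretely: within page pages[i], the pointer stored at offset
 * lines[j] + k points to lines[j+1] + k on the same page; the last
 * line of each page chains into lines[0] of the next page; and every
 * (npointers/width)-th element is also recorded in state->p[] so that
 * up to `width' walkers can chase the chain in parallel from evenly
 * spaced starting points.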
*/ void mem_initialize(iter_t iterations, void* cookie) { int i, j, k, l, nw, nwords, nlines, npages, npointers; size_t *pages; size_t *lines; size_t *words; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; base_initialize(iterations, cookie); if (!state->initialized) return; state->initialized = 0; npointers = state->len / state->line; nwords = state->nwords; nlines = state->nlines; npages = state->npages; words = state->words = words_initialize(nwords, sizeof(char*)); lines = state->lines = words_initialize(nlines, state->line); pages = state->pages; p = state->base; if (state->addr == NULL \ || pages == NULL || lines == NULL || words == NULL) { perror("mem_initialize: malloc"); exit(1); } /* setup the run through the pages */ l = 0; j = nlines - 1; for (i = 0; i < npages; ++i) { for (j = 0; j < nlines - 1 && l < npointers - 1; ++j, ++l) { for (k = 0; k < state->line; k += sizeof(char*)) { *(char**)(p + pages[i] + lines[j] + k) = p + pages[i] + lines[j+1] + k; } if (l % (npointers/state->width) == 0 && l / (npointers/state->width) < MAX_MEM_PARALLELISM) { k = l / (npointers/state->width); state->p[k] = p + pages[i] + lines[j] + words[k % nwords]; } } if (i < npages - 1) { for (k = 0; k < nwords; ++k) *(char**)(p + pages[i] + lines[j] + words[k]) = p + pages[i+1] + lines[0] + words[k]; } } for (k = 0; k < nwords; ++k) { nw = (k == nwords - 1) ? 0 : k + 1; *(char**)(p + pages[npages-1] + lines[j] + words[k]) = p + pages[0] + lines[0] + words[nw]; } /* now, run through the chain once to clear the cache */ mem_reset(); (*mem_benchmarks[state->width-1])((nwords * npointers + 100) / 100, state); state->initialized = 1; } /* * line_initialize * * This is very similar to mem_initialize, except that we always use * the first element of the cache line to hold the pointer. * */ void line_initialize(iter_t iterations, void* cookie) { int i, j, nlines, npages; size_t *pages; size_t *lines; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; base_initialize(iterations, cookie); if (!state->initialized) return; state->initialized = 0; nlines = state->nlines; npages = state->npages; lines = state->lines = words_initialize(nlines, state->line); pages = state->pages; p = state->base; state->width = 1; if (state->addr == NULL || lines == NULL || pages == NULL) return; /* new setup runs through the lines */ for (i = 0; i < npages; ++i) { /* sequence through the first word of each line */ for (j = 0; j < nlines - 1; ++j) { *(char**)(p + pages[i] + lines[j]) = p + pages[i] + lines[j+1]; } /* jump to the fist word of the first line on next page */ *(char**)(p + pages[i] + lines[j]) = p + pages[(i < npages-1) ? i+1 : 0] + lines[0]; } state->p[0] = p + pages[0] + lines[0]; /* now, run through the chain once to clear the cache */ mem_reset(); mem_benchmark_0((nlines * npages + 100) / 100, state); state->initialized = 1; } /* * tlb_initialize * * Build a pointer chain which accesses one word per page, for a total * of (line * pages) bytes of data loaded into cache. * * If the number of elements in the chain (== #pages) is larger than the * number of pages addressed by the TLB, then each access should cause * a TLB miss (certainly as the number of pages becomes much larger than * the TLB-addressed space). * * In addition, if we arrange the chain properly, each word we access * will be in the cache. 
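 *
 * (For example, with 4KB pages a 512-page chain touches 512 distinct
 * pages but loads only 512 cache lines of data, which fits easily in
 * most second-level caches while exceeding the reach of many TLBs.)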
* * This means that the average access time for each pointer dereference * should be a cache hit plus a TLB miss. * */ void tlb_initialize(iter_t iterations, void* cookie) { int i, nlines, npages, pagesize; unsigned int r; char **pages = NULL; char **addr = NULL; size_t *lines = NULL; struct mem_state* state = (struct mem_state*)cookie; register char *p = 0 /* lint */; if (iterations) return; state->initialized = 0; pagesize = state->pagesize; nlines = pagesize / sizeof(char*); npages = state->len / pagesize; srand(getpid() ^ (getppid()<<7)); lines = words_initialize(nlines, sizeof(char*)); pages = (char**)malloc(npages * sizeof(char**)); addr = (char**)malloc(npages * sizeof(char**)); if (!lines || !pages || !addr) { perror("tlb_initialize: malloc"); exit(1); } state->nwords = 1; state->nlines = nlines; state->npages = npages; state->words = NULL; state->lines = lines; state->pages = (size_t*)pages; state->addr = (char*)addr; if (addr) bzero(addr, npages * sizeof(char**)); if (pages) bzero(pages, npages * sizeof(char**)); if (addr == NULL || pages == NULL || lines == NULL) { return; } /* first, layout the sequence of page accesses */ for (i = 0; i < npages; ++i) { p = addr[i] = (char*)valloc(pagesize); if (!p) { perror("tlb_initialize: valloc"); exit(4); } if ((unsigned long)p % pagesize) { free(p); p = addr[i] = (char*)valloc(2 * pagesize); if (!p) { perror("tlb_initialize: valloc"); exit(5); } p += pagesize - (unsigned long)p % pagesize; } pages[i] = (char*)p; } /* randomize the page sequences (except for zeroth page) */ r = (rand() << 15) ^ rand(); for (i = npages - 2; i > 0; --i) { char* l; r = (r << 1) ^ (rand() >> 4); l = pages[(r % i) + 1]; pages[(r % i) + 1] = pages[i + 1]; pages[i + 1] = l; } /* now setup run through the pages */ for (i = 0; i < npages - 1; ++i) { *(char**)(pages[i] + lines[i%nlines]) = pages[i+1] + lines[(i+1)%nlines]; } *(char**)(pages[i] + lines[i%nlines]) = pages[0] + lines[0]; state->p[0] = pages[0] + lines[0]; /* run through the chain once to clear the cache */ mem_reset(); mem_benchmark_0((npages + 100) / 100, state); state->initialized = 1; } /* * words_initialize * * This is supposed to create the order in which the words in a * "cache line" are used. Since we rarely know the cache line * size with any real reliability, we need to jump around so * as to maximize the number of potential cache misses, and to * minimize the possibility of re-using a cache line. 
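 *
 * The order used is a bit-reversal permutation scaled by `scale':
 * for max == 8 the returned offsets are {0, 4, 2, 6, 1, 5, 3, 7} * scale.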
*/ size_t* words_initialize(size_t max, int scale) { size_t i, j, nbits; size_t* words = (size_t*)malloc(max * sizeof(size_t)); if (!words) return NULL; bzero(words, max * sizeof(size_t)); for (i = max>>1, nbits = 0; i != 0; i >>= 1, nbits++) ; for (i = 0; i < max; ++i) { /* now reverse the bits */ for (j = 0; j < nbits; j++) { if (i & (1<<j)) { words[i] |= (1<<(nbits-j-1)); } } words[i] *= scale; } return words; } ssize_t line_find(size_t len, int warmup, int repetitions, struct mem_state* state) { size_t i, big_jump, line; size_t maxline = getpagesize() / 16; double baseline = 0.0, t; big_jump = 0; line = 0; if (repetitions < 0) repetitions = TRIES; /* fprintf(stderr, "line_find(%lu, ...): entering\n", (unsigned long)len); /**/ state->width = 1; state->line = sizeof(char*); for (state->addr = NULL; !state->addr && len; ) { state->len = state->maxlen = len; line_initialize(0, state); if (state->addr == NULL) len >>= 1; } if (state->addr == NULL) return -1; for (i = sizeof(char*); i <= maxline; i<<=1) { t = line_test(i, warmup, repetitions, state); if (t == 0.0) break; if (i > sizeof(char*)) { if (t > 1.3 * baseline) { big_jump = 1; } else if (big_jump && t < 1.15 * baseline) { line = (i>>1); break; } } baseline = t; } mem_cleanup(0, state); /* fprintf(stderr, "line_find(%lu, ...): returning %lu\n", (unsigned long)len, (unsigned long)line); /**/ return line; } double line_test(size_t line, int warmup, int repetitions, struct mem_state* state) { size_t i; size_t npages = state->npages; size_t nlines = state->pagesize / line; double t; char* p = state->base; char* first = p + state->pages[0] + state->lines[0]; result_t *r, *r_save; if (repetitions < 0) repetitions = TRIES; /* only visit a subset of the lines in each page */ if (nlines < state->nlines) { p = state->base; for (i = 0; i < npages - 1; ++i) { *(char**)(p + state->pages[i] + state->lines[nlines-1]) = p + state->pages[i+1] + state->lines[0]; } *(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) = p + state->pages[0] + state->lines[0]; } r_save = get_results(); r = (result_t*)malloc(sizeof_result(repetitions)); if (!r) { perror("line_test: malloc"); exit(1); } insertinit(r); p = first; for (i = 0; i < repetitions; ++i) { BENCH1(HUNDRED(p = *(char**)p;),0); /* fprintf(stderr, "%d\t%d\t%d\n", line, (int)gettime(), (int)get_n()); /**/ insertsort(gettime(), get_n(), r); } use_pointer(p); set_results(r); t = 10. 
* (double)gettime() / (double)get_n(); set_results(r_save); free(r); /* fprintf(stderr, "%d\t%.5f\t%d\n", line, t, state->len); /**/ /* fixup full path again */ if (nlines < state->nlines) { p = state->base; for (i = 0; i < npages - 1; ++i) { *(char**)(p + state->pages[i] + state->lines[nlines-1]) = p + state->pages[i] + state->lines[nlines]; } *(char**)(p + state->pages[npages-1] + state->lines[nlines-1]) = p + state->pages[npages-1] + state->lines[nlines]; } return (t); } double par_mem(size_t len, int warmup, int repetitions, struct mem_state* state) { int i, j; iter_t __n = 1; double baseline, max_par, par; state->width = 1; max_par = 1.; if (repetitions < 0) repetitions = TRIES; for (state->addr = NULL; !state->addr && len; ) { state->len = state->maxlen = len; mem_initialize(0, state); if (state->addr == NULL) len >>= 1; } if (state->addr == NULL) return -1.; for (i = 0; i < MAX_MEM_PARALLELISM; ++i) { for (j = 0; j <= i; j++) { size_t nlines = len / state->line; size_t lines_per_chunk = nlines / (i + 1); size_t lines_per_page = state->pagesize / state->line; size_t line = j * lines_per_chunk; size_t word = (j * state->nwords) / (i + 1); /* if (state->len == 32768 && i == 7) { fprintf(stderr, "\tj=%d, line=%d, word=%d, page=%d, _line=%d, _word=%d\n", j, line, word, line / lines_per_page, line % lines_per_page, word % state->nwords); } /**/ state->p[j] = state->base + state->pages[line / lines_per_page] + state->lines[line % lines_per_page] + state->words[word % state->nwords]; } mem_reset(); (*mem_benchmarks[i])((len / sizeof(char*) + 100) / 100, state); BENCH((*mem_benchmarks[i])(__n, state); __n = 1;, 0); if (i == 0) { baseline = (double)gettime() / (double)get_n(); } else if (gettime() > 0) { par = baseline; par /= (double)gettime() / (double)((i + 1) * get_n()); /* fprintf(stderr, "par_mem(%d): i=%d, p=%5.2f, l=%d, lpp=%d, lpc=%d, nl=%d, wpc=%d\n", len, i, par, state->line, state->pagesize / state->line, (len / state->line) / (i + 1), len / state->line, state->nwords / (i + 1)); /**/ if (par > max_par) { max_par = par; } if (4.0 * max_par < i) break; } } mem_cleanup(0, state); return max_par; } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_mem.h������������������������������������������������������������������������0000664�0000764�0000764�00000003431�10425063126�015531� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#ifndef LMBENCH_MEM_H #define LMBENCH_MEM_H #define MAX_MEM_PARALLELISM 16 #define MEM_BENCHMARK_DECL(N) \ void mem_benchmark_##N(iter_t iterations, void* cookie); #define REPEAT_0(m) m(0) #define REPEAT_1(m) REPEAT_0(m) m(1) #define REPEAT_2(m) REPEAT_1(m) m(2) #define REPEAT_3(m) REPEAT_2(m) m(3) #define REPEAT_4(m) REPEAT_3(m) m(4) #define REPEAT_5(m) REPEAT_4(m) m(5) #define REPEAT_6(m) REPEAT_5(m) m(6) #define REPEAT_7(m) REPEAT_6(m) m(7) #define REPEAT_8(m) REPEAT_7(m) m(8) #define REPEAT_9(m) REPEAT_8(m) m(9) #define REPEAT_10(m) REPEAT_9(m) m(10) #define REPEAT_11(m) REPEAT_10(m) m(11) #define REPEAT_12(m) REPEAT_11(m) m(12) 
#define REPEAT_13(m) REPEAT_12(m) m(13) #define REPEAT_14(m) REPEAT_13(m) m(14) #define REPEAT_15(m) REPEAT_14(m) m(15) struct mem_state { char* addr; /* raw pointer returned by malloc */ char* base; /* page-aligned pointer */ char* p[MAX_MEM_PARALLELISM]; int initialized; int width; size_t len; size_t maxlen; size_t line; size_t pagesize; size_t nlines; size_t npages; size_t nwords; size_t* pages; size_t* lines; size_t* words; }; void stride_initialize(iter_t iterations, void* cookie); void thrash_initialize(iter_t iterations, void* cookie); void mem_initialize(iter_t iterations, void* cookie); void line_initialize(iter_t iterations, void* cookie); void tlb_initialize(iter_t iterations, void* cookie); void mem_cleanup(iter_t iterations, void* cookie); void tlb_cleanup(iter_t iterations, void* cookie); REPEAT_15(MEM_BENCHMARK_DECL) extern benchmp_f mem_benchmarks[]; ssize_t line_find(size_t l, int warmup, int repetitions, struct mem_state* state); double line_test(size_t l, int warmup, int repetitions, struct mem_state* state); double par_mem(size_t l, int warmup, int repetitions, struct mem_state* state); #endif /* LMBENCH_MEM_H */ ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_sched.c����������������������������������������������������������������������0000664�0000764�0000764�00000014074�10477475635�016064� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include "bench.h" /* #define _DEBUG */ #if defined(HAVE_SYSMP) #include <sys/sysmp.h> #include <sys/sysinfo.h> #endif #if defined(HAVE_MPCTL) #include <sys/mpctl.h> #endif #if defined(HAVE_BINDPROCESSOR) #include <sys/processor.h> #endif #if defined(HAVE_PROCESSOR_BIND) #include <sys/types.h> #include <sys/processor.h> #include <sys/procset.h> #endif #if defined(HAVE_SCHED_SETAFFINITY) #include <sched.h> #endif extern int custom(char* str, int cpu); extern int reverse_bits(int cpu); extern int sched_ncpus(); extern int sched_pin(int cpu); /* * The interface used by benchmp. * * childno is the "logical" child id number. * In range [0, ..., parallel-1]. * benchproc is the "logical" id within the benchmark process. The * benchmp-created process is logical ID zero, child processes * created by the benchmark range from [1, ..., nbenchprocs]. * nbenchprocs is the number of child processes that each benchmark * process will create. Most benchmarks will leave this zero, * but some such as the pipe() benchmarks will not. */ int handle_scheduler(int childno, int benchproc, int nbenchprocs) { int cpu = 0; char* sched = getenv("LMBENCH_SCHED"); if (!sched || strcasecmp(sched, "DEFAULT") == 0) { /* do nothing. Allow scheduler to control placement */ return 0; } else if (strcasecmp(sched, "SINGLE") == 0) { /* assign all processes to CPU 0 */ cpu = 0; } else if (strcasecmp(sched, "BALANCED") == 0) { /* assign each benchmark process to its own processor, * but child processes will share the CPU with the * parent. 
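	 *
	 * (For example, with LMBENCH_SCHED=BALANCED and parallel copies
	 * 0..N-1, copy i is pinned to CPU i % sched_ncpus().)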
*/ cpu = childno; } else if (strcasecmp(sched, "BALANCED_SPREAD") == 0) { /* * assign each benchmark process to its own processor, * logically as far away from neighboring IDs as * possible. This can help identify bus contention * issues in SMPs with hierarchical busses or NUMA * memory. */ cpu = reverse_bits(childno); } else if (strcasecmp(sched, "UNIQUE") == 0) { /* * assign each benchmark process and each child process * to its own processor. */ cpu = childno * (nbenchprocs + 1) + benchproc; } else if (strcasecmp(sched, "UNIQUE_SPREAD") == 0) { /* * assign each benchmark process and each child process * to its own processor, logically as far away from * neighboring IDs as possible. This can help identify * bus contention issues in SMPs with hierarchical busses * or NUMA memory. */ cpu = reverse_bits(childno * (nbenchprocs + 1) + benchproc); } else if (strncasecmp(sched, "CUSTOM ", strlen("CUSTOM ")) == 0) { cpu = custom(sched + strlen("CUSTOM"), childno); } else if (strncasecmp(sched, "CUSTOM_SPREAD ", strlen("CUSTOM_SPREAD ")) == 0) { cpu = custom(sched + strlen("CUSTOM_SPREAD"), childno * (nbenchprocs + 1) + benchproc); } else { /* default action: do nothing */ return 0; } return sched_pin(cpu % sched_ncpus()); } /* * Use to get sequentially created processes "far" away from * each other in an SMP. * * XXX: probably doesn't work for NCPUS not a power of two. */ int reverse_bits(int cpu) { int i; int nbits; int max = sched_ncpus() - 1; int cpu_reverse = 0; for (i = max>>1, nbits = 1; i > 0; i >>= 1, nbits++) ; /* now reverse the bits */ for (i = 0; i < nbits; i++) { if (cpu & (1<<i)) cpu_reverse |= (1<<(nbits-i-1)); } return cpu_reverse; } /* * Custom is a user-defined sequence of CPU ids */ int custom(char* str, int cpu) { static int nvalues = -1; static int* values = NULL; if (values == NULL) { nvalues = 0; values = (int*)malloc(sizeof(int)); while (*str) { char* q; while (*str && !isdigit(*str)) str++; q = str; while (*str && isdigit(*str)) str++; if (str == q) break; *str++ = 0; sscanf(q, "%d", &values[nvalues++]); values = (int*)realloc((void*)values, (nvalues + 1) * sizeof(int)); } } if (nvalues == 0) return 0; return values[cpu % nvalues]; } /* * Return the number of processors in this host */ int sched_ncpus() { #ifdef MP_NPROCS /* SGI IRIX interface */ return sysmp(MP_NPROCS); #elif defined(HAVE_MPCTL) /* HP-UX interface */ return mpctl(MPC_GETNUMSPUS_SYS, 0, 0); #elif defined(_SC_NPROCESSORS_ONLN) /* AIX, Solaris, and Linux interface */ return sysconf(_SC_NPROCESSORS_ONLN); #else return 1; #endif } /* * Pin the current process to the given CPU * * return 0 when successful * returns -1 on error */ int sched_pin(int cpu) { int retval = -1; #ifdef HAVE_SYSMP /* SGI IRIX interface */ retval = sysmp(MP_MUSTRUN, cpu); #elif defined(HAVE_MPCTL) /* HP-UX interface */ retval = mpctl(MPC_SET_PROCESS, cpu, MPC_SELFPID); #elif defined(HAVE_BINDPROCESSOR) /* AIX interface */ retval = bindprocessor(BINDPROCESS, getpid(), cpu); #elif defined(HAVE_PROCESSOR_BIND) /* Solaris interface */ retval = processor_bind(P_PID, P_MYPID, cpu, NULL); #elif defined(HAVE_SCHED_SETAFFINITY) /* Linux interface */ static unsigned long* mask = NULL; static unsigned long* cpumask = NULL; static int sz = 0; static int ncpus = 0; int i; int j; if (cpumask == NULL) { sz = 1 + (2 * sched_ncpus()) / (8 * sizeof(unsigned long)); mask = (unsigned long*)malloc(sz * sizeof(unsigned long)); cpumask = (unsigned long*)malloc(sz * sizeof(unsigned long)); retval = sched_getaffinity(0, sz * sizeof(unsigned long), cpumask); if 
(retval < 0) perror("sched_getaffinity:"); if (retval < 0) return retval; for (i = 0; i < sz * 8 * sizeof(unsigned long); ++i) { int word = i / (8 * sizeof(unsigned long)); int bit = i % (8 * sizeof(unsigned long)); if (cpumask[word] & (1 << bit)) ncpus++; } } cpu %= ncpus; bzero(mask, sz * sizeof(unsigned long)); for (i = 0, j = 0; i < sz * 8 * sizeof(unsigned long); ++i) { int word = i / (8 * sizeof(unsigned long)); int bit = i % (8 * sizeof(unsigned long)); if (cpumask[word] & (1 << bit)) { if (j >= cpu) { mask[word] |= (1 << bit); break; } j++; } } retval = sched_setaffinity(0, sz * sizeof(unsigned long), mask); if (retval < 0) perror("sched_setaffinity:"); #ifdef _DEBUG fprintf(stderr, "sched_pin(%d): pid=%d, returning %d\n", cpu, (int)getpid(), retval); #endif /* _DEBUG */ #endif return retval; } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_stats.c����������������������������������������������������������������������0000664�0000764�0000764�00000027305�07125356346�016126� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include <math.h> #include "bench.h" #define BOOTSTRAP_COUNT 200 /* * a comparison function used by qsort */ int int_compare(const void *a, const void *b) { if (*(int*)a < *(int*)b) return -1; if (*(int*)a > *(int*)b) return 1; return 0; } /* * a comparison function used by qsort */ int uint64_compare(const void *a, const void *b) { if (*(uint64*)a < *(uint64*)b) return -1; if (*(uint64*)a > *(uint64*)b) return 1; return 0; } /* * a comparison function used by qsort */ int double_compare(const void *a, const void *b) { if (*(double*)a < *(double*)b) return -1; if (*(double*)a > *(double*)b) return 1; return 0; } /* * return the median value of an array of int */ int int_median(int *values, int size) { qsort(values, size, sizeof(int), int_compare); if (size == 0) return 0.; if (size % 2) { return values[size/2]; } return (values[size/2 - 1] + values[size/2]) / 2; } /* * return the median value of an array of int */ uint64 uint64_median(uint64 *values, int size) { qsort(values, size, sizeof(uint64), uint64_compare); if (size == 0) return 0.; if (size % 2) { return values[size/2]; } return (values[size/2 - 1] + values[size/2]) / 2; } /* * return the median value of an array of doubles */ double double_median(double *values, int size) { qsort(values, size, sizeof(double), double_compare); if (size == 0) return 0.; if (size % 2) { return values[size/2]; } return (values[size/2 - 1] + values[size/2]) / 2.0; } /* * return the mean value of an array of int */ int int_mean(int *values, int size) { int i; int sum = 0; for (i = 0; i < size; ++i) sum += values[i]; return sum / size; } /* * return the mean value of an array of int */ uint64 uint64_mean(uint64 *values, int size) { int i; uint64 sum = 0; for (i = 0; i < size; ++i) sum += values[i]; return sum / 
size; } /* * return the mean value of an array of doubles */ double double_mean(double *values, int size) { int i; double sum = 0.0; for (i = 0; i < size; ++i) sum += values[i]; return sum / (double)size; } /* * return the min value of an array of int */ int int_min(int *values, int size) { int i; int min = values[0]; for (i = 1; i < size; ++i) if (values[i] < min) min = values[i]; return min; } /* * return the min value of an array of int */ uint64 uint64_min(uint64 *values, int size) { int i; uint64 min = values[0]; for (i = 1; i < size; ++i) if (values[i] < min) min = values[i]; return min; } /* * return the min value of an array of doubles */ double double_min(double *values, int size) { int i; double min = values[0]; for (i = 1; i < size; ++i) if (values[i] < min) min = values[i]; return min; } /* * return the max value of an array of int */ int int_max(int *values, int size) { int i; int max = values[0]; for (i = 1; i < size; ++i) if (values[i] > max) max = values[i]; return max; } /* * return the max value of an array of int */ uint64 uint64_max(uint64 *values, int size) { int i; uint64 max = values[0]; for (i = 1; i < size; ++i) if (values[i] > max) max = values[i]; return max; } /* * return the max value of an array of doubles */ double double_max(double *values, int size) { int i; double max = values[0]; for (i = 1; i < size; ++i) if (values[i] > max) max = values[i]; return max; } /* * return the variance of an array of ints * * Reference: "Statistics for Experimenters" by * George E.P. Box et. al., page 41 */ double int_variance(int *values, int size) { int i; double sum = 0.0; int mean = int_mean(values, size); for (i = 0; i < size; ++i) sum += (double)((values[i] - mean) * (values[i] - mean)); return sum / (double)(size - 1); } /* * return the variance of an array of uint64s */ double uint64_variance(uint64 *values, int size) { int i; double sum = 0.0; uint64 mean = uint64_mean(values, size); for (i = 0; i < size; ++i) sum += (double)((values[i] - mean) * (values[i] - mean)); return sum / (double)(size - 1); } /* * return the variance of an array of doubles */ double double_variance(double *values, int size) { int i; double sum = 0.0; double mean = double_mean(values, size); for (i = 0; i < size; ++i) sum += (double)((values[i] - mean) * (values[i] - mean)); return sum / (double)(size - 1); } /* * return the moment of an array of ints * * Reference: "Statistics for Experimenters" by * George E.P. Box et. al., page 41, 90 */ double int_moment(int moment, int *values, int size) { int i, j; double sum = 0.0; int mean = int_mean(values, size); for (i = 0; i < size; ++i) { double diff = values[i] - mean; double m = diff; for (j = 1; j < moment; ++j) m *= diff; sum += m; } return sum / (double)size; } /* * return the moment of an array of uint64s */ double uint64_moment(int moment, uint64 *values, int size) { int i, j; double sum = 0.0; uint64 mean = uint64_mean(values, size); for (i = 0; i < size; ++i) { double diff = values[i] - mean; double m = diff; for (j = 1; j < moment; ++j) m *= diff; sum += m; } return sum / (double)size; } /* * return the moment of an array of doubles */ double double_moment(int moment, double *values, int size) { int i, j; double sum = 0.0; double mean = double_mean(values, size); for (i = 0; i < size; ++i) { double diff = values[i] - mean; double m = diff; for (j = 1; j < moment; ++j) m *= diff; sum += m; } return sum / (double)size; } /* * return the standard error of an array of ints * * Reference: "Statistics for Experimenters" by * George E.P. 
Box et. al., page 41 */ double int_stderr(int *values, int size) { return sqrt(int_variance(values, size)); } /* * return the standard error of an array of uint64s */ double uint64_stderr(uint64 *values, int size) { return sqrt(uint64_variance(values, size)); } /* * return the standard error of an array of doubles */ double double_stderr(double *values, int size) { return sqrt(double_variance(values, size)); } /* * return the skew of an array of ints * */ double int_skew(int *values, int size) { double sigma = int_stderr(values, size); double moment3 = int_moment(3, values, size); return moment3 / (sigma * sigma * sigma); } /* * return the skew of an array of uint64s */ double uint64_skew(uint64 *values, int size) { double sigma = uint64_stderr(values, size); double moment3 = uint64_moment(3, values, size); return moment3 / (sigma * sigma * sigma); } /* * return the skew of an array of doubles */ double double_skew(double *values, int size) { double sigma = double_stderr(values, size); double moment3 = double_moment(3, values, size); return moment3 / (sigma * sigma * sigma); } /* * return the kurtosis of an array of ints * * Reference: "Statistics for Experimenters" by * George E.P. Box et. al., page 90; */ double int_kurtosis(int *values, int size) { double variance = int_variance(values, size); double moment4 = int_moment(4, values, size); return moment4 / (variance * variance) - 3; } /* * return the kurtosis of an array of uint64s */ double uint64_kurtosis(uint64 *values, int size) { double variance = uint64_variance(values, size); double moment4 = uint64_moment(4, values, size); return moment4 / (variance * variance) - 3; } /* * return the kurtosis of an array of doubles */ double double_kurtosis(double *values, int size) { double variance = double_variance(values, size); double moment4 = double_moment(4, values, size); return moment4 / (variance * variance) - 3; } /* * BOOTSTRAP: * * stderr = sqrt(sum_i(s[i] - sum_j(s[j])/B)**2 / (B - 1)) * * Reference: "An Introduction to the Bootstrap" by Bradley * Efron and Robert J. Tibshirani, page 12. 
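 *
 * In other words: draw BOOTSTRAP_COUNT resamples (with replacement)
 * from the original sample, evaluate the statistic f on each resample,
 * and report the standard deviation of those BOOTSTRAP_COUNT values.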
*/ /* * return the bootstrap estimation of the standard error * of an array of ints */ double int_bootstrap_stderr(int *values, int size, int_stat f) { int i, j; int *samples = (int*)malloc(size * sizeof(int)); double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); double s_sum = 0; double sum = 0; /* generate the stderr for each of the bootstrap samples */ for (i = 0; i < BOOTSTRAP_COUNT; ++i) { for (j = 0; j < size; ++j) samples[j] = values[rand() % size]; s[i] = (double)(*f)(samples, size); s_sum += s[i]; /* CHS: worry about overflow */ } s_sum /= (double)BOOTSTRAP_COUNT; for (i = 0; i < BOOTSTRAP_COUNT; ++i) sum += (s[i] - s_sum) * (s[i] - s_sum); sum /= (double)(BOOTSTRAP_COUNT - 1); free(samples); free(s); return sqrt(sum); } /* * return the bootstrap estimation of the standard error * of an array of uint64s */ double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f) { int i, j; uint64 *samples = (uint64*)malloc(size * sizeof(uint64)); double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); double s_sum; double sum; /* generate the stderr for each of the bootstrap samples */ for (i = 0, s_sum = 0.0; i < BOOTSTRAP_COUNT; ++i) { for (j = 0; j < size; ++j) samples[j] = values[rand() % size]; s[i] = (double)(*f)(samples, size); s_sum += s[i]; /* CHS: worry about overflow */ } s_sum /= (double)BOOTSTRAP_COUNT; for (i = 0, sum = 0.0; i < BOOTSTRAP_COUNT; ++i) sum += (s[i] - s_sum) * (s[i] - s_sum); free(samples); free(s); return sqrt(sum/(double)(BOOTSTRAP_COUNT - 1)); } /* * return the bootstrap estimation of the standard error * of an array of doubles */ double double_bootstrap_stderr(double *values, int size, double_stat f) { int i, j; double *samples = (double*)malloc(size * sizeof(double)); double *s = (double*)malloc(BOOTSTRAP_COUNT * sizeof(double)); double s_sum = 0; double sum = 0; /* generate the stderr for each of the bootstrap samples */ for (i = 0; i < BOOTSTRAP_COUNT; ++i) { for (j = 0; j < size; ++j) samples[j] = values[rand() % size]; s[i] = (*f)(samples, size); s_sum += (double)s[i]; /* CHS: worry about overflow */ } s_sum /= (double)BOOTSTRAP_COUNT; for (i = 0; i < BOOTSTRAP_COUNT; ++i) sum += (s[i] - s_sum) * (s[i] - s_sum); sum /= (double)(BOOTSTRAP_COUNT - 1); free(samples); free(s); return sqrt(sum); } /* * regression(x, y, sig, n, a, b, sig_a, sig_b, chi2) * * This routine is derived from equations in "Numerical Recipes in C" * (second edition) by Press, et. al., pages 661-665. * * compute the linear regression y = a + bx for (x,y), where y[i] has * standard deviation sig[i]. * * returns the coefficients a and b, along with an estimation of their * error (standard deviation) in sig_a and sig_b. * * returns chi2 for "goodness of fit" information. */ void regression(double *x, double *y, double *sig, int n, double *a, double *b, double *sig_a, double *sig_b, double *chi2) { int i; double S = 0.0, Sx = 0.0, Sy = 0.0, Stt = 0.0, Sx_S; /* compute some basic statistics */ for (i = 0; i < n; ++i) { /* Equations 15.2.4: for S, Sx, Sy */ double weight = 1.0 / (sig ? sig[i] * sig[i] : 1.0); S += weight; Sx += weight * x[i]; Sy += weight * y[i]; } *b = 0.0; Sx_S = Sx / S; for (i = 0; i < n; ++i) { /* * Equation 15.2.15 for t * Equation 15.2.16 for Stt * Equation 15.2.17 for b, do summation portion of equation * compute Sum i=0,n-1 (t_i * y[i] / sig[i])) */ double t_i = (x[i] - Sx_S) / (sig ? sig[i] : 1.0); Stt += t_i * t_i; *b += t_i * y[i] / (sig ? 
sig[i] : 1.0); } /* * Equation 15.2.17 for b, do 1/Stt * summation * Equation 15.2.18 for a * Equation 15.2.19 for sig_a * Equation 15.2.20 for sig_b */ *b /= Stt; *a = (Sy - *b * Sx) / S; *sig_a = sqrt((1.0 + (Sx * Sx) / (S * Stt)) / S); *sig_b = sqrt(1.0 / Stt); /* Equation 15.2.2 for chi2, the merit function */ *chi2 = 0.0; for (i = 0; i < n; ++i) { double merit = (y[i] - ((*a) + (*b) * x[i])) / (sig ? sig[i] : 1.0); *chi2 += merit * merit; } if (sig == NULL) { *sig_a *= sqrt((*chi2) / (n - 2)); *sig_b *= sqrt((*chi2) / (n - 2)); } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_tcp.c������������������������������������������������������������������������0000664�0000764�0000764�00000011767�10450256147�015554� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * tcp_lib.c - routines for managing TCP connections. * * Positive port/program numbers are RPC ports, negative ones are TCP ports. * * Copyright (c) 1994-1996 Larry McVoy. */ #define _LIB /* bench.h needs this */ #include "bench.h" /* * Get a TCP socket, bind it, figure out the port, * and advertise the port as program "prog". * * XXX - it would be nice if you could advertise ascii strings. */ int tcp_server(int prog, int rdwr) { int sock; struct sockaddr_in s; #ifdef LIBTCP_VERBOSE fprintf(stderr, "tcp_server(%u, %u)\n", prog, rdwr); #endif if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { perror("socket"); exit(1); } sock_optimize(sock, rdwr); bzero((void*)&s, sizeof(s)); s.sin_family = AF_INET; if (prog < 0) { s.sin_port = htons(-prog); } if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { perror("bind"); exit(2); } if (listen(sock, 100) < 0) { perror("listen"); exit(4); } if (prog > 0) { #ifdef LIBTCP_VERBOSE fprintf(stderr, "Server port %d\n", sockport(sock)); #endif (void)pmap_unset((u_long)prog, (u_long)1); if (!pmap_set((u_long)prog, (u_long)1, (u_long)IPPROTO_TCP, (unsigned short)sockport(sock))) { perror("pmap_set"); exit(5); } } return (sock); } /* * Unadvertise the socket */ int tcp_done(int prog) { if (prog > 0) { pmap_unset((u_long)prog, (u_long)1); } return (0); } /* * Accept a connection and return it */ int tcp_accept(int sock, int rdwr) { struct sockaddr_in s; int newsock; socklen_t namelen; namelen = sizeof(s); bzero((void*)&s, namelen); retry: if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { if (errno == EINTR) goto retry; perror("accept"); exit(6); } #ifdef LIBTCP_VERBOSE fprintf(stderr, "Server newsock port %d\n", sockport(newsock)); #endif sock_optimize(newsock, rdwr); return (newsock); } /* * Connect to the TCP socket advertised as "prog" on "host" and * return the connected socket. * * Hacked Thu Oct 27 1994 to cache pmap_getport calls. This saves * about 4000 usecs in loopback lat_connect calls. I suppose we * should time gethostbyname() & pmap_getprot(), huh? 
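 *
 * Note that the cache below is keyed on the `host' pointer and the
 * `prog' value, so it only helps repeated calls with identical
 * arguments (which is how the benchmarks use it).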
*/ int tcp_connect(char *host, int prog, int rdwr) { static struct hostent *h; static struct sockaddr_in s; static u_short save_port; static u_long save_prog; static char *save_host; int sock; static int tries = 0; if ((sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) < 0) { perror("socket"); exit(1); } if (rdwr & SOCKOPT_PID) { static unsigned short port; struct sockaddr_in sin; if (!port) { port = (unsigned short)(getpid() << 4); if (port < 1024) { port += 1024; } } do { port++; bzero((void*)&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_port = htons(port); } while (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) == -1); } #ifdef LIBTCP_VERBOSE else { struct sockaddr_in sin; bzero((void*)&sin, sizeof(sin)); sin.sin_family = AF_INET; if (bind(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { perror("bind"); exit(2); } } fprintf(stderr, "Client port %d\n", sockport(sock)); #endif sock_optimize(sock, rdwr); if (!h || host != save_host || prog != save_prog) { save_host = host; /* XXX - counting on them not * changing it - benchmark only. */ save_prog = prog; if (!(h = gethostbyname(host))) { perror(host); exit(2); } bzero((void *) &s, sizeof(s)); s.sin_family = AF_INET; bcopy((void*)h->h_addr, (void *)&s.sin_addr, h->h_length); if (prog > 0) { save_port = pmap_getport(&s, prog, (u_long)1, IPPROTO_TCP); if (!save_port) { perror("lib TCP: No port found"); exit(3); } #ifdef LIBTCP_VERBOSE fprintf(stderr, "Server port %d\n", save_port); #endif s.sin_port = htons(save_port); } else { s.sin_port = htons(-prog); } } if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { if (errno == ECONNRESET || errno == ECONNREFUSED || errno == EAGAIN) { close(sock); if (++tries > 10) return(-1); return (tcp_connect(host, prog, rdwr)); } perror("connect"); exit(4); } tries = 0; return (sock); } void sock_optimize(int sock, int flags) { if (flags & SOCKOPT_READ) { int sockbuf = SOCKBUF; while (setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &sockbuf, sizeof(int))) { sockbuf >>= 1; } #ifdef LIBTCP_VERBOSE fprintf(stderr, "sockopt %d: RCV: %dK\n", sock, sockbuf>>10); #endif } if (flags & SOCKOPT_WRITE) { int sockbuf = SOCKBUF; while (setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &sockbuf, sizeof(int))) { sockbuf >>= 1; } #ifdef LIBTCP_VERBOSE fprintf(stderr, "sockopt %d: SND: %dK\n", sock, sockbuf>>10); #endif } if (flags & SOCKOPT_REUSE) { int val = 1; if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) == -1) { perror("SO_REUSEADDR"); } } } int sockport(int s) { socklen_t namelen; struct sockaddr_in sin; namelen = sizeof(sin); if (getsockname(s, (struct sockaddr *)&sin, &namelen) < 0) { perror("getsockname"); return(-1); } return ((int)ntohs(sin.sin_port)); } ���������lmbench-3.0-a9/src/lib_tcp.h������������������������������������������������������������������������0000664�0000764�0000764�00000000500�07163347366�015553� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include <sys/types.h> #include <sys/socket.h> #include <netinet/in.h> #include <netdb.h> #include <arpa/inet.h> int tcp_server(int prog, int rdwr); int tcp_done(int prog); int tcp_accept(int sock, int rdwr); int tcp_connect(char *host, int prog, int rdwr); void sock_optimize(int sock, int rdwr); int sockport(int s); 
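
/*
 * Illustrative usage sketch (not part of lmbench): how a server and a
 * client might pair the calls declared above on a raw TCP port.  The
 * port number -31235 is an arbitrary example; negative values select a
 * plain TCP port, positive values an RPC program number (see lib_tcp.c).
 *
 *	server:
 *		int sock = tcp_server(-31235, SOCKOPT_READ|SOCKOPT_WRITE);
 *		int fd   = tcp_accept(sock, SOCKOPT_READ|SOCKOPT_WRITE);
 *		char	c;
 *		read(fd, &c, 1);
 *		write(fd, &c, 1);
 *		close(fd); close(sock);
 *
 *	client:
 *		char	c = 'x';
 *		int fd = tcp_connect("localhost", -31235,
 *				     SOCKOPT_READ|SOCKOPT_WRITE);
 *		write(fd, &c, 1);
 *		read(fd, &c, 1);
 *		close(fd);
 */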
������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_timing.c���������������������������������������������������������������������0000664�0000764�0000764�00000114222�10620624542�016240� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * a timing utilities library * * Requires 64bit integers to work. * * %W% %@% * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994-1998 Larry McVoy. * Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ #define _LIB /* bench.h needs this */ #include "bench.h" /* #define _DEBUG */ #define nz(x) ((x) == 0 ? 1 : (x)) /* * I know you think these should be 2^10 and 2^20, but people are quoting * disk sizes in powers of 10, and bandwidths are all power of ten. * Deal with it. */ #define MB (1000*1000.0) #define KB (1000.0) static struct timeval start_tv, stop_tv; FILE *ftiming; static volatile uint64 use_result_dummy; static uint64 iterations; static void init_timing(void); #if defined(hpux) || defined(__hpux) #include <sys/mman.h> #endif #ifdef RUSAGE #include <sys/resource.h> #define SECS(tv) (tv.tv_sec + tv.tv_usec / 1000000.0) #define mine(f) (int)(ru_stop.f - ru_start.f) static struct rusage ru_start, ru_stop; void rusage(void) { double sys, user, idle; double per; sys = SECS(ru_stop.ru_stime) - SECS(ru_start.ru_stime); user = SECS(ru_stop.ru_utime) - SECS(ru_start.ru_utime); idle = timespent() - (sys + user); per = idle / timespent() * 100; if (!ftiming) ftiming = stderr; fprintf(ftiming, "real=%.2f sys=%.2f user=%.2f idle=%.2f stall=%.0f%% ", timespent(), sys, user, idle, per); fprintf(ftiming, "rd=%d wr=%d min=%d maj=%d ctx=%d\n", mine(ru_inblock), mine(ru_oublock), mine(ru_minflt), mine(ru_majflt), mine(ru_nvcsw) + mine(ru_nivcsw)); } #endif /* RUSAGE */ void lmbench_usage(int argc, char *argv[], char* usage) { fprintf(stderr,"Usage: %s %s", argv[0], usage); exit(-1); } void sigchld_wait_handler(int signo) { wait(0); signal(SIGCHLD, sigchld_wait_handler); } static int benchmp_sigterm_received; static int benchmp_sigchld_received; static pid_t benchmp_sigalrm_pid; static int benchmp_sigalrm_timeout; void (*benchmp_sigterm_handler)(int); void (*benchmp_sigchld_handler)(int); void (*benchmp_sigalrm_handler)(int); void benchmp_sigterm(int signo) { benchmp_sigterm_received = 1; } void benchmp_sigchld(int signo) { signal(SIGCHLD, SIG_DFL); benchmp_sigchld_received = 1; #ifdef _DEBUG fprintf(stderr, "benchmp_sigchld handler\n"); #endif } void benchmp_sigalrm(int signo) { signal(SIGALRM, SIG_IGN); kill(benchmp_sigalrm_pid, SIGTERM); /* * Since we already waited a full timeout period for the child * to die, we only need to wait a little longer for subsequent * children to die. 
*/ benchmp_sigalrm_timeout = 1; } void benchmp_child(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int childid, int response, int start_signal, int result_signal, int exit_signal, int parallel, iter_t iterations, int repetitions, int enough, void* cookie ); void benchmp_parent(int response, int start_signal, int result_signal, int exit_signal, pid_t* pids, int parallel, iter_t iterations, int warmup, int repetitions, int enough ); int sizeof_result(int repetitions); void benchmp(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int enough, int parallel, int warmup, int repetitions, void* cookie) { iter_t iterations = 1; long i; pid_t *pids = NULL; int response[2]; int start_signal[2]; int result_signal[2]; int exit_signal[2]; #ifdef _DEBUG fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): entering\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie); #endif enough = get_enough(enough); #ifdef _DEBUG fprintf(stderr, "\tenough=%d\n", enough); #endif if (repetitions < 0) repetitions = (1 < parallel || 1000000 <= enough ? 1 : TRIES); /* initialize results */ settime(0); save_n(1); if (parallel > 1) { /* Compute the baseline performance */ benchmp(initialize, benchmark, cleanup, enough, 1, warmup, repetitions, cookie); /* if we can't even do a single job, then give up */ if (gettime() == 0) return; /* calculate iterations for 1sec runtime */ iterations = get_n(); if (enough < SHORT) { double tmp = (double)SHORT * (double)get_n(); tmp /= (double)gettime(); iterations = (iter_t)tmp + 1; } settime(0); save_n(1); } /* Create the necessary pipes for control */ if (pipe(response) < 0 || pipe(start_signal) < 0 || pipe(result_signal) < 0 || pipe(exit_signal) < 0) { #ifdef _DEBUG fprintf(stderr, "BENCHMP: Could not create control pipes\n"); #endif /* _DEBUG */ return; } /* fork the necessary children */ benchmp_sigchld_received = 0; benchmp_sigterm_received = 0; benchmp_sigterm_handler = signal(SIGTERM, benchmp_sigterm); benchmp_sigchld_handler = signal(SIGCHLD, benchmp_sigchld); pids = (pid_t*)malloc(parallel * sizeof(pid_t)); if (!pids) return; bzero((void*)pids, parallel * sizeof(pid_t)); for (i = 0; i < parallel; ++i) { if (benchmp_sigterm_received) goto error_exit; #ifdef _DEBUG fprintf(stderr, "benchmp(%p, %p, %p, %d, %d, %d, %d, %p): creating child %d\n", initialize, benchmark, cleanup, enough, parallel, warmup, repetitions, cookie, i); #endif switch(pids[i] = fork()) { case -1: /* could not open enough children! */ #ifdef _DEBUG fprintf(stderr, "BENCHMP: fork() failed!\n"); #endif /* _DEBUG */ goto error_exit; case 0: /* If child */ close(response[0]); close(start_signal[1]); close(result_signal[1]); close(exit_signal[1]); handle_scheduler(i, 0, 0); benchmp_child(initialize, benchmark, cleanup, i, response[1], start_signal[0], result_signal[0], exit_signal[0], enough, iterations, parallel, repetitions, cookie ); exit(0); default: break; } } close(response[1]); close(start_signal[0]); close(result_signal[0]); close(exit_signal[0]); benchmp_parent(response[0], start_signal[1], result_signal[1], exit_signal[1], pids, parallel, iterations, warmup, repetitions, enough ); goto cleanup_exit; error_exit: /* give the children a chance to clean up gracefully */ signal(SIGCHLD, SIG_DFL); while (--i >= 0) { kill(pids[i], SIGTERM); waitpid(pids[i], NULL, 0); } cleanup_exit: /* * Clean up and kill all children * * NOTE: the children themselves SHOULD exit, and * Killing them could prevent them from * cleanup up subprocesses, etc... 
So, we only * want to kill child processes when it appears * that they will not die of their own accord. * We wait twice the timing interval plus two seconds * for children to die. If they haven't died by * that time, then we start killing them. */ benchmp_sigalrm_timeout = (int)((2 * enough)/1000000) + 2; if (benchmp_sigalrm_timeout < 5) benchmp_sigalrm_timeout = 5; signal(SIGCHLD, SIG_DFL); while (i-- > 0) { /* wait timeout seconds for child to die, then kill it */ benchmp_sigalrm_pid = pids[i]; benchmp_sigalrm_handler = signal(SIGALRM, benchmp_sigalrm); alarm(benchmp_sigalrm_timeout); waitpid(pids[i], NULL, 0); alarm(0); signal(SIGALRM, benchmp_sigalrm_handler); } if (pids) free(pids); #ifdef _DEBUG fprintf(stderr, "benchmp(0x%x, 0x%x, 0x%x, %d, %d, 0x%x): exiting\n", (unsigned int)initialize, (unsigned int)benchmark, (unsigned int)cleanup, enough, parallel, (unsigned int)cookie); #endif } void benchmp_parent( int response, int start_signal, int result_signal, int exit_signal, pid_t* pids, int parallel, iter_t iterations, int warmup, int repetitions, int enough ) { int i, j; int bytes_read; result_t* results = NULL; result_t* merged_results = NULL; char* signals = NULL; unsigned char* buf; fd_set fds_read, fds_error; struct timeval timeout; if (benchmp_sigchld_received || benchmp_sigterm_received) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: entering, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); #endif goto error_exit; } results = (result_t*)malloc(sizeof_result(repetitions)); merged_results = (result_t*)malloc(sizeof_result(parallel * repetitions)); signals = (char*)malloc(parallel * sizeof(char)); if (!results || !merged_results || !signals) return; /* Collect 'ready' signals */ for (i = 0; i < parallel * sizeof(char); i += bytes_read) { bytes_read = 0; FD_ZERO(&fds_read); FD_ZERO(&fds_error); FD_SET(response, &fds_read); FD_SET(response, &fds_error); timeout.tv_sec = 1; timeout.tv_usec = 0; select(response+1, &fds_read, NULL, &fds_error, &timeout); if (benchmp_sigchld_received || benchmp_sigterm_received || FD_ISSET(response, &fds_error)) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: ready, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); #endif goto error_exit; } if (!FD_ISSET(response, &fds_read)) { continue; } bytes_read = read(response, signals, parallel * sizeof(char) - i); if (bytes_read < 0) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: ready, bytes_read=%d, %s\n", bytes_read, strerror(errno)); #endif goto error_exit; } } /* let the children run for warmup microseconds */ if (warmup > 0) { struct timeval delay; delay.tv_sec = warmup / 1000000; delay.tv_usec = warmup % 1000000; select(0, NULL, NULL, NULL, &delay); } /* send 'start' signal */ write(start_signal, signals, parallel * sizeof(char)); /* Collect 'done' signals */ for (i = 0; i < parallel * sizeof(char); i += bytes_read) { bytes_read = 0; FD_ZERO(&fds_read); FD_ZERO(&fds_error); FD_SET(response, &fds_read); FD_SET(response, &fds_error); timeout.tv_sec = 1; timeout.tv_usec = 0; select(response+1, &fds_read, NULL, &fds_error, &timeout); if (benchmp_sigchld_received || benchmp_sigterm_received || FD_ISSET(response, &fds_error)) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: done, benchmp_child_died=%d\n", benchmp_sigchld_received); #endif goto error_exit; } if (!FD_ISSET(response, &fds_read)) { continue; } bytes_read = read(response, signals, parallel * sizeof(char) - i); if (bytes_read < 0) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: done, bytes_read=%d, %s\n", bytes_read, 
strerror(errno)); #endif goto error_exit; } } /* collect results */ insertinit(merged_results); for (i = 0; i < parallel; ++i) { int n = sizeof_result(repetitions); buf = (unsigned char*)results; FD_ZERO(&fds_read); FD_ZERO(&fds_error); /* tell one child to report its results */ write(result_signal, buf, sizeof(char)); for (; n > 0; n -= bytes_read, buf += bytes_read) { bytes_read = 0; FD_SET(response, &fds_read); FD_SET(response, &fds_error); timeout.tv_sec = 1; timeout.tv_usec = 0; select(response+1, &fds_read, NULL, &fds_error, &timeout); if (benchmp_sigchld_received || benchmp_sigterm_received || FD_ISSET(response, &fds_error)) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: results, benchmp_sigchld_received=%d\n", benchmp_sigchld_received); #endif goto error_exit; } if (!FD_ISSET(response, &fds_read)) { continue; } bytes_read = read(response, buf, n); if (bytes_read < 0) { #ifdef _DEBUG fprintf(stderr, "benchmp_parent: results, bytes_read=%d, %s\n", bytes_read, strerror(errno)); #endif goto error_exit; } } for (j = 0; j < results->N; ++j) { insertsort(results->v[j].u, results->v[j].n, merged_results); } } /* we allow children to die now, without it causing an error */ signal(SIGCHLD, SIG_DFL); /* send 'exit' signals */ write(exit_signal, results, parallel * sizeof(char)); /* Compute median time; iterations is constant! */ set_results(merged_results); goto cleanup_exit; error_exit: #ifdef _DEBUG fprintf(stderr, "benchmp_parent: error_exit!\n"); #endif signal(SIGCHLD, SIG_DFL); for (i = 0; i < parallel; ++i) { kill(pids[i], SIGTERM); waitpid(pids[i], NULL, 0); } free(merged_results); cleanup_exit: close(response); close(start_signal); close(result_signal); close(exit_signal); if (results) free(results); if (signals) free(signals); } typedef enum { warmup, timing_interval, cooldown } benchmp_state; typedef struct { benchmp_state state; benchmp_f initialize; benchmp_f benchmark; benchmp_f cleanup; int childid; int response; int start_signal; int result_signal; int exit_signal; int enough; iter_t iterations; int parallel; int repetitions; void* cookie; iter_t iterations_batch; int need_warmup; long i; int r_size; result_t* r; } benchmp_child_state; static benchmp_child_state _benchmp_child_state; int benchmp_childid() { return _benchmp_child_state.childid; } void benchmp_child_sigchld(int signo) { #ifdef _DEBUG fprintf(stderr, "benchmp_child_sigchld handler\n"); #endif if (_benchmp_child_state.cleanup) { signal(SIGCHLD, SIG_DFL); (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); } exit(1); } void benchmp_child_sigterm(int signo) { signal(SIGTERM, SIG_IGN); if (_benchmp_child_state.cleanup) { void (*sig)(int) = signal(SIGCHLD, SIG_DFL); if (sig != benchmp_child_sigchld && sig != SIG_DFL) { signal(SIGCHLD, sig); } (*_benchmp_child_state.cleanup)(0, &_benchmp_child_state); } exit(0); } void* benchmp_getstate() { return ((void*)&_benchmp_child_state); } void benchmp_child(benchmp_f initialize, benchmp_f benchmark, benchmp_f cleanup, int childid, int response, int start_signal, int result_signal, int exit_signal, int enough, iter_t iterations, int parallel, int repetitions, void* cookie ) { iter_t iterations_batch = (parallel > 1) ? 
get_n() : 1; _benchmp_child_state.state = warmup; _benchmp_child_state.initialize = initialize; _benchmp_child_state.benchmark = benchmark; _benchmp_child_state.cleanup = cleanup; _benchmp_child_state.childid = childid; _benchmp_child_state.response = response; _benchmp_child_state.start_signal = start_signal; _benchmp_child_state.result_signal = result_signal; _benchmp_child_state.exit_signal = exit_signal; _benchmp_child_state.enough = enough; _benchmp_child_state.iterations = iterations; _benchmp_child_state.iterations_batch = iterations_batch; _benchmp_child_state.parallel = parallel; _benchmp_child_state.repetitions = repetitions; _benchmp_child_state.cookie = cookie; _benchmp_child_state.need_warmup = 1; _benchmp_child_state.i = 0; _benchmp_child_state.r_size = sizeof_result(repetitions); _benchmp_child_state.r = (result_t*)malloc(_benchmp_child_state.r_size); if (!_benchmp_child_state.r) return; insertinit(_benchmp_child_state.r); set_results(_benchmp_child_state.r); if (benchmp_sigchld_handler != SIG_DFL) { signal(SIGCHLD, benchmp_sigchld_handler); } else { signal(SIGCHLD, benchmp_child_sigchld); } if (initialize) (*initialize)(0, cookie); if (benchmp_sigterm_handler != SIG_DFL) { signal(SIGTERM, benchmp_sigterm_handler); } else { signal(SIGTERM, benchmp_child_sigterm); } if (benchmp_sigterm_received) benchmp_child_sigterm(SIGTERM); /* start experiments, collecting results */ insertinit(_benchmp_child_state.r); while (1) { (*benchmark)(benchmp_interval(&_benchmp_child_state), cookie); } } iter_t benchmp_interval(void* _state) { char c; iter_t iterations; double result; fd_set fds; struct timeval timeout; benchmp_child_state* state = (benchmp_child_state*)_state; iterations = (state->state == timing_interval ? state->iterations : state->iterations_batch); if (state->need_warmup) { /* remove spurious compilation warning */ result = state->enough; } else { result = stop(0,0); if (state->cleanup) { if (benchmp_sigchld_handler == SIG_DFL) signal(SIGCHLD, SIG_DFL); (*state->cleanup)(iterations, state->cookie); } save_n(state->iterations); result -= t_overhead() + get_n() * l_overhead(); settime(result >= 0. ? (uint64)result : 0.); } /* if the parent died, then give up */ if (getppid() == 1 && state->cleanup) { if (benchmp_sigchld_handler == SIG_DFL) signal(SIGCHLD, SIG_DFL); (*state->cleanup)(0, state->cookie); exit(0); } timeout.tv_sec = 0; timeout.tv_usec = 0; FD_ZERO(&fds); switch (state->state) { case warmup: iterations = state->iterations_batch; FD_SET(state->start_signal, &fds); select(state->start_signal+1, &fds, NULL, NULL, &timeout); if (FD_ISSET(state->start_signal, &fds)) { state->state = timing_interval; read(state->start_signal, &c, sizeof(char)); iterations = state->iterations; } if (state->need_warmup) { state->need_warmup = 0; /* send 'ready' */ write(state->response, &c, sizeof(char)); } break; case timing_interval: iterations = state->iterations; if (state->parallel > 1 || result > 0.95 * state->enough) { insertsort(gettime(), get_n(), get_results()); state->i++; /* we completed all the experiments, return results */ if (state->i >= state->repetitions) { state->state = cooldown; } } if (state->parallel == 1 && (result < 0.99 * state->enough || result > 1.2 * state->enough)) { if (result > 150.) { double tmp = iterations / result; tmp *= 1.1 * state->enough; iterations = (iter_t)(tmp + 1); } else { iterations <<= 3; if (iterations > 1<<27 || (result < 0. 
&& iterations > 1<<20)) { state->state = cooldown; } } } state->iterations = iterations; if (state->state == cooldown) { /* send 'done' */ write(state->response, (void*)&c, sizeof(char)); iterations = state->iterations_batch; } break; case cooldown: iterations = state->iterations_batch; FD_SET(state->result_signal, &fds); select(state->result_signal+1, &fds, NULL, NULL, &timeout); if (FD_ISSET(state->result_signal, &fds)) { /* * At this point all children have stopped their * measurement loops, so we can block waiting for * the parent to tell us to send our results back. * From this point on, we will do no more "work". */ read(state->result_signal, (void*)&c, sizeof(char)); write(state->response, (void*)get_results(), state->r_size); if (state->cleanup) { if (benchmp_sigchld_handler == SIG_DFL) signal(SIGCHLD, SIG_DFL); (*state->cleanup)(0, state->cookie); } /* Now wait for signal to exit */ read(state->exit_signal, (void*)&c, sizeof(char)); exit(0); } }; if (state->initialize) { (*state->initialize)(iterations, state->cookie); } start(0); return (iterations); } /* * Redirect output someplace else. */ void timing(FILE *out) { ftiming = out; } /* * Start timing now. */ void start(struct timeval *tv) { if (tv == NULL) { tv = &start_tv; } #ifdef RUSAGE getrusage(RUSAGE_SELF, &ru_start); #endif (void) gettimeofday(tv, (struct timezone *) 0); } /* * Stop timing and return real time in microseconds. */ uint64 stop(struct timeval *begin, struct timeval *end) { if (end == NULL) { end = &stop_tv; } (void) gettimeofday(end, (struct timezone *) 0); #ifdef RUSAGE getrusage(RUSAGE_SELF, &ru_stop); #endif if (begin == NULL) { begin = &start_tv; } return (tvdelta(begin, end)); } uint64 now(void) { struct timeval t; uint64 m; (void) gettimeofday(&t, (struct timezone *) 0); m = t.tv_sec; m *= 1000000; m += t.tv_usec; return (m); } double Now(void) { struct timeval t; (void) gettimeofday(&t, (struct timezone *) 0); return (t.tv_sec * 1000000.0 + t.tv_usec); } uint64 delta(void) { static struct timeval last; struct timeval t; struct timeval diff; uint64 m; (void) gettimeofday(&t, (struct timezone *) 0); if (last.tv_usec) { tvsub(&diff, &t, &last); last = t; m = diff.tv_sec; m *= 1000000; m += diff.tv_usec; return (m); } else { last = t; return (0); } } double Delta(void) { struct timeval t; struct timeval diff; (void) gettimeofday(&t, (struct timezone *) 0); tvsub(&diff, &t, &start_tv); return (diff.tv_sec + diff.tv_usec / 1000000.0); } void save_n(uint64 n) { iterations = n; } uint64 get_n(void) { return (iterations); } /* * Make the time spend be usecs. 
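 *
 * (Added note, illustrative only.)  settime() and save_n() let a caller
 * record an externally measured interval so that the reporting helpers
 * below can be reused, e.g.:
 *
 *	save_n(1000);			... 1000 iterations ...
 *	settime(250000);		... took 0.25 seconds total
 *	micro("null call", get_n());	prints "null call: 250.0000 microseconds"
 *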
*/ void settime(uint64 usecs) { bzero((void*)&start_tv, sizeof(start_tv)); stop_tv.tv_sec = usecs / 1000000; stop_tv.tv_usec = usecs % 1000000; } void bandwidth(uint64 bytes, uint64 times, int verbose) { struct timeval tdiff; double mb, secs; tvsub(&tdiff, &stop_tv, &start_tv); secs = tdiff.tv_sec; secs *= 1000000; secs += tdiff.tv_usec; secs /= 1000000; secs /= times; mb = bytes / MB; if (!ftiming) ftiming = stderr; if (verbose) { (void) fprintf(ftiming, "%.4f MB in %.4f secs, %.4f MB/sec\n", mb, secs, mb/secs); } else { if (mb < 1) { (void) fprintf(ftiming, "%.6f ", mb); } else { (void) fprintf(ftiming, "%.2f ", mb); } if (mb / secs < 1) { (void) fprintf(ftiming, "%.6f\n", mb/secs); } else { (void) fprintf(ftiming, "%.2f\n", mb/secs); } } } void kb(uint64 bytes) { struct timeval td; double s, bs; tvsub(&td, &stop_tv, &start_tv); s = td.tv_sec + td.tv_usec / 1000000.0; bs = bytes / nz(s); if (s == 0.0) return; if (!ftiming) ftiming = stderr; (void) fprintf(ftiming, "%.0f KB/sec\n", bs / KB); } void mb(uint64 bytes) { struct timeval td; double s, bs; tvsub(&td, &stop_tv, &start_tv); s = td.tv_sec + td.tv_usec / 1000000.0; bs = bytes / nz(s); if (s == 0.0) return; if (!ftiming) ftiming = stderr; (void) fprintf(ftiming, "%.2f MB/sec\n", bs / MB); } void latency(uint64 xfers, uint64 size) { struct timeval td; double s; if (!ftiming) ftiming = stderr; tvsub(&td, &stop_tv, &start_tv); s = td.tv_sec + td.tv_usec / 1000000.0; if (s == 0.0) return; if (xfers > 1) { fprintf(ftiming, "%d %dKB xfers in %.2f secs, ", (int) xfers, (int) (size / KB), s); } else { fprintf(ftiming, "%.1fKB in ", size / KB); } if ((s * 1000 / xfers) > 100) { fprintf(ftiming, "%.0f millisec%s, ", s * 1000 / xfers, xfers > 1 ? "/xfer" : "s"); } else { fprintf(ftiming, "%.4f millisec%s, ", s * 1000 / xfers, xfers > 1 ? 
"/xfer" : "s"); } if (((xfers * size) / (MB * s)) > 1) { fprintf(ftiming, "%.2f MB/sec\n", (xfers * size) / (MB * s)); } else { fprintf(ftiming, "%.2f KB/sec\n", (xfers * size) / (KB * s)); } } void context(uint64 xfers) { struct timeval td; double s; tvsub(&td, &stop_tv, &start_tv); s = td.tv_sec + td.tv_usec / 1000000.0; if (s == 0.0) return; if (!ftiming) ftiming = stderr; fprintf(ftiming, "%d context switches in %.2f secs, %.0f microsec/switch\n", (int)xfers, s, s * 1000000 / xfers); } void nano(char *s, uint64 n) { struct timeval td; double micro; tvsub(&td, &stop_tv, &start_tv); micro = td.tv_sec * 1000000 + td.tv_usec; micro *= 1000; if (micro == 0.0) return; if (!ftiming) ftiming = stderr; fprintf(ftiming, "%s: %.2f nanoseconds\n", s, micro / n); } void micro(char *s, uint64 n) { struct timeval td; double micro; tvsub(&td, &stop_tv, &start_tv); micro = td.tv_sec * 1000000 + td.tv_usec; micro /= n; if (micro == 0.0) return; if (!ftiming) ftiming = stderr; fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); #if 0 if (micro >= 100) { fprintf(ftiming, "%s: %.1f microseconds\n", s, micro); } else if (micro >= 10) { fprintf(ftiming, "%s: %.3f microseconds\n", s, micro); } else { fprintf(ftiming, "%s: %.4f microseconds\n", s, micro); } #endif } void micromb(uint64 sz, uint64 n) { struct timeval td; double mb, micro; tvsub(&td, &stop_tv, &start_tv); micro = td.tv_sec * 1000000 + td.tv_usec; micro /= n; mb = sz; mb /= MB; if (micro == 0.0) return; if (!ftiming) ftiming = stderr; if (micro >= 10) { fprintf(ftiming, "%.6f %.0f\n", mb, micro); } else { fprintf(ftiming, "%.6f %.3f\n", mb, micro); } } void milli(char *s, uint64 n) { struct timeval td; uint64 milli; tvsub(&td, &stop_tv, &start_tv); milli = td.tv_sec * 1000 + td.tv_usec / 1000; milli /= n; if (milli == 0.0) return; if (!ftiming) ftiming = stderr; fprintf(ftiming, "%s: %d milliseconds\n", s, (int)milli); } void ptime(uint64 n) { struct timeval td; double s; tvsub(&td, &stop_tv, &start_tv); s = td.tv_sec + td.tv_usec / 1000000.0; if (s == 0.0) return; if (!ftiming) ftiming = stderr; fprintf(ftiming, "%d in %.2f secs, %.0f microseconds each\n", (int)n, s, s * 1000000 / n); } uint64 tvdelta(struct timeval *start, struct timeval *stop) { struct timeval td; uint64 usecs; tvsub(&td, stop, start); usecs = td.tv_sec; usecs *= 1000000; usecs += td.tv_usec; return (usecs); } void tvsub(struct timeval * tdiff, struct timeval * t1, struct timeval * t0) { tdiff->tv_sec = t1->tv_sec - t0->tv_sec; tdiff->tv_usec = t1->tv_usec - t0->tv_usec; if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) { tdiff->tv_sec--; tdiff->tv_usec += 1000000; assert(tdiff->tv_usec >= 0); } /* time shouldn't go backwards!!! 
*/ if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) { tdiff->tv_sec = 0; tdiff->tv_usec = 0; } } uint64 gettime(void) { return (tvdelta(&start_tv, &stop_tv)); } double timespent(void) { struct timeval td; tvsub(&td, &stop_tv, &start_tv); return (td.tv_sec + td.tv_usec / 1000000.0); } static char p64buf[10][20]; static int n; char * p64(uint64 big) { char *s = p64buf[n++]; if (n == 10) n = 0; #ifdef linux { int *a = (int*)&big; if (a[1]) { sprintf(s, "0x%x%08x", a[1], a[0]); } else { sprintf(s, "0x%x", a[0]); } } #endif #ifdef __sgi sprintf(s, "0x%llx", big); #endif return (s); } char * p64sz(uint64 big) { double d = big; char *tags = " KMGTPE"; int t = 0; char *s = p64buf[n++]; if (n == 10) n = 0; while (d > 512) t++, d /= 1024; if (d == 0) { return ("0"); } if (d < 100) { sprintf(s, "%.4f%c", d, tags[t]); } else { sprintf(s, "%.2f%c", d, tags[t]); } return (s); } char last(char *s) { while (*s++) ; return (s[-2]); } uint64 bytes(char *s) { uint64 n; if (sscanf(s, "%llu", &n) < 1) return (0); switch (last(s)) { case 'k': n <<= 10; break; case 'K': n *= 1000; break; case 'm': n <<= 20; break; case 'M': n *= 1000000; break; case 'g': n <<= 30; break; case 'G': n *= 1000000000L; break; } return (n); } void use_int(int result) { use_result_dummy += result; } void use_pointer(void *result) { use_result_dummy += (long)result; } int sizeof_result(int repetitions) { if (repetitions <= TRIES) return (sizeof(result_t)); return (sizeof(result_t) + (repetitions - TRIES) * sizeof(value_t)); } void insertinit(result_t *r) { r->N = 0; } /* biggest to smallest */ void insertsort(uint64 u, uint64 n, result_t *r) { int i, j; if (u == 0) return; #ifdef _DEBUG fprintf(stderr, "\tinsertsort(%llu, %llu, %p)\n", u, n, r); #endif /* _DEBUG */ for (i = 0; i < r->N; ++i) { if (u/(double)n > r->v[i].u/(double)r->v[i].n) { for (j = r->N; j > i; --j) { r->v[j] = r->v[j - 1]; } break; } } r->v[i].u = u; r->v[i].n = n; r->N++; } static result_t _results; static result_t* results = &_results; result_t* get_results() { return (results); } void set_results(result_t *r) { results = r; save_median(); } void save_minimum() { if (results->N == 0) { save_n(1); settime(0); } else { save_n(results->v[results->N - 1].n); settime(results->v[results->N - 1].u); } } void save_median() { int i = results->N / 2; uint64 u, n; if (results->N == 0) { n = 1; u = 0; } else if (results->N % 2) { n = results->v[i].n; u = results->v[i].u; } else { n = (results->v[i].n + results->v[i-1].n) / 2; u = (results->v[i].u + results->v[i-1].u) / 2; } #ifdef _DEBUG fprintf(stderr, "save_median: N=%d, n=%lu, u=%lu\n", results->N, (unsigned long)n, (unsigned long)u); #endif /* _DEBUG */ save_n(n); settime(u); } /* * The inner loop tracks bench.h but uses a different results array. 
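 *
 * (Added, illustrative sketch.)  The result_t helpers above are used in
 * the same pattern that t_overhead() and l_overhead() below follow:
 *
 *	result_t r;
 *	insertinit(&r);
 *	for (i = 0; i < TRIES; ++i) {
 *		... run one timing interval ...
 *		insertsort(gettime(), get_n(), &r);
 *	}
 *	set_results(&r);	set_results() re-runs save_median(), so the
 *				median interval becomes the reported value
 *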
*/ static long * one_op(register long *p) { BENCH_INNER(p = (long *)*p;, 0); return (p); } static long * two_op(register long *p) { BENCH_INNER(p = (long *)*p; p = (long*)*p;, 0); return (p); } static long *p = (long *)&p; static long *q = (long *)&q; double l_overhead(void) { int i; uint64 N_save, u_save; static double overhead; static int initialized = 0; result_t one, two, *r_save; init_timing(); if (initialized) return (overhead); initialized = 1; if (getenv("LOOP_O")) { overhead = atof(getenv("LOOP_O")); } else { r_save = get_results(); N_save = get_n(); u_save = gettime(); insertinit(&one); insertinit(&two); for (i = 0; i < TRIES; ++i) { use_pointer((void*)one_op(p)); if (gettime() > t_overhead()) insertsort(gettime() - t_overhead(), get_n(), &one); use_pointer((void *)two_op(p)); if (gettime() > t_overhead()) insertsort(gettime() - t_overhead(), get_n(), &two); } /* * u1 = (n1 * (overhead + work)) * u2 = (n2 * (overhead + 2 * work)) * ==> overhead = 2. * u1 / n1 - u2 / n2 */ set_results(&one); save_minimum(); overhead = 2. * gettime() / (double)get_n(); set_results(&two); save_minimum(); overhead -= gettime() / (double)get_n(); if (overhead < 0.) overhead = 0.; /* Gag */ set_results(r_save); save_n(N_save); settime(u_save); } return (overhead); } /* * Figure out the timing overhead. This has to track bench.h */ uint64 t_overhead(void) { uint64 N_save, u_save; static int initialized = 0; static uint64 overhead = 0; struct timeval tv; result_t *r_save; init_timing(); if (initialized) return (overhead); initialized = 1; if (getenv("TIMING_O")) { overhead = atof(getenv("TIMING_O")); } else if (get_enough(0) <= 50000) { /* it is not in the noise, so compute it */ int i; result_t r; r_save = get_results(); N_save = get_n(); u_save = gettime(); insertinit(&r); for (i = 0; i < TRIES; ++i) { BENCH_INNER(gettimeofday(&tv, 0), 0); insertsort(gettime(), get_n(), &r); } set_results(&r); save_minimum(); overhead = gettime() / get_n(); set_results(r_save); save_n(N_save); settime(u_save); } return (overhead); } /* * Figure out how long to run it. * If enough == 0, then they want us to figure it out. * If enough is !0 then return it unless we think it is too short. */ static int long_enough; static int compute_enough(); int get_enough(int e) { init_timing(); return (long_enough > e ? 
long_enough : e); } static void init_timing(void) { static int done = 0; if (done) return; done = 1; long_enough = compute_enough(); t_overhead(); l_overhead(); } typedef long TYPE; static TYPE ** enough_duration(register long N, register TYPE ** p) { #define ENOUGH_DURATION_TEN(one) one one one one one one one one one one while (N-- > 0) { ENOUGH_DURATION_TEN(p = (TYPE **) *p;); } return (p); } static uint64 duration(long N) { uint64 usecs; TYPE *x = (TYPE *)&x; TYPE **p = (TYPE **)&x; start(0); p = enough_duration(N, p); usecs = stop(0, 0); use_pointer((void *)p); return (usecs); } /* * find the minimum time that work "N" takes in "tries" tests */ static uint64 time_N(iter_t N) { int i; uint64 usecs; result_t r, *r_save; r_save = get_results(); insertinit(&r); for (i = 1; i < TRIES; ++i) { usecs = duration(N); insertsort(usecs, N, &r); } set_results(&r); save_minimum(); usecs = gettime(); set_results(r_save); return (usecs); } /* * return the amount of work needed to run "enough" microseconds */ static iter_t find_N(int enough) { int tries; static iter_t N = 10000; static uint64 usecs = 0; if (!usecs) usecs = time_N(N); for (tries = 0; tries < 10; ++tries) { if (0.98 * enough < usecs && usecs < 1.02 * enough) return (N); if (usecs < 1000) N *= 10; else { double n = N; n /= usecs; n *= enough; N = n + 1; } usecs = time_N(N); } return (0); } /* * We want to verify that small modifications proportionally affect the runtime */ static double test_points[] = {1.015, 1.02, 1.035}; static int test_time(int enough) { int i; iter_t N; uint64 usecs, expected, baseline, diff; if ((N = find_N(enough)) == 0) return (0); baseline = time_N(N); for (i = 0; i < sizeof(test_points) / sizeof(double); ++i) { usecs = time_N((int)((double) N * test_points[i])); expected = (uint64)((double)baseline * test_points[i]); diff = expected > usecs ? expected - usecs : usecs - expected; if (diff / (double)expected > 0.0025) return (0); } return (1); } /* * We want to find the smallest timing interval that has accurate timing */ static int possibilities[] = { 5000, 10000, 50000, 100000 }; static int compute_enough() { int i; if (getenv("ENOUGH")) { return (atoi(getenv("ENOUGH"))); } for (i = 0; i < sizeof(possibilities) / sizeof(int); ++i) { if (test_time(possibilities[i])) return (possibilities[i]); } /* * if we can't find a timing interval that is sufficient, * then use SHORT as a default. */ return (SHORT); } /* * This stuff isn't really lib_timing, but ... 
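 *
 * (Added note on the calibration code above.)  All three calibrations can
 * be overridden from the environment, which helps make runs repeatable when
 * comparing machines, e.g. with one of the lmbench benchmarks:
 *
 *	ENOUGH=100000 TIMING_O=0 LOOP_O=0 ./lat_syscall null
 *
 * forces a 100ms timing interval and zero timing/loop overhead instead of
 * whatever compute_enough(), t_overhead() and l_overhead() would measure.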
*/ void morefds(void) { #ifdef RLIMIT_NOFILE struct rlimit r; getrlimit(RLIMIT_NOFILE, &r); r.rlim_cur = r.rlim_max; setrlimit(RLIMIT_NOFILE, &r); #endif } /* analogous to bzero, bcopy, etc., except that it just reads * data into the processor */ long bread(void* buf, long nbytes) { long sum = 0; register long *p, *next; register char *end; p = (long*)buf; end = (char*)buf + nbytes; for (next = p + 128; (void*)next <= (void*)end; p = next, next += 128) { sum += p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ p[15]+p[16]+p[17]+p[18]+p[19]+p[20]+p[21]+ p[22]+p[23]+p[24]+p[25]+p[26]+p[27]+p[28]+ p[29]+p[30]+p[31]+p[32]+p[33]+p[34]+p[35]+ p[36]+p[37]+p[38]+p[39]+p[40]+p[41]+p[42]+ p[43]+p[44]+p[45]+p[46]+p[47]+p[48]+p[49]+ p[50]+p[51]+p[52]+p[53]+p[54]+p[55]+p[56]+ p[57]+p[58]+p[59]+p[60]+p[61]+p[62]+p[63]+ p[64]+p[65]+p[66]+p[67]+p[68]+p[69]+p[70]+ p[71]+p[72]+p[73]+p[74]+p[75]+p[76]+p[77]+ p[78]+p[79]+p[80]+p[81]+p[82]+p[83]+p[84]+ p[85]+p[86]+p[87]+p[88]+p[89]+p[90]+p[91]+ p[92]+p[93]+p[94]+p[95]+p[96]+p[97]+p[98]+ p[99]+p[100]+p[101]+p[102]+p[103]+p[104]+ p[105]+p[106]+p[107]+p[108]+p[109]+p[110]+ p[111]+p[112]+p[113]+p[114]+p[115]+p[116]+ p[117]+p[118]+p[119]+p[120]+p[121]+p[122]+ p[123]+p[124]+p[125]+p[126]+p[127]; } for (next = p + 16; (void*)next <= (void*)end; p = next, next += 16) { sum += p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+ p[8]+p[9]+p[10]+p[11]+p[12]+p[13]+p[14]+ p[15]; } for (next = p + 1; (void*)next <= (void*)end; p = next, next++) { sum += *p; } return sum; } void touch(char *buf, size_t nbytes) { static size_t psize; if (!psize) { psize = getpagesize(); } while (nbytes >= psize) { *buf = 1; buf += psize; nbytes -= psize; } } size_t* permutation(size_t max, size_t scale) { size_t i, v, o; static size_t r = 0; size_t* result = (size_t*)malloc(max * sizeof(size_t)); if (result == NULL) return NULL; for (i = 0; i < max; ++i) { result[i] = i * scale; } if (r == 0) r = (getpid()<<6) ^ getppid() ^ rand() ^ (rand()<<10); /* randomize the sequence */ for (i = 0; i < max; ++i) { r = (r << 1) ^ rand(); o = r % max; v = result[o]; result[o] = result[i]; result[i] = v; } #ifdef _DEBUG fprintf(stderr, "permutation(%d): {", max); for (i = 0; i < max; ++i) { fprintf(stderr, "%d", result[i]); if (i < max - 1) fprintf(stderr, ","); } fprintf(stderr, "}\n"); fflush(stderr); #endif /* _DEBUG */ return (result); } int cp(char* src, char* dst, mode_t mode) { int sfd, dfd; char buf[8192]; ssize_t size; if ((sfd = open(src, O_RDONLY)) < 0) { return -1; } if ((dfd = open(dst, O_CREAT|O_TRUNC|O_RDWR, mode)) < 0) { return -1; } while ((size = read(sfd, buf, 8192)) > 0) { if (write(dfd, buf, size) < size) return -1; } fsync(dfd); close(sfd); close(dfd); return 0; } #define BIGSEEK (1<<30) off64_t seekto(int fd, off64_t off, int whence) { #ifdef HAVE_lseek64 return lseek64(fd, off, whence); #else int64 here = 0; int delta = (off >= 0 ? 
BIGSEEK : -BIGSEEK); int v; /* For large files, the return value will be wrong */ switch (whence) { case SEEK_SET: lseek(fd, 0, 0); break; case SEEK_END: if (lseek(fd, 0, SEEK_END) == -1) return ((off64_t)-1); break; case SEEK_CUR: if (off == 0) return lseek(fd, 0, SEEK_CUR); default: break; } /* fprintf(stderr, "seekto(%d, %lld, %d): did initial seek\n", fd, off, whence); */ while ((off - here < delta && delta < 0) || (0 < delta && delta < off - here)) { /* fprintf(stderr, "about to lseek(%d, %d, %d)\n", fd, delta, whence); */ if (lseek(fd, delta, SEEK_CUR) == -1 && errno) return ((off64_t)-1); here += delta; } v = lseek(fd, (int)(off - here), SEEK_CUR); /* fprintf(stderr, "lseek(%d, %d, %d) returned %d\n", fd, (int)(off - here), SEEK_CUR, v); */ if (v != -1 && whence == SEEK_SET) return (off); return ((off64_t)v); #endif } #if defined(hpux) || defined(__hpux) int getpagesize() { return (sysconf(_SC_PAGE_SIZE)); } #endif #ifdef WIN32 int getpagesize() { SYSTEM_INFO s; GetSystemInfo(&s); return ((int)s.dwPageSize); } LARGE_INTEGER getFILETIMEoffset() { SYSTEMTIME s; FILETIME f; LARGE_INTEGER t; s.wYear = 1970; s.wMonth = 1; s.wDay = 1; s.wHour = 0; s.wMinute = 0; s.wSecond = 0; s.wMilliseconds = 0; SystemTimeToFileTime(&s, &f); t.QuadPart = f.dwHighDateTime; t.QuadPart <<= 32; t.QuadPart |= f.dwLowDateTime; return (t); } int gettimeofday(struct timeval *tv, struct timezone *tz) { LARGE_INTEGER t; FILETIME f; double microseconds; static LARGE_INTEGER offset; static double frequencyToMicroseconds; static int initialized = 0; static BOOL usePerformanceCounter = 0; if (!initialized) { LARGE_INTEGER performanceFrequency; initialized = 1; usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency); if (usePerformanceCounter) { QueryPerformanceCounter(&offset); frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.; } else { offset = getFILETIMEoffset(); frequencyToMicroseconds = 10.; } } if (usePerformanceCounter) QueryPerformanceCounter(&t); else { GetSystemTimeAsFileTime(&f); t.QuadPart = f.dwHighDateTime; t.QuadPart <<= 32; t.QuadPart |= f.dwLowDateTime; } t.QuadPart -= offset.QuadPart; microseconds = (double)t.QuadPart / frequencyToMicroseconds; t.QuadPart = microseconds; tv->tv_sec = t.QuadPart / 1000000; tv->tv_usec = t.QuadPart % 1000000; return (0); } #endif ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_udp.c������������������������������������������������������������������������0000664�0000764�0000764�00000003644�10450256150�015543� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * udp_lib.c - routines for managing UDP connections * * %W% %G% * * Copyright (c) 1994 Larry McVoy. */ #define _LIB /* bench.h needs this */ #include "bench.h" /* * Get a UDP socket, bind it, figure out the port, * and advertise the port as program "prog". * * XXX - it would be nice if you could advertise ascii strings. 
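 *
 * Illustrative usage sketch (added; "prog" is the RPC program number and
 * "rdwr" the sock_optimize() flags a caller would normally pass):
 *
 *	server:	sock = udp_server(prog, rdwr);
 *		... service requests on sock ...
 *		udp_done(prog);
 *
 *	client:	sock = udp_connect("server-hostname", prog, rdwr);
 *		write(sock, &request, sizeof(request));
 *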
*/ int udp_server(u_long prog, int rdwr) { int sock; struct sockaddr_in s; if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { perror("socket"); exit(1); } sock_optimize(sock, rdwr); bzero((void*)&s, sizeof(s)); s.sin_family = AF_INET; #ifdef NO_PORTMAPPER s.sin_port = htons(prog); #endif if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { perror("bind"); exit(2); } #ifndef NO_PORTMAPPER (void)pmap_unset(prog, (u_long)1); if (!pmap_set(prog, (u_long)1, (u_long)IPPROTO_UDP, (unsigned short)sockport(sock))) { perror("pmap_set"); exit(5); } #endif return (sock); } /* * Unadvertise the socket */ void udp_done(u_long prog) { (void)pmap_unset(prog, (u_long)1); } /* * "Connect" to the UCP socket advertised as "prog" on "host" and * return the connected socket. */ int udp_connect(char *host, u_long prog, int rdwr) { struct hostent *h; struct sockaddr_in sin; int sock; #ifndef NO_PORTMAPPER u_short port; #endif if ((sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0) { perror("socket"); exit(1); } sock_optimize(sock, rdwr); if (!(h = gethostbyname(host))) { perror(host); exit(2); } bzero((void *) &sin, sizeof(sin)); sin.sin_family = AF_INET; bcopy((void*)h->h_addr, (void *) &sin.sin_addr, h->h_length); #ifdef NO_PORTMAPPER sin.sin_port = htons(prog); #else port = pmap_getport(&sin, prog, (u_long)1, IPPROTO_UDP); if (!port) { perror("lib UDP: No port found"); exit(3); } sin.sin_port = htons(port); #endif if (connect(sock, (struct sockaddr*)&sin, sizeof(sin)) < 0) { perror("connect"); exit(4); } return (sock); } ��������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_udp.h������������������������������������������������������������������������0000664�0000764�0000764�00000000445�10425062635�015551� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#include <sys/types.h> #include <sys/socket.h> #include <netinet/in.h> #include <netdb.h> #include <arpa/inet.h> int udp_server(u_long prog, int rdwr); void udp_done(u_long prog); int udp_connect(char *host, u_long prog, int rdwr); void sock_optimize(int sock, int rdwr); int sockport(int); ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_unix.c�����������������������������������������������������������������������0000664�0000764�0000764�00000003222�10450256150�015726� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * unix_lib.c - routines for managing UNIX connections. * * Positive port/program numbers are RPC ports, negative ones are UNIX ports. * * Copyright (c) 1994-1996 Larry McVoy. */ #define _LIB /* bench.h needs this */ #include "bench.h" /* * Get a UNIX socket, bind it. 
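 *
 * Illustrative usage sketch (added; the socket path is just a placeholder):
 *
 *	server:	sock = unix_server("/tmp/lmbench.sock");
 *		fd = unix_accept(sock);
 *		... read()/write() on fd ...
 *		unix_done(sock, "/tmp/lmbench.sock");
 *
 *	client:	fd = unix_connect("/tmp/lmbench.sock");
 *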
*/ int unix_server(char *path) { int sock; struct sockaddr_un s; #ifdef LIBUNIX_VERBOSE fprintf(stderr, "unix_server(%s, %u)\n", prog, rdwr); #endif if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { perror("socket"); exit(1); } bzero((void*)&s, sizeof(s)); s.sun_family = AF_UNIX; strcpy(s.sun_path, path); if (bind(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { perror("bind"); exit(2); } if (listen(sock, 100) < 0) { perror("listen"); exit(4); } return (sock); } /* * Unadvertise the socket */ int unix_done(int sock, char *path) { close(sock); unlink(path); return (0); } /* * Accept a connection and return it */ int unix_accept(int sock) { struct sockaddr_un s; int newsock; socklen_t namelen; namelen = sizeof(s); bzero((void*)&s, namelen); retry: if ((newsock = accept(sock, (struct sockaddr*)&s, &namelen)) < 0) { if (errno == EINTR) goto retry; perror("accept"); exit(6); } return (newsock); } /* * Connect to the UNIX socket advertised as "path" and * return the connected socket. */ int unix_connect(char *path) { struct sockaddr_un s; int sock; if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { perror("socket"); exit(1); } bzero((void*)&s, sizeof(s)); s.sun_family = AF_UNIX; strcpy(s.sun_path, path); if (connect(sock, (struct sockaddr*)&s, sizeof(s)) < 0) { perror("connect"); exit(4); } return (sock); } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lib_unix.h�����������������������������������������������������������������������0000664�0000764�0000764�00000000275�07045412511�015741� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* lib_unix.c */ #ifndef _LIB_UNIX_H_ #define _LIB_UNIX_H_ int unix_server(char *path); int unix_done(int sock, char *path); int unix_accept(int sock); int unix_connect(char *path); #endif �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/line.c���������������������������������������������������������������������������0000664�0000764�0000764�00000002710�10715547567�015070� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * line.c - guess the cache line size * * usage: line * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
* Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" /* * Assumptions: * * 1) Cache lines are a multiple of pointer-size words * 2) Cache lines are no larger than 1/4 a page size * 3) Pages are an even multiple of cache lines */ int main(int ac, char **av) { int l; int verbose = 0; int warmup = 0; int repetitions = (1000000 <= get_enough(0) ? 1 : TRIES); int c; size_t maxlen = 64 * 1024 * 1024; struct mem_state state; char *usage = "[-v] [-W <warmup>] [-N <repetitions>][-M len[K|M]]\n"; state.line = sizeof(char*); state.pagesize = getpagesize(); while (( c = getopt(ac, av, "avM:W:N:")) != EOF) { switch(c) { case 'v': verbose = 1; break; case 'M': maxlen = bytes(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } if ((l = line_find(maxlen, warmup, repetitions, &state)) > 0) { if (verbose) { printf("cache line size: %d bytes\n", l); } else { printf("%d\n", l); } } return (0); } ��������������������������������������������������������lmbench-3.0-a9/src/lmdd.1���������������������������������������������������������������������������0000664�0000764�0000764�00000006160�07045412511�014760� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������.\" %W% %G% .TH LMDD 1 .SH NAME lmdd \- move io for performance and debugging tests .SH SYNOPSIS .B lmdd [ .IB option = value ] .\|.\|. .SH DESCRIPTION .B lmdd copies a specified input file to a specified output with possible conversions. This program is primarily useful for timing I/O since it prints out the timing statistics after completing. .SH OPTIONS .TP 15 .BI if= name Input file is taken from .IR name ; .I internal is the default. .I internal is a special file that acts like Sun's .IR /dev/zero , i.e., it provides a buffer of zeros without doing a system call to get them. .TP .BI of= name Output file is taken from .IR name ; .I internal is the default. .I internal is a special file that acts like .IR /dev/null , without doing a system call to get rid of the data. .TP .BI bs= n Input and output block size .I n bytes (default 8192). Note that this is different from dd(1), it has a 512 byte default. Also note that the block size can be followed by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), respectively. .TP .BI ipat= n If .B n is non zero, expect a known pattern in the file (see opat). Mismatches will be displayed as "ERROR: off=%d want=%x got=%x". The pattern is a sequence of 4 byte integers with the first 0, second 1, and so on. The default is not to check for the pattern. .TP .BI opat= n If .B n is non zero, generate a known pattern on the output stream. Used for debugging file system correctness. The default is not to generate the pattern. .TP .BI mismatch= n If .B n is non zero, stop at the first mismatched value. Used with ipat. .TP .BI skip= n Skip .IR n "" input blocks before starting copy. .TP .BI fsync= n If .I n is non-zero, call fsync(2) on the output file before exiting or printing timing statistics. .TP .BI sync= n If .I n is non-zero, call sync(2) before exiting or printing timing statistics. 
.TP .BI rand= n This argument, by default off, turns on random behavior. The argument is not a flag, it is a size, that size is used as the upper bound for the seeks. Also note that the block size can be followed by 'k' or 'm' to indicate kilo bytes (*1024) or megabytes (*1024*1024), .TP .BI flush= n If .I n is non-zero and mmap(2) is available, call msync(2) to invalidate the output file. This flushes the file to disk so that you don't have unmount/mount. It is not as good as mount/unmount because it just flushes file pages - it misses the indirect blocks which are still cached. Not supported on all systems, compile time option. .TP .BI rusage= n If .I n is non-zero, print rusage statistics as well as timing statistics. Not supported on all systems, compile time option. .TP .BI count= n Copy only .IR n "" input records. .SH EXAMPLES .LP This is the most common usage, the intent is to measure disk performance. The disk is a spare partition mounted on /spare. .sp .nf .in +4 # mount /spare # lmdd if=internal of=/spare/XXX count=1000 fsync=1 7.81 MB in 3.78 seconds (2.0676 MB/sec) : Flush cache # umount /spare # mount /spare # lmdd if=/spare/XXX of=internal 7.81 MB in 2.83 seconds (2.7611 MB/sec) .in .sp .fi .SH AUTHOR Larry McVoy, lm@sun.com .br Not copyrighted. ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lmdd.c���������������������������������������������������������������������������0000664�0000764�0000764�00000044116�10450256150�015044� 0����������������������������������������������������������������������������������������������������ustar �staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������char *id = "$Id: lmdd.c,v 1.23 1997/12/01 23:47:59 lm Exp $\n"; /* * defaults: * bs=8k * count=forever * if=internal * of=internal * ipat=0 * opat=0 * mismatch=0 * rusage=0 * flush=0 * rand=0 * print=0 * direct=0 * rt=0 * rtmax=0 * wtmax=0 * rtmin=0 * wtmin=0 * label="" * shorthands: * k, m, g are 2^10, 2^20, 2^30 multipliers. * K, M, G are 10^3, 10^6, 10^9 multipliers. * recognizes "internal" as an internal /dev/zero /dev/null file. * * Copyright (c) 1994-1998 by Larry McVoy. All rights reserved. * See the file COPYING for the licensing terms. * * TODO - rewrite this entire thing from scratch. This is disgusting code. 
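 *
 * (Added, illustrative.)  Example run showing the size shorthands:
 *
 *	lmdd if=internal of=/tmp/XXX bs=64k count=1k fsync=1
 *
 * writes 1024 blocks of 64KB (64MB total) from the internal zero source,
 * fsync()s the output file, and prints the bandwidth, much like the
 * EXAMPLES section of lmdd.1.  Note that bs=64K (capital K) would mean
 * 64000 bytes instead.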
*/ #ifndef __Lynx__ #define FLUSH #endif #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <signal.h> #include <string.h> #include <unistd.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/time.h> #include "bench.h" #undef ALIGN #define ALIGN(x, bs) ((x + (bs - 1)) & ~(bs - 1)) #ifdef FLUSH #include <sys/mman.h> #include <sys/stat.h> void flush(void); #endif #define USE_VALLOC #ifdef USE_VALLOC #define VALLOC valloc #else #define VALLOC malloc #endif int awrite, poff, out, Print, Fsync, Sync, Flush, Bsize, ru; uint64 Start, End, Rand, int_count; int hash; int Realtime, Notrunc; int Rtmax, Rtmin, Wtmax, Wtmin; int rthist[12]; /* histogram of read times */ int wthist[12]; /* histogram of write times */ char *Label; uint64 *norepeat; int norepeats = -1; #ifdef USE_BDS bds_msg *m1, *m2; #endif uint64 getarg(); int been_there(uint64 off); int getfile(char *s, int ac, char **av); char *cmds[] = { "bs", /* block size */ "bufs", /* use this many buffers round robin */ "count", /* number of blocks */ #ifdef DBG "debug", /* set external variable "dbg" */ #endif #ifdef O_DIRECT "direct", /* direct I/O on input and output */ "idirect", /* direct I/O on input */ "odirect", /* direct I/O on output */ #endif #ifdef FLUSH "flush", /* map in out and invalidate (flush) */ #endif "fork", /* fork to do write I/O */ "fsync", /* fsync output before exit */ "if", /* input file */ "ipat", /* check input for pattern */ "label", /* prefix print out with this */ "mismatch", /* stop at first mismatch */ "move", /* instead of count, limit transfer to this */ "of", /* output file */ "opat", /* generate pattern on output */ "print", /* report type */ "rand", /* do randoms over the specified size */ /* must be power of two, not checked */ "poff", /* Print the offsets as we do the io. */ #ifdef RUSAGE "rusage", /* dump rusage stats */ #endif "skip", /* skip this number of blocks */ "sync", /* sync output before exit */ "touch", /* touch each buffer after the I/O */ #if !defined(hpux) "usleep", /* sleep this many usecs between I/O */ #endif "hash", /* hash marks like FTP */ "append", /* O_APPEND */ "rtmax", /* read latency histogram max in mills */ "wtmax", /* write latency histogram max in mills */ "rtmin", /* read latency histogram max in mills */ "wtmin", /* write latency histogram max in mills */ "realtime", /* create files as XFS realtime files */ "notrunc", /* overwrite rather than truncing out file */ "end", /* limit randoms to this size near the * Rand endpoints. */ "start", /* Add this to Rand */ "time", /* Run for this many seconds only. */ "srand", /* Seed the random number generator */ "padin", /* Pad an extra untimed block_size read */ #ifdef USE_BDS "awrite", /* use async writes and pipeline them. 
*/ #endif "norepeat", /* don't ever do the same I/O twice */ #ifdef sgi "mpin", /* pin the buffer */ #endif "timeopen", /* include open time in results */ "nocreate", /* just open for writing, don't create/trunc it */ #ifdef O_SYNC "osync", /* O_SYNC */ #endif 0, }; void error(char *); void done(); #ifdef DBG extern int dbg; #endif int main(int ac, char **av) { uint *buf; uint *bufs[10]; int nbufs, nextbuf = 0; int Fork, misses, mismatch, outpat, inpat, in, timeopen, gotcnt; int slp; uint64 skip, size, count; void chkarg(); int i; uint64 off = 0; int touch; int time; int mills; int pad_in; int pid = 0; struct timeval start_tv; struct timeval stop_tv; if (sizeof(int) != 4) { fprintf(stderr, "sizeof(int) != 4\n"); exit(1); } for (i = 1; i < ac; ++i) { chkarg(av[i]); } signal(SIGINT, done); signal(SIGALRM, done); misses = mismatch = getarg("mismatch=", ac, av); inpat = getarg("ipat=", ac, av); outpat = getarg("opat=", ac, av); Bsize = getarg("bs=", ac, av); if (Bsize < 0) Bsize = 8192; #if !defined(hpux) slp = getarg("usleep=", ac, av); #endif Fork = getarg("fork=", ac, av); Fsync = getarg("fsync=", ac, av); Sync = getarg("sync=", ac, av); Rand = getarg("rand=", ac, av); Start = getarg("start=", ac, av); End = getarg("end=", ac, av); time = getarg("time=", ac, av); if ((End != -1) && (Rand != -1) && (End > Rand)) { End = Rand; } if (getarg("srand=", ac, av) != -1) { srand48((long)getarg("srand=", ac, av)); } poff = getarg("poff=", ac, av) != -1; Print = getarg("print=", ac, av); nbufs = getarg("bufs=", ac, av); Realtime = getarg("realtime=", ac, av); Rtmax = getarg("rtmax=", ac, av); if ((Rtmax != -1) && (Rtmax < 10)) Rtmax = 10; Rtmin = getarg("rtmin=", ac, av); if ((Rtmax != -1) && (Rtmin == -1)) { Rtmin = 0; } Wtmax = getarg("wtmax=", ac, av); if ((Wtmax != -1) && (Wtmax < 10)) Wtmax = 10; Wtmin = getarg("wtmin=", ac, av); if ((Wtmax != -1) && (Wtmin == -1)) { Wtmin = 0; } if ((Rtmin && !Rtmax) || (Wtmin && !Wtmax)) { fprintf(stderr, "Need a max to go with that min.\n"); exit(1); } if ((Rtmin > Rtmax) || (Wtmin > Wtmax)) { fprintf(stderr, "min has to be less than max, R=%d,%d W=%d,%d\n", Rtmax, Rtmin, Wtmax, Wtmin); exit(1); } timeopen = getarg("timeopen=", ac, av); pad_in = getarg("padin=", ac, av); if (pad_in == -1) pad_in = 0; if (nbufs == -1) nbufs = 1; if (nbufs > 10) { printf("Too many bufs\n"); exit(1); } #ifdef DBG dbg = getarg("debug=", ac, av) != -1; #endif #ifdef RUSAGE ru = getarg("rusage=", ac, av); #endif touch = getarg("touch=", ac, av) != -1; hash = getarg("hash=", ac, av) != (uint64)-1; Label = (char *)getarg("label=", ac, av); count = getarg("count=", ac, av); size = getarg("move=", ac, av); if (size != (uint64)-1) count = size / Bsize; if (Rand != -1) { size = Rand - Bsize; size = ALIGN(size, Bsize); } #ifdef FLUSH Flush = getarg("flush=", ac, av); #endif if (count == (uint64)-1) gotcnt = 0; else gotcnt = 1; int_count = 0; skip = getarg("skip=", ac, av); if (getarg("norepeat=", ac, av) != -1) { if (gotcnt) { norepeat = (uint64*)calloc(count, sizeof(uint64)); } else { norepeat = (uint64*)calloc(10<<10, sizeof(uint64)); } } if ((inpat != -1 || outpat != -1) && (Bsize & 3)) { fprintf(stderr, "Block size 0x%x must be word aligned\n", Bsize); exit(1); } if ((Bsize >> 2) == 0) { fprintf(stderr, "Block size must be at least 4.\n"); exit(1); } for (i = 0; i < nbufs; i++) { if (!(bufs[i] = (uint *) VALLOC((unsigned) Bsize))) { perror("VALLOC"); exit(1); } bzero((char *) bufs[i], Bsize); #ifdef sgi if (getarg("mpin=", ac, av) != -1) { if (mpin((void *)bufs[i], (size_t)Bsize)) { 
perror("mpin for adam"); } } #endif } if (time != -1) { alarm(time); } if (timeopen != -1) { start(NULL); } in = getfile("if=", ac, av); out = getfile("of=", ac, av); if (timeopen == -1) { start(NULL); } if ((Rtmax != -1) && in < 0) { fprintf(stderr, "I think you wanted wtmax, not rtmax\n"); exit(1); } if ((Wtmax != -1) && out < 0) { fprintf(stderr, "I think you wanted rtmax, not wtmax\n"); exit(1); } if (skip != (uint64)-1) { off = skip; off *= Bsize; if (in >= 0) { seekto(in, off, 0); } if (out >= 0) { seekto(out, off, 0); } if (poff) { fprintf(stderr, "%s ", p64sz(off)); } } for (;;) { register int moved; if (gotcnt && count-- <= 0) { done(); } /* * If End is set, it means alternate back and forth * between the end points of Rand, doing randoms within * the area 0..End and Rand-End..Rand */ if (End != -1) { static uint64 start = 0; start = start ? 0 : Rand - End; do { off = drand48() * End; off = ALIGN(off, Bsize); off += start; if (Start != -1) { off += Start; } } while (norepeat && been_there(off)); if (norepeat) { norepeat[norepeats++] = off; if (!gotcnt && (norepeats == 10<<10)) { norepeats = 0; } } if (in >= 0) { seekto(in, off, 0); } if (out >= 0) { seekto(out, off, 0); } } /* * Set the seek pointer if doing randoms */ else if (Rand != -1) { do { off = drand48() * (size - Bsize); if (Start != -1) { off += Start; } off = ALIGN(off, Bsize); } while (norepeat && been_there(off)); if (norepeat) { norepeat[norepeats++] = off; } if (!gotcnt && (norepeats == 10<<10)) { norepeats = 0; } if (in >= 0) { seekto(in, off, 0); } if (out >= 0) { seekto(out, off, 0); } } if (poff) { fprintf(stderr, "%s ", p64sz(off)); } buf = bufs[nextbuf]; if (++nextbuf == nbufs) nextbuf = 0; if (in >= 0) { if ((Rtmax != -1) || (Rtmin != -1)) { start(&start_tv); } moved = read(in, buf, Bsize); if (pad_in) { /* ignore this run, restart clock */ pad_in = 0; count++; start(NULL); continue; } if ((Rtmax != -1) || (Rtmin != -1)) { int mics = stop(&start_tv, &stop_tv); mills = mics / 1000; if ((mills > Rtmax) || (mills < Rtmin)) { fprintf(stderr, "READ: %.02f milliseconds offset %s\n", ((float)mics) / 1000, p64sz(seekto(in, 0, SEEK_CUR))); } /* * Put this read time in the histogram. * The buckets are each 1/10th of Rtmax. */ if (mills >= Rtmax) { rthist[11]++; } else if (mills < Rtmin) { rthist[0]++; } else { int step = (Rtmax - Rtmin) / 10; int i; for (i = 1; i <= 10; ++i) { if (mills < i * step + Rtmin) { rthist[i]++; break; } } } } } else { moved = Bsize; } if (moved == -1) { perror("read"); } if (moved <= 0) { done(); } if (inpat != -1) { register int foo, cnt; for (foo = 0, cnt = moved/sizeof(int); cnt--; foo++) { if (buf[foo] != (uint) (off + foo*sizeof(int))) { fprintf(stderr, "off=%u want=%x got=%x\n", (uint)off, (uint)(off + foo*sizeof(int)), buf[foo]); if (mismatch != -1 && --misses == 0) { done(); } } } } if ((in >= 0) && touch) { int i; for (i = 0; i < moved; i += 4096) { ((char *)buf)[i] = 0; } } if (out >= 0) { int moved2; if (Fork != -1) { if (pid) { waitpid(pid, 0, 0); } if ((pid = fork())) { off += moved; int_count += (moved >> 2); continue; } } if (outpat != -1) { register int foo, cnt; for (foo = 0, cnt = moved/sizeof(int); cnt--; foo++) { buf[foo] = (uint)(off + foo*sizeof(int)); } } if ((Wtmax != -1) || (Wtmin != -1)) { start(&start_tv); } #ifdef USE_BDS /* * The first time through, m1 & m2 are null. * The Nth time through, we start the I/O into * m2, and wait on m1, then switch. 
*/ if (awrite) { if (m1) { m2 = bds_awrite(out, buf, moved); moved2 = bds_adone(out, m1); m1 = m2; } else { m1 = bds_awrite(out, buf, moved); goto writedone; } } else { moved2 = write(out, buf, moved); } #else moved2 = write(out, buf, moved); #endif if (moved2 == -1) { perror("write"); } if (moved2 != moved) { fprintf(stderr, "write: wanted=%d got=%d\n", moved, moved2); done(); } if ((Wtmax != -1) || (Wtmin != -1)) { int mics = stop(&start_tv, &stop_tv); mills = mics / 1000; if ((mills > Wtmax) || (mills < Wtmin)) { fprintf(stderr, "WRITE: %.02f milliseconds offset %s\n", ((float)mics) / 1000, p64sz(seekto(out, 0, SEEK_CUR))); } /* * Put this write time in the histogram. * The buckets are each 1/10th of Wtmax. */ if (mills >= Wtmax) { wthist[11]++; } else if (mills < Wtmin) { wthist[0]++; } else { int step = (Wtmax - Wtmin) / 10; int i; for (i = 1; i <= 10; ++i) { if (mills < i * step + Wtmin) { wthist[i]++; break; } } } } if (moved2 == -1) { perror("write"); } if (moved2 != moved) { done(); } if (touch) { int i; for (i = 0; i < moved; i += 4096) { ((char *)buf)[i] = 0; } } } #ifdef USE_BDS writedone: /* for the first async write */ #endif off += moved; int_count += (moved >> 2); #if !defined(hpux) if (slp != -1) { usleep(slp); } #endif if (hash) { fprintf(stderr, "#"); } if (Fork != -1) { exit(0); } } } int been_there(uint64 off) { register int i; for (i = 0; i <= norepeats; ++i) { if (off == norepeat[i]) { fprintf(stderr, "norepeat on %u\n", (uint)off); return (1); } } return (0); } void chkarg(char *arg) { int i; char *a, *b; for (i = 0; cmds[i]; ++i) { for (a = arg, b = cmds[i]; *a && *b && *a == *b; a++, b++) ; if (*a == '=') return; } fprintf(stderr, "Bad arg: %s, possible arguments are: ", arg); for (i = 0; cmds[i]; ++i) { fprintf(stderr, "%s ", cmds[i]); } fprintf(stderr, "\n"); exit(1); /*NOTREACHED*/ } void done(void) { int i; int step; int size; #ifdef USE_BDS if (awrite && m1) { bds_adone(out, m1); } #endif if (Sync > 0) sync(); if (Fsync > 0) fsync(out); #ifdef FLUSH if (Flush > 0) flush(); #endif stop(NULL, NULL); #ifdef RUSAGE if (ru != -1) rusage(); #endif if (hash || poff) { fprintf(stderr, "\n"); } if ((long)Label != -1) { fprintf(stderr, "%s", Label); } int_count <<= 2; switch (Print) { case 0: /* no print out */ break; case 1: /* latency type print out */ latency((uint64)(int_count / Bsize), (uint64)Bsize); break; case 2: /* microsecond per op print out */ micro("", (uint64)(int_count / Bsize)); break; case 3: /* kb / sec print out */ kb(int_count); break; case 4: /* mb / sec print out */ mb(int_count); break; case 5: /* Xgraph output */ bandwidth(int_count, 1, 0); break; default: /* bandwidth print out */ bandwidth(int_count, 1, 1); break; } if (Rtmax != -1) { printf("READ operation latencies\n"); step = (Rtmax - Rtmin) / 10; if (rthist[0]) { printf("%d- ms: %d\n", Rtmin, rthist[0]); } for (i = 1, size = Rtmin; i <= 10; i++, size += step) { if (!rthist[i]) continue; printf("%d to %d ms: %d\n", size, size + step - 1, rthist[i]); } if (rthist[11]) { printf("%d+ ms: %d\n", Rtmax, rthist[11]); } } if (Wtmax != -1) { printf("WRITE operation latencies\n"); step = (Wtmax - Wtmin) / 10; if (wthist[0]) { printf("%d- ms: %d\n", Wtmin, wthist[0]); } for (i = 1, size = Wtmin; i <= 10; i++, size += step) { if (!wthist[i]) continue; printf("%d to %d ms: %d\n", size, size + step - 1, wthist[i]); } if (wthist[11]) { printf("%d+ ms: %d\n", Wtmax, wthist[11]); } } exit(0); } uint64 getarg(char *s, int ac, char **av) { register uint64 len, i; len = strlen(s); for (i = 1; i < ac; ++i) { 
if (!strncmp(av[i], s, len)) { register uint64 bs = bytes(&av[i][len]); if (!strncmp(av[i], "label=", 6)) { return (uint64)(&av[i][len]); /* HACK */ } return (bs); } } return ((uint64)-1); } char *output; int getfile(char *s, int ac, char **av) { register int ret, len, i; int append = getarg("append=", ac, av) != -1; int notrunc = getarg("notrunc=", ac, av) != -1; int nocreate = getarg("nocreate=", ac, av) != -1; #ifdef O_SYNC int osync = getarg("osync=", ac, av) != -1; #endif int oflags; len = strlen(s); for (i = 1; i < ac; ++i) { if (!strncmp(av[i], s, len)) { if (av[i][0] == 'o') { if (!strcmp("of=internal", av[i])) return (-2); if (!strcmp("of=stdout", av[i])) return (1); if (!strcmp("of=1", av[i])) return (1); if (!strcmp("of=-", av[i])) return (1); if (!strcmp("of=stderr", av[i])) return (2); if (!strcmp("of=2", av[i])) return (2); oflags = O_WRONLY; oflags |= (notrunc || append) ? 0 : O_TRUNC; oflags |= nocreate ? 0 : O_CREAT; oflags |= append ? O_APPEND : 0; #ifdef O_SYNC oflags |= osync ? O_SYNC : 0; #endif ret = open(&av[i][len], oflags,0644); #ifdef O_DIRECT if ((getarg("odirect=", ac, av) != -1) || (getarg("direct=", ac, av) != -1)) { close(ret); ret = open(&av[i][len], oflags|O_DIRECT); awrite = getarg("awrite=", ac, av) != -1; } #endif if (ret == -1) error(&av[i][len]); #ifdef F_FSSETXATTR if (Realtime == 1) { struct fsxattr fsxattr; bzero(&fsxattr,sizeof(struct fsxattr)); fsxattr.fsx_xflags = 0x1; if (fcntl(ret,F_FSSETXATTR,&fsxattr)){ printf("WARNING: Could not make %s a real time file\n", &av[i][len]); } } #endif output = &av[i][len]; return (ret); } else { if (!strcmp("if=internal", av[i])) return (-2); if (!strcmp("if=stdin", av[i])) return (0); if (!strcmp("if=0", av[i])) return (0); if (!strcmp("if=-", av[i])) return (0); ret = open(&av[i][len], 0); #ifdef O_DIRECT if ((getarg("idirect=", ac, av) != -1) || (getarg("direct=", ac, av) != -1)) { close(ret); ret = open(&av[i][len], O_RDONLY|O_DIRECT); } #endif if (ret == -1) error(&av[i][len]); return (ret); } } } return (-2); } #ifdef FLUSH int warning(char *s) { if ((long)Label != -1) { fprintf(stderr, "%s: ", Label); } perror(s); return (-1); } void flush(void) { int fd; struct stat sb; caddr_t where; if (output == NULL || (fd = open(output, 2)) == -1) { warning("No output file"); return; } if (fstat(fd, &sb) == -1 || sb.st_size == 0) { warning(output); return; } where = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); msync(where, sb.st_size, MS_INVALIDATE); munmap(where, sb.st_size); } #endif void error(char *s) { if ((long)Label != -1) { fprintf(stderr, "%s: ", Label); } perror(s); exit(1); } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lmbench-3.0-a9/src/lmhttp.c�������������������������������������������������������������������������0000664�0000764�0000764�00000017704�10450256150�015437� 0����������������������������������������������������������������������������������������������������ustar 
�staelin�������������������������staelin����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/* * http_srv.c - simple HTTP "server" * * Only implements the simplest GET operation. * * usage: http_srv [-f#] [-l] [-d] [port] * * Copyright (c) 1994-6 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Other authors: Steve Alexander, sca@sgi.com. */ char *id = "$Id$\n"; #include "bench.h" #ifdef MAP_FILE # define MMAP_FLAGS MAP_FILE|MAP_SHARED #else # define MMAP_FLAGS MAP_SHARED #endif #define MMAPS_BETTER (4<<10) /* mmap is faster for sizes >= this */ #define LOGFILE "/usr/tmp/lmhttp.log" char *buf; char *bufs[3]; int Dflg, dflg, nflg, lflg, fflg, zflg; int data, logfile; void die(); void worker(); char *http_time(void); char *date(time_t *tt); char *type(char *name); int source(int sock); int isdir(char *name); void dodir(char *name, int sock); void fake(int sock, char *buf, int size); void rdwr(int fd, int sock, char *buf); int mmap_rdwr(int from, int to, int size); void logit(int sock, char *name, int size); int main(int ac, char **av) { int i, prog; #ifdef sgi int ncpus = sysmp(MP_NPROCS); #endif for (i = 1; i < ac; ++i) { if (av[i][0] != '-') { break; } switch (av[i][1]) { case 'D': Dflg = 1; break; /* Allow directories */ case 'd': dflg = 1; break; /* debugging */ case 'f': fflg = atoi(&av[i][2]); break; /* # of threads */ case 'l': lflg = 1; break; /* logging */ case 'n': nflg = 1; break; /* fake file i/o */ case 'z': zflg = 1; break; /* all files are 0 size */ default: fprintf(stderr, "Barf.\n"); exit(1); } } if (getenv("DOCROOT")) { if (chdir(getenv("DOCROOT")) == -1) { perror(getenv("DOCROOT")); exit(1); } } if (atoi(av[ac - 1]) != 0) { prog = -atoi(av[ac - 1]); } else { prog = -80; } /* * Steve - why is this here? */ signal(SIGPIPE, SIG_IGN); data = tcp_server(prog, SOCKOPT_REUSE); bufs[0] = valloc(XFERSIZE); bufs[1] = valloc(XFERSIZE); bufs[2] = valloc(XFERSIZE); logfile = open(LOGFILE, O_CREAT|O_APPEND|O_WRONLY, 0666); signal(SIGINT, die); signal(SIGHUP, die); signal(SIGTERM, die); for (i = 1; i < fflg; ++i) { if (fork() <= 0) { break; } } handle_scheduler(i, 0, 0); worker(); return(0); } void worker() { int newdata; int next = 0; for (;;) { buf = bufs[next]; if (++next == 3) next = 0; newdata = tcp_accept(data, SOCKOPT_REUSE); source(newdata); close(newdata); } } /* * "Tue, 28 Jan 97 01:20:30 GMT"; * 012345678901234567890123456 */ char *http_time() { time_t tt; static time_t save_tt; struct tm *t; static struct tm save_tm; static char buf[100]; time(&tt); /* costs 10 usecs */ if (tt == save_tt) { return (buf); } save_tt = tt; t = gmtime(&tt); /* costs 21 usecs */ if (buf[0] && (tt - save_tt < 3600)) { buf[22] = t->tm_sec / 10 + '0'; buf[21] = t->tm_sec % 10 + '0'; save_tm.tm_sec = t->tm_sec; if (save_tm.tm_min == t->tm_min) { return (buf); } } save_tm = *t; /* costs 120 usecs */ strftime(buf, sizeof(buf), "%a, %d %b %y %H:%M:%S %Z", t); return(buf); } /* * Input: dates that are probably within the last year. * Output: Tue, 28 Jan 97 01:20:30 GMT * * Since it costs 150 usecs or so to do this on an Indy, it may pay to * optimize this. 
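 *
 * (Added, a possible optimization sketch only; not what date() does now.)
 * Mirroring http_time() above, the formatted string could be cached and
 * strftime() re-run only when the timestamp actually changes:
 *
 *	static time_t cached_tt;
 *	static char cached[40];
 *	if (*tt != cached_tt) {
 *		cached_tt = *tt;
 *		strftime(cached, sizeof(cached),
 *		    "%a, %d %b %y %H:%M:%S GMT", gmtime(tt));
 *	}
 *	return (cached);
 *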
*/ char * date(time_t *tt) { return "Tue, 28 Jan 97 01:20:30 GMT"; } char * type(char *name) { int len = strlen(name); if (!strcmp(&name[len - 4], ".gif")) { return "image/gif"; } if (!strcmp(&name[len - 5], ".jpeg")) { return "image/jpeg"; } if (!strcmp(&name[len - 5], ".html")) { return "text/html"; } if (Dflg && isdir(name)) { return "text/html"; } return "text/plain"; } /* * Read the file to be transfered. * Write that file on the data socket. * The caller closes the socket. */ int source(int sock) { int fd, n, size; char *s; char file[100]; char hbuf[1024]; struct stat sb; #define name &buf[5] n = read(sock, buf, XFERSIZE); if (n <= 0) { perror("control nbytes"); return (-1); } buf[n] = 0; if (dflg) printf("%.*s\n", n, buf); if (zflg) { return (0); } if (!strncmp(buf, "EXIT", 4)) { exit(0); } if (strncmp(buf, "GET /", 5)) { perror(buf); return(1); } for (s = buf; *s && *s != '\r' && *s != '\n'; s++) ; *s = 0; for (s = name; *s && *s != ' '; s++) ; *s = 0; if (lflg) strncpy(file, name, sizeof(file)); if (dflg) printf("OPEN %s\n", name); fd = open(name, 0); if (fd == -1) { error: perror(name); close(fd); return (1); } if (fstat(fd, &sb) == -1) { if (dflg) printf("Couldn't stat %s\n", name); goto error; } size = sb.st_size; n = sprintf(hbuf, "HTTP/1.0 200 OK\r\n%s\r\nServer: lmhttp/0.1\r\nContent-Type: %s\r\nLast-Modified: %s\r\n\r\n", http_time(), type(name), date(&sb.st_mtime)); if (write(sock, hbuf, n) != n) { goto error; } if (Dflg && isdir(name)) { dodir(name, sock); } else if (nflg) { fake(sock, buf, size); } else if ((size > MMAPS_BETTER)) { /* XXX */ if (mmap_rdwr(fd, sock, size) == -1) { printf("%s mmap failed\n", name); } } else { rdwr(fd, sock, buf); } if (lflg) logit(sock, file, size); close(fd); return(0); } #undef name int isdir(char *name) { struct stat sb; if (stat(name, &sb) == -1) { return(0); } return (S_ISDIR(sb.st_mode)); } #ifdef example <HTML><HEAD> <TITLE>Index of /pub/Linux

Index of /pub/Linux

      Name                   Last modified     Size  Description

[   ] !INDEX 19-Sep-97 03:20 3k [TXT] !INDEX.html 19-Sep-97 03:20 6k #endif void dodir(char *name, int sock) { FILE *p; char buf[1024]; char path[1024]; if (dflg) printf("dodir(%s)\n", name); sprintf(buf, "cd %s && ls -1a", name); p = popen(buf, "r"); if (!p && dflg) printf("Couldn't popen %s\n", buf); sprintf(buf, "\ \nIndex of /%s

Index of /%s

\n", name, name); write(sock, buf, strlen(buf)); while (fgets(buf, sizeof(buf), p)) { buf[strlen(buf) - 1] = 0; sprintf(path, "/%s/%s", name, buf); if (dflg) printf("\t%s\n", path); write(sock, "", 2); write(sock, buf, strlen(buf)); write(sock, "
\n", 9); } pclose(p); } void fake(int sock, char *buf, int size) { int n; while (size > 0) { n = write(sock, buf, size > XFERSIZE ? XFERSIZE : size); if (n == -1) { perror("write on socket"); return; } size -= n; } } void rdwr(int fd, int sock, char *buf) { int nread; while ((nread = read(fd, buf, XFERSIZE)) > 0) { int i; for (i = 0; i < nread; ) { int nwrote = write(sock, buf, nread - i); if (i < 0) { exit(1); } i += nwrote; } } } int mmap_rdwr(int from, int to, int size) { char *buf; int done = 0, wrote; buf = mmap(0, size, PROT_READ, MMAP_FLAGS, from, 0); if ((long)buf == -1) { perror("mmap"); return (-1); } do { wrote = write(to, buf + done, size - done); if (wrote == -1) { perror("write"); break; } done += wrote; } while (done < size); if (munmap(buf, size) == -1) { perror("unmap"); } return (0); } static char logbuf[64<<10]; /* buffer into here */ static int nbytes; /* bytes buffered */ /* * HTTP server logging, compressed format. */ void logit(int sock, char *name, int size) { char buf[1024 + 16]; /* maxpathlen + others */ struct sockaddr_in sin; socklen_t len = sizeof(sin); if (getpeername(sock, (struct sockaddr*)&sin, &len) == -1) { perror("getpeername"); return; } len = sprintf(buf, "%u %u %s %u\n", *((unsigned int*)&sin.sin_addr), (unsigned int)time(0), name, size); if (nbytes + len >= sizeof(logbuf)) { write(logfile, logbuf, nbytes); nbytes = 0; } bcopy(buf, &logbuf[nbytes], len); nbytes += len; } void die() { if (nbytes) { write(logfile, logbuf, nbytes); nbytes = 0; } exit(1); } lmbench-3.0-a9/src/loop_o.c0000664000076400007640000000012107045412511015400 0ustar staelinstaelin#include "bench.h" int main() { printf("%.8f\n", l_overhead()); return (0); } lmbench-3.0-a9/src/memsize.c0000664000076400007640000000737010532366021015576 0ustar staelinstaelin/* * memsize.c - figure out how much memory we have to use. * * Usage: memsize [max_wanted_in_MB] * * Copyright (c) 1995 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. 
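 *
 * Strategy, as implemented below: each candidate size is first probed
 * with malloc() in a forked child, so a failed or fatal allocation
 * cannot take down the parent; the largest size that succeeds is found
 * by a binary search down and then back up; the result is then checked
 * by touching the region one page at a time under a SIGALRM watchdog,
 * stopping once the per-page cost exceeds TOO_LONG microseconds (i.e.
 * the system has started paging).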
*/ char *id = "$Id$\n"; #include "bench.h" #define CHK(x) if ((x) == -1) { perror("x"); exit(1); } #ifndef TOO_LONG #define TOO_LONG 10 /* usecs */ #endif int alarm_triggered = 0; void timeit(char *where, size_t size); static void touchRange(char *p, size_t range, ssize_t stride); int test_malloc(size_t size); void set_alarm(uint64 usecs); void clear_alarm(); int main(int ac, char **av) { char *where; size_t size = 0; size_t max = 0; size_t delta; if (ac == 2) { max = size = bytes(av[1]) * 1024 * 1024; } if (max < 1024 * 1024) { max = size = 1024 * 1024 * 1024; } /* * Binary search down and then binary search up */ for (where = 0; !test_malloc(size); size >>= 1) { max = size; } /* delta = size / (2 * 1024 * 1024) */ for (delta = (size >> 21); delta > 0; delta >>= 1) { uint64 sz = (uint64)size + (uint64)delta * 1024 * 1024; size_t check = sz; if (max < sz) continue; if (check < sz || !test_malloc(sz)) break; size = sz; } if (where = malloc(size)) { timeit(where, size); free(where); } exit (0); } void timeit(char *where, size_t size) { int sum = 0; size_t n; size_t s_prev; size_t range; size_t incr = 1024 * 1024; size_t pagesize = getpagesize(); unsigned long long s; if (size < 1024*1024 - 16*1024) { fprintf(stderr, "Bad size\n"); return; } range = 1024 * 1024; incr = 1024 * 1024; touchRange(where, range, pagesize); for (range += incr; range <= size; range += incr) { n = range / pagesize; set_alarm(n * TOO_LONG); touchRange(where + range - incr, incr, pagesize); clear_alarm(); set_alarm(n * TOO_LONG); start(0); touchRange(where, range, pagesize); sum = stop(0, 0); clear_alarm(); if ((sum / n) > TOO_LONG || alarm_triggered) { size = range - incr; break; } for (s = 8 * 1024 * 1024; s <= range; s_prev = s, s *= 2) if (s < s_prev) break; incr = s / 8; if (range < size && size < range + incr) { incr = size - range; } fprintf(stderr, "%dMB OK\r", (int)(range/(1024*1024))); } fprintf(stderr, "\n"); printf("%d\n", (int)(size>>20)); } static void touchRange(char *p, size_t range, ssize_t stride) { register char *tmp = p + (stride > 0 ? 0 : range - 1); register size_t delta = (stride > 0 ? stride : -stride); while (range > delta - 1 && !alarm_triggered) { *tmp = 0; tmp += stride; range -= delta; } } int test_malloc(size_t size) { int fid[2]; int result; int status; void* p; if (pipe(fid) < 0) { void* p = malloc(size); if (!p) return 0; free(p); return 1; } if (fork() == 0) { close(fid[0]); p = malloc(size); result = (p ? 
1 : 0); write(fid[1], &result, sizeof(int)); close(fid[1]); if (p) free(p); exit(0); } close(fid[1]); if (read(fid[0], &result, sizeof(int)) != sizeof(int)) result = 0; close(fid[0]); wait(&status); return result; } void gotalarm(int s) { alarm_triggered = 1; } void set_alarm(uint64 usecs) { struct itimerval value; struct sigaction sa; alarm_triggered = 0; sa.sa_handler = gotalarm; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGALRM, &sa, 0); value.it_interval.tv_sec = 0; value.it_interval.tv_usec = 0; value.it_value.tv_sec = usecs / 1000000; value.it_value.tv_usec = usecs % 1000000; setitimer(ITIMER_REAL, &value, NULL); } void clear_alarm() { struct itimerval value; value.it_interval.tv_sec = 0; value.it_interval.tv_usec = 0; value.it_value.tv_sec = 0; value.it_value.tv_usec = 0; setitimer(ITIMER_REAL, &value, NULL); } lmbench-3.0-a9/src/mhz.c0000664000076400007640000003362510425062502014723 0ustar staelinstaelin/* * mhz.c - calculate clock rate and megahertz * * Usage: mhz [-c] * ******************************************************************* * * Caveat emptor and other warnings * * This code must be compiled using the optimizer! If you don't * compile this using the optimizer, then many compilers don't * make good use of the registers and your inner loops end up * using stack variables, which is SLOW. * * Also, it is sensitive to other processor load. When running * mhz with "rtprio" (real-time priority), I have never had mhz * make a mistake on my machine. At other times mhz has been * wrong about 10% of the time. * * If there is too much noise/error in the data, then this program * will usually return a clock speed that is too high. * ******************************************************************* * * Constraints * * mhz.c is meant to be platform independent ANSI/C code, and it * has as little platform dependent code as possible. * * This version of mhz is designed to eliminate the variable * instruction counts used by different compilers on different * architectures and instruction sets. It is also structured to * be tightly interlocked so processors with super-scalar elements * or dynamic instructure reorder buffers cannot overlap the * execution of the expressions. * * We have to try and make sure that the code in the various * inner loops does not fall out of the on-chip instruction cache * and that the inner loop variables fit inside the register set. * The i386 only has six addressable registers, so we had to make * sure that the inner loop procedures had fewer variables so they * would not spill onto the stack. * ******************************************************************* * * Algorithm * * We can compute the CPU cycle time if we can get the compiler * to generate (at least) two instruction sequences inside loops * where the inner loop instruction counts are relatively prime. * We have several different loops to increase the chance that * two of them will be relatively prime on any given architecture. * * This technique makes no assumptions about the cost of any single * instruction or the number of instructions used to implement a * given expression. We just hope that the compiler gets at least * two inner loop instruction sequences with lengths that are * relatively prime. The "relatively prime" makes the greatest * common divisor method work. If all the instructions sequences * have a common factor (e.g. 2), then the apparent CPU speed will * be off by that common factor. 
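 * (For instance, if every loop hypothetically contained an even number
 * of instructions, the computed divisor would be two clock ticks and
 * the reported clock rate would be half the true value.)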
Also, if there is too much * variability in the data so there is no apparent least common * multiple within the error bounds set in multiple_approx, then * we simply return the maximum clock rate found in the loops. * * The processor's clock speed is the greatest common divisor * of the execution frequencies of the various loops. For * example, suppose we are trying to compute the clock speed * for a 120Mhz processor, and we have two loops: * SHR --- two cycles to shift right * SHR;ADD --- three cycles to SHR and add * then the expression duration will be: * SHR 11.1ns (2 cycles/SHR) * SHR;ADD 16.6ns (3 cycles/SHR;ADD) * so the greatest common divisor is 5.55ns and the clock speed * is 120Mhz. Aside from extraneous variability added by poor * benchmarking hygiene, this method should always work when we * are able to get loops with cycle counts that are relatively * prime. * * Suppose we are unlucky, and we have our two loops do * not have relatively prime instruction counts. Suppose * our two loops are: * SHR 11.1ns (2 cycles/SHR) * SHR;ADD;SUB 22.2ns (4 cycles/SHR;ADD;SUB) * then the greatest common divisor will be 11.1ns, so the clock * speed will appear to be 60Mhz. * * The loops provided so far should have at least two relatively * prime loops on nearly all architectures. * ******************************************************************* * * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Silicon Graphics is gratefully acknowledged. * Support for this development by Hewlett Packard is gratefully acknowledged. * Support for this development by Sun Microsystems is gratefully acknowledged. * ******************************************************************* */ char *id = "$Id$\n"; #include "bench.h" #include typedef long TYPE; #define TEN(A) A A A A A A A A A A #define HUNDRED(A) TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) \ TEN(A) TEN(A) TEN(A) TEN(A) TEN(A) #define MHZ(M, contents) \ char* \ name_##M() \ { \ return #contents; \ } \ \ TYPE** \ _mhz_##M (register long n, register TYPE **p, \ register TYPE a, register TYPE b) \ { \ for (; n > 0; --n) { \ HUNDRED(contents) \ } \ return p + a + b; \ } \ \ void \ mhz_##M(int enough) \ { \ TYPE __i = 1; \ TYPE *__x=(TYPE *)&__x, **__p=(TYPE **)__x, **__q = NULL; \ _mhz_##M(1, __p, 1, 1); \ BENCH1(__q = _mhz_##M(__n, __p, __i, __i); __n = 1;, enough) \ use_pointer((void*)__q); \ save_n(100 * get_n()); /* # of expressions executed */ \ } MHZ(1, p=(TYPE**)*p;) MHZ(2, a^=a+a;) MHZ(3, a^=a+a+a;) MHZ(4, a>>=b;) MHZ(5, a>>=a+a;) MHZ(6, a^=a< 0 && size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; free(d); /* if the data point is inside the envelope of acceptable * results, then keep it, otherwise discard it */ for (i = 0, tests = 0; i < size; ++i) if (0.05 * median < values[i] && values[i] < 20.0 * median) { if (i > tests) values[tests] = values[i]; tests++; } return tests; } /* * make sure that there are enough points with significantly * different data values (greater than 5% difference) in the * data subset. 
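 *
 * For example, the hypothetical subset {10.0, 10.2, 15.0} contains two
 * classes: 10.0 and 10.2 differ by less than 5% of the median (10.2),
 * while 15.0 differs by more than that.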
*/ int classes(double values[], int size) { int i; double median; double *d = (double *)malloc(size * sizeof(double)); int classid; for (i = 0; i < size; ++i) d[i] = values[i]; qsort(d, size, sizeof(double), double_compare); median = d[size/2]; if (size % 2 == 0) median = (median + d[size/2 - 1]) / 2.0; /* if the difference is less than 1/20th of the median, then * we assume that the two points are the same */ for (i = 1, classid = 1; i < size; ++i) if ((d[i] - d[i-1]) > 0.05 * median) classid++; free(d); return classid; } /* * mode * * return the most common value (within 1MHz) */ int mode(double values[], int n) { int i, n_mode, n_curr; int mode, curr; qsort(values, n, sizeof(double), double_compare); n_mode = 1; n_curr = 1; mode = (int)(values[0] + 0.5); curr = (int)(values[0] + 0.5); for (i = 1; i < n; ++i) { int v = (int)(values[i] + 0.5); if (curr != v) { curr = v; n_curr = 0; } n_curr++; if (n_curr > n_mode) { mode = curr; n_mode = n_curr; } } return mode; } /* * cross_values * * This routine will create new data points by subtracting pairs * of data points. */ void cross_values(double values[], int size, double **cvalues, int *csize) { int i, j; *cvalues = (double *)malloc(size * size * sizeof(double)); *csize = 0; for (i = 0; i < size; ++i) { (*cvalues)[(*csize)++] = values[i]; /* create new points with the differences */ for (j = i + 1; j < size; ++j) { (*cvalues)[(*csize)++] = ABS(values[i] - values[j]); } } } /* * gcd * * return the greatest common divisor of the passed values (within a * margin of error because these are experimental results, not * theoretical numbers). We do this by guessing how many instructions * are in each loop, and then trying to fit a straight line through * the (instruction count, time) points. The regression is of the * form: * * y = a + b * x * * The time for an individual instruction is "b", while "a" should * be 0. The trick is to figure out which guess is the right one! * * We assume that the gcd is the first value at which we have * significantly improved regression fit (as measured by chi2). * * We increase the number of experimental points (and generate * more small points) by adding points for the differences between * measured values (and compute the standard error appropriately). * * We want the regression line to go through the origin, so we * add an artificial point at (0,0) with a tiny standard error. 
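 *
 * Illustration with made-up numbers: loops measured at 10ns and 15ns
 * yield a 5ns cross-difference point; guessing that the 10ns loop is
 * two instructions maps the points to x = {2, 3, 1}, and the fit of
 * y = a + b * x gives a ~= 0 and b ~= 5ns (a 200MHz clock) with an
 * essentially zero chi2, whereas a wrong guess for the shortest loop
 * cannot fit all the points and leaves a visibly larger chi2.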
*/ double gcd(double values[], int size) { /* assumption: shortest inner loop has no more than this many instructions */ #define MAX_COUNT 6 int i, n, count; double min, result, min_chi2 = 0.0, a, b, sig_a, sig_b, chi2; double *y, *x = (double *)malloc(size * size * sizeof(double)); /* find the smallest value */ result = min = double_min(values, size); /* create new points by subtracting each pair of values */ cross_values(values, size, &y, &n); /* make sure the regression goes through the origin */ y[n++] = 0.0; for (count = 1; count < MAX_COUNT; ++count) { /* * given the minimum loop has "count" instructions, * guess how many instructions each other loop contains */ for (i = 0; i < n; ++i) { int m = (int)((double)count * y[i] / min + 0.5); x[i] = (double)m; } /* find the regression of the samples */ regression(x, y, NULL, n, &a, &b, &sig_a, &sig_b, &chi2); if (count == 1 || count * count * chi2 < min_chi2) { result = b; min_chi2 = chi2; } } free(x); free(y); return result; } /* * compute the gcd of many possible combinations of experimental values * and return the mode of the results to reduce the impact * of a few bad experimental measurements on the computed result. * * r - pointer to the array of experimental results * off - offset of the result we want. TRIES-1 == minimum result. */ int compute_mhz(result_t *r) { int i, j, mhz[2], n, subset, ntests; double data[NTESTS], results[1< TRIES/2) data[n++] = r[j].v[r[j].N-1-i].u / (double)r[j].v[r[j].N-1-i].n; if (n < 2 || (n = filter_data(data, n)) < 2 ||classes(data, n) < 2) continue; results[ntests++] = 1.0 / gcd(data, n); } mhz[i] = mode(results, ntests); } /* if the results agree within 1% or 1MHz, accept them */ if (ABS(mhz[0] - mhz[1]) / (double)mhz[0] <= 0.01 || ABS(mhz[0] - mhz[1]) <= 1) return mhz[0]; return -1; } void save_data(result_t* data, result_t* data_save) { int i; for (i = 0; i < NTESTS; ++i) { data_save[i] = data[i]; } } void print_data(double mhz, result_t* data) { int i, j; char *CPU_name = "CPU"; char *uname = "uname"; char *email = "email"; int speed = -1; char *names[NTESTS]; names[0] = name_1(); names[1] = name_2(); names[2] = name_3(); names[3] = name_4(); names[4] = name_5(); names[5] = name_6(); names[6] = name_7(); names[7] = name_8(); names[8] = name_9(); printf("/* \"%s\", \"%s\", \"%s\", %d, %.0f, %d, %f, %lu */\n", CPU_name, uname, email, speed, mhz, get_enough(0), l_overhead(), (unsigned long)t_overhead()); printf("result_t* data[] = { \n"); for (i = 0; i < NTESTS; ++i) { printf("\t/* %s */ { %d, {", names[i], data[i].N); for (j = 0; j < data[i].N; ++j) { printf("\n\t\t{ /* %f */ %lu, %lu}", data[i].v[j].u / (100. * data[i].v[j].n), (unsigned long)data[i].v[j].u, (unsigned long)data[i].v[j].n); if (j < TRIES - 1) printf(", "); } if (i < NTESTS - 1) printf("}},\n"); else printf("}}\n"); } printf("};\n"); } int main(int ac, char **av) { int c, i, j, k, mhz = -1; double runtime; result_t data[NTESTS]; result_t data_save[NTESTS]; char *usage = "[-d] [-c]\n"; putenv("LOOP_O=0.0"); /* should be at most 1% */ runtime = (NTESTS * TRIES * 3 * get_enough(0)) / 1000000.; if (runtime > 3.) 
{ fprintf(stderr, "mhz: should take approximately %.0f seconds\n", runtime); } /* make three efforts to get reliable data */ for (i = 0; i < 3 && mhz < 0; ++i) { /* initialize the data arrays */ for (j = 0; j < NTESTS; ++j) insertinit(&data[j]); /* * collect the data; try to minimize impact of activity bursts * by putting NTESTS in the inner loop so a burst will affect * one data point for all expressions first, rather than all * data points for one expression. */ for (j = 0; j < TRIES; ++j) { for (k = 0; k < NTESTS; ++k) { (*loops[k])(0); insertsort(gettime(), get_n(), &data[k]); } } save_data(data, data_save); mhz = compute_mhz(data); } while (( c = getopt(ac, av, "cd")) != EOF) { switch(c) { case 'c': if (mhz > 0) { printf("%.4f\n", 1000. / (double)mhz); mhz = 0; } break; case 'd': print_data(mhz, data_save); break; default: lmbench_usage(ac, av, usage); break; } } if (mhz < 0) { printf("-1 System too busy\n"); exit(1); } if (mhz > 0) { printf("%d MHz, %.4f nanosec clock\n", mhz, 1000. / (double)mhz); } exit(0); } lmbench-3.0-a9/src/msleep.c0000664000076400007640000000050707045412511015406 0ustar staelinstaelin#include "bench.h" int main(int ac, char **av) { #if defined(sgi) || defined(sun) || defined(linux) usleep(atoi(av[1]) * 1000); return (0); #else fd_set set; int fd; struct timeval tv; tv.tv_sec = 0; tv.tv_usec = atoi(av[1]) * 1000; FD_ZERO(&set); FD_SET(0, &set); select(1, &set, 0, 0, &tv); return (0); #endif } lmbench-3.0-a9/src/names.h0000664000076400007640000001422507045412511015233 0ustar staelinstaelinchar *names[] = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "aa", "ab", "ac", "ad", "ae", "af", "ag", "ah", "ai", "aj", "ak", "al", "am", "an", "ao", "ap", "aq", "ar", "as", "at", "au", "av", "aw", "ax", "ay", "az", "ba", "bb", "bc", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bk", "bl", "bm", "bn", "bo", "bp", "bq", "br", "bs", "bt", "bu", "bv", "bw", "bx", "by", "bz", "ca", "cb", "cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "ck", "cl", "cm", "cn", "co", "cp", "cq", "cr", "cs", "ct", "cu", "cv", "cw", "cx", "cy", "cz", "da", "db", "dc", "dd", "de", "df", "dg", "dh", "di", "dj", "dk", "dl", "dm", "dn", "do", "dp", "dq", "dr", "ds", "dt", "du", "dv", "dw", "dx", "dy", "dz", "ea", "eb", "ec", "ed", "ee", "ef", "eg", "eh", "ei", "ej", "ek", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et", "eu", "ev", "ew", "ex", "ey", "ez", "fa", "fb", "fc", "fd", "fe", "ff", "fg", "fh", "fi", "fj", "fk", "fl", "fm", "fn", "fo", "fp", "fq", "fr", "fs", "ft", "fu", "fv", "fw", "fx", "fy", "fz", "ga", "gb", "gc", "gd", "ge", "gf", "gg", "gh", "gi", "gj", "gk", "gl", "gm", "gn", "go", "gp", "gq", "gr", "gs", "gt", "gu", "gv", "gw", "gx", "gy", "gz", "ha", "hb", "hc", "hd", "he", "hf", "hg", "hh", "hi", "hj", "hk", "hl", "hm", "hn", "ho", "hp", "hq", "hr", "hs", "ht", "hu", "hv", "hw", "hx", "hy", "hz", "ia", "ib", "ic", "id", "ie", "if", "ig", "ih", "ii", "ij", "ik", "il", "im", "in", "io", "ip", "iq", "ir", "is", "it", "iu", "iv", "iw", "ix", "iy", "iz", "ja", "jb", "jc", "jd", "je", "jf", "jg", "jh", "ji", "jj", "jk", "jl", "jm", "jn", "jo", "jp", "jq", "jr", "js", "jt", "ju", "jv", "jw", "jx", "jy", "jz", "ka", "kb", "kc", "kd", "ke", "kf", "kg", "kh", "ki", "kj", "kk", "kl", "km", "kn", "ko", "kp", "kq", "kr", "ks", "kt", "ku", "kv", "kw", "kx", "ky", "kz", "la", "lb", "lc", "ld", "le", "lf", "lg", "lh", "li", "lj", "lk", "ll", "lm", "ln", "lo", "lp", "lq", "lr", "ls", "lt", "lu", "lv", "lw", "lx", 
"ly", "lz", "ma", "mb", "mc", "md", "me", "mf", "mg", "mh", "mi", "mj", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nb", "nc", "nd", "ne", "nf", "ng", "nh", "ni", "nj", "nk", "nl", "nm", "nn", "no", "np", "nq", "nr", "ns", "nt", "nu", "nv", "nw", "nx", "ny", "nz", "oa", "ob", "oc", "od", "oe", "of", "og", "oh", "oi", "oj", "ok", "ol", "om", "on", "oo", "op", "oq", "or", "os", "ot", "ou", "ov", "ow", "ox", "oy", "oz", "pa", "pb", "pc", "pd", "pe", "pf", "pg", "ph", "pi", "pj", "pk", "pl", "pm", "pn", "po", "pp", "pq", "pr", "ps", "pt", "pu", "pv", "pw", "px", "py", "pz", "qa", "qb", "qc", "qd", "qe", "qf", "qg", "qh", "qi", "qj", "qk", "ql", "qm", "qn", "qo", "qp", "qq", "qr", "qs", "qt", "qu", "qv", "qw", "qx", "qy", "qz", "ra", "rb", "rc", "rd", "re", "rf", "rg", "rh", "ri", "rj", "rk", "rl", "rm", "rn", "ro", "rp", "rq", "rr", "rs", "rt", "ru", "rv", "rw", "rx", "ry", "rz", "sa", "sb", "sc", "sd", "se", "sf", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sp", "sq", "sr", "ss", "st", "su", "sv", "sw", "sx", "sy", "sz", "ta", "tb", "tc", "td", "te", "tf", "tg", "th", "ti", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tq", "tr", "ts", "tt", "tu", "tv", "tw", "tx", "ty", "tz", "ua", "ub", "uc", "ud", "ue", "uf", "ug", "uh", "ui", "uj", "uk", "ul", "um", "un", "uo", "up", "uq", "ur", "us", "ut", "uu", "uv", "uw", "ux", "uy", "uz", "va", "vb", "vc", "vd", "ve", "vf", "vg", "vh", "vi", "vj", "vk", "vl", "vm", "vn", "vo", "vp", "vq", "vr", "vs", "vt", "vu", "vv", "vw", "vx", "vy", "vz", "wa", "wb", "wc", "wd", "we", "wf", "wg", "wh", "wi", "wj", "wk", "wl", "wm", "wn", "wo", "wp", "wq", "wr", "ws", "wt", "wu", "wv", "ww", "wx", "wy", "wz", "xa", "xb", "xc", "xd", "xe", "xf", "xg", "xh", "xi", "xj", "xk", "xl", "xm", "xn", "xo", "xp", "xq", "xr", "xs", "xt", "xu", "xv", "xw", "xx", "xy", "xz", "ya", "yb", "yc", "yd", "ye", "yf", "yg", "yh", "yi", "yj", "yk", "yl", "ym", "yn", "yo", "yp", "yq", "yr", "ys", "yt", "yu", "yv", "yw", "yx", "yy", "yz", "za", "zb", "zc", "zd", "ze", "zf", "zg", "zh", "zi", "zj", "zk", "zl", "zm", "zn", "zo", "zp", "zq", "zr", "zs", "zt", "zu", "zv", "zw", "zx", "zy", "zz", "aaa", "aab", "aac", "aad", "aae", "aaf", "aag", "aah", "aai", "aaj", "aak", "aal", "aam", "aan", "aao", "aap", "aaq", "aar", "aas", "aat", "aau", "aav", "aaw", "aax", "aay", "aaz", "aba", "abb", "abc", "abd", "abe", "abf", "abg", "abh", "abi", "abj", "abk", "abl", "abm", "abn", "abo", "abp", "abq", "abr", "abs", "abt", "abu", "abv", "abw", "abx", "aby", "abz", "aca", "acb", "acc", "acd", "ace", "acf", "acg", "ach", "aci", "acj", "ack", "acl", "acm", "acn", "aco", "acp", "acq", "acr", "acs", "act", "acu", "acv", "acw", "acx", "acy", "acz", "ada", "adb", "adc", "add", "ade", "adf", "adg", "adh", "adi", "adj", "adk", "adl", "adm", "adn", "ado", "adp", "adq", "adr", "ads", "adt", "adu", "adv", "adw", "adx", "ady", "adz", "aea", "aeb", "aec", "aed", "aee", "aef", "aeg", "aeh", "aei", "aej", "aek", "ael", "aem", "aen", "aeo", "aep", "aeq", "aer", "aes", "aet", "aeu", "aev", "aew", "aex", "aey", "aez", "afa", "afb", "afc", "afd", "afe", "aff", "afg", "afh", "afi", "afj", "afk", "afl", "afm", "afn", "afo", "afp", "afq", "afr", "afs", "aft", "afu", "afv", "afw", "afx", "afy", "afz", "aga", "agb", "agc", "agd", "age", "agf", "agg", "agh", "agi", "agj", "agk", "agl", "agm", "agn", "ago", "agp", "agq", "agr", "ags", "agt", "agu", "agv", "agw", "agx", "agy", "agz", "aha", "ahb", "ahc", "ahd", "ahe", "ahf", "ahg", "ahh", "ahi", "ahj", "ahk", "ahl", 
"ahm", "ahn", "aho", "ahp", "ahq", "ahr", "ahs", "aht", "ahu", "ahv", "ahw", "ahx", "ahy", "ahz", "aia", "aib", "aic", "aid", "aie", "aif", "aig", "aih", "aii", "aij", "aik", "ail", "aim", "ain", "aio", "aip", "aiq", "air", "ais", "ait", "aiu", "aiv", "aiw", "aix", "aiy", "aiz", "aja", "ajb", "ajc", "ajd", "aje", "ajf", "ajg", "ajh", "aji", "ajj", "ajk", "ajl", "ajm", "ajn", "ajo", "ajp", "ajq", "ajr", "ajs", "ajt", "aju", "ajv", "ajw", "ajx", "ajy", "ajz", "aka", "akb", "akc", "akd", "ake", "akf", "akg", "akh", "aki", "akj", "akk", "akl", "akm", "akn", "ako", "akp", "akq", "akr", "aks", "akt", "aku", "akv", "akw", "akx", "aky", "akz", "ala", "alb", "alc", "ald", "ale", "alf", "alg", "alh", "ali", "alj", "alk", "all", }; lmbench-3.0-a9/src/par_mem.c0000664000076400007640000000341610715547567015565 0ustar staelinstaelin/* * par_mem.c - determine the memory hierarchy parallelism * * usage: par_mem [-L ] [-M len[K|M]] [-W ] [-N ] * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" void compute_times(struct mem_state* state, double* tlb_time, double* cache_time); /* * Assumptions: * * 1) Cache lines are a multiple of pointer-size words * 2) Cache lines are no larger than 1/8 of a page (typically 512 bytes) * 3) Pages are an even multiple of cache lines */ int main(int ac, char **av) { int i; int c; int warmup = 0; int repetitions = (1000000 <= get_enough(0) ? 1 : TRIES); size_t maxlen = 64 * 1024 * 1024; double par; struct mem_state state; char *usage = "[-L ] [-M len[K|M]] [-W ] [-N ]\n"; state.line = getpagesize() / 16; state.pagesize = getpagesize(); while (( c = getopt(ac, av, "L:M:W:N:")) != EOF) { switch(c) { case 'L': state.line = atoi(optarg); if (state.line < sizeof(char*)) state.line = sizeof(char*); break; case 'M': maxlen = bytes(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } for (i = MAX_MEM_PARALLELISM * state.line; i <= maxlen; i<<=1) { par = par_mem(i, warmup, repetitions, &state); if (par > 0.) { fprintf(stderr, "%.6f %.2f\n", i / (1000. * 1000.), par); } } exit(0); } lmbench-3.0-a9/src/par_ops.c0000664000076400007640000003176110715547567015614 0ustar staelinstaelin/* * par_ops.c - benchmark of simple operation parallelism * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" void initialize(iter_t iterations, void* cookie); #define FIVE(m) m m m m m #define TEN(m) FIVE(m) FIVE(m) #define FIFTY(m) TEN(m) TEN(m) TEN(m) TEN(m) TEN(m) #define HUNDRED(m) FIFTY(m) FIFTY(m) #define MAX_LOAD_PARALLELISM 16 struct _state { int N; int M; int K; int int_data[MAX_LOAD_PARALLELISM]; double double_data[MAX_LOAD_PARALLELISM]; }; double max_parallelism(benchmp_f* benchmarks, int warmup, int repetitions, void* cookie) { int i; double baseline, max_load_parallelism, load_parallelism; max_load_parallelism = 1.; for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { benchmp(initialize, benchmarks[i], NULL, 0, 1, warmup, repetitions, cookie); save_minimum(); if (gettime() == 0) return -1.; if (i == 0) { baseline = (double)gettime() / (double)get_n(); } else { load_parallelism = baseline; load_parallelism /= (double)gettime(); load_parallelism *= (double)((i + 1) * get_n()); if (load_parallelism > max_load_parallelism) { max_load_parallelism = load_parallelism; } } } return max_load_parallelism; } #define REPEAT_0(m) m(0) #define REPEAT_1(m) REPEAT_0(m) m(1) #define REPEAT_2(m) REPEAT_1(m) m(2) #define REPEAT_3(m) REPEAT_2(m) m(3) #define REPEAT_4(m) REPEAT_3(m) m(4) #define REPEAT_5(m) REPEAT_4(m) m(5) #define REPEAT_6(m) REPEAT_5(m) m(6) #define REPEAT_7(m) REPEAT_6(m) m(7) #define REPEAT_8(m) REPEAT_7(m) m(8) #define REPEAT_9(m) REPEAT_8(m) m(9) #define REPEAT_10(m) REPEAT_9(m) m(10) #define REPEAT_11(m) REPEAT_10(m) m(11) #define REPEAT_12(m) REPEAT_11(m) m(12) #define REPEAT_13(m) REPEAT_12(m) m(13) #define REPEAT_14(m) REPEAT_13(m) m(14) #define REPEAT_15(m) REPEAT_14(m) m(15) #define BENCHMARK(benchmark,N,repeat) \ void benchmark##_##N(iter_t iterations, void *cookie) \ { \ register iter_t i = iterations; \ struct _state* state = (struct _state*)cookie; \ repeat(DECLARE); \ \ repeat(INIT); \ while (i-- > 0) { \ repeat(PREAMBLE); \ TEN(repeat(BODY)); \ } \ \ repeat(SAVE); \ } #define PARALLEL_BENCHMARKS(benchmark) \ BENCHMARK(benchmark, 0, REPEAT_0) \ BENCHMARK(benchmark, 1, REPEAT_1) \ BENCHMARK(benchmark, 2, REPEAT_2) \ BENCHMARK(benchmark, 3, REPEAT_3) \ BENCHMARK(benchmark, 4, REPEAT_4) \ BENCHMARK(benchmark, 5, REPEAT_5) \ BENCHMARK(benchmark, 6, REPEAT_6) \ BENCHMARK(benchmark, 7, REPEAT_7) \ BENCHMARK(benchmark, 8, REPEAT_8) \ BENCHMARK(benchmark, 9, REPEAT_9) \ BENCHMARK(benchmark, 10, REPEAT_10) \ BENCHMARK(benchmark, 11, REPEAT_11) \ BENCHMARK(benchmark, 12, REPEAT_12) \ BENCHMARK(benchmark, 13, REPEAT_13) \ BENCHMARK(benchmark, 14, REPEAT_14) \ BENCHMARK(benchmark, 15, REPEAT_15) \ \ benchmp_f benchmark##_benchmarks[] = { \ benchmark##_0, \ benchmark##_1, \ benchmark##_2, \ benchmark##_3, \ benchmark##_4, \ benchmark##_5, \ benchmark##_6, \ benchmark##_7, \ benchmark##_8, \ benchmark##_9, \ benchmark##_10, \ benchmark##_11, \ benchmark##_12, \ benchmark##_13, \ benchmark##_14, \ benchmark##_15 \ }; #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N ^= s##N; s##N ^= r##N; r##N |= s##N; #define DECLARE(N) register int r##N, s##N; #define INIT(N) r##N = state->int_data[N] + 1; s##N = (N+1) + r##N; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(integer_bit) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) a##N += b##N; b##N -= a##N; #define DECLARE(N) register int a##N, b##N; #define INIT(N) a##N = state->int_data[N] + 57; \ b##N = state->int_data[N] + 31; #define PREAMBLE(N) #define SAVE(N) use_int(a##N + b##N); 
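/*
 * For orientation (illustrative, based on the macros above): the line
 * PARALLEL_BENCHMARKS(integer_add) below generates integer_add_0()
 * through integer_add_15(), where e.g. integer_add_2() runs ten copies
 * of "a0 += b0; b0 -= a0; a1 += b1; b1 -= a1; a2 += b2; b2 -= a2;" per
 * loop iteration, i.e. three independent dependency chains that the
 * processor may issue in parallel.  max_parallelism() then reports
 * roughly (k * time-per-iteration of the 1-chain loop) divided by the
 * time-per-iteration of the k-chain loop, maximized over k.
 */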
PARALLEL_BENCHMARKS(integer_add) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N *= s##N; #define DECLARE(N) register int r##N, s##N, t##N; #define INIT(N) r##N = state->int_data[N] - N + 1 + 37431; \ s##N = state->int_data[N] - N + 1 + 4; \ t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ s##N * s##N * s##N * s##N * s##N - r##N; \ r##N += t##N; #define PREAMBLE(N) r##N -= t##N; #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(integer_mul) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = (s##N / r##N); #define DECLARE(N) register int r##N, s##N; #define INIT(N) r##N = state->int_data[N] - N + 1 + 36; \ s##N = (r##N + 1) << 20; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(integer_div) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N %= s##N; r##N |= s##N; #define DECLARE(N) register int r##N, s##N; #define INIT(N) r##N = state->int_data[N] - N + 1 + iterations; \ s##N = state->int_data[N] - N + 1 + 62; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(integer_mod) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N ^= i##N; s##N ^= r##N; r##N |= s##N; #define DECLARE(N) register int64 r##N, s##N, i##N; #define INIT(N) r##N = state->int_data[N] - N + 1; \ r##N |= r##N << 32; \ s##N = iterations + state->int_data[N] - N + 1; \ s##N |= s##N << 32; \ i##N = (s##N << 2) - (int64)1; #define PREAMBLE(N) i##N -= 1; #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(int64_bit) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) a##N += b##N; b##N -= a##N; #define DECLARE(N) register int64 a##N, b##N; #define INIT(N) a##N = state->int_data[N] - N + 1 + 37420; \ a##N += (int64)(0xFE + state->int_data[N] - N + 1)<<30; \ b##N = state->int_data[N] - N + 1 + 21698324; \ b##N += (int64)(0xFFFE + state->int_data[N] - N + 1)<<29; #define PREAMBLE(N) #define SAVE(N) use_int((int)a##N + (int)b##N); PARALLEL_BENCHMARKS(int64_add) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = (r##N * s##N); #define DECLARE(N) register int64 r##N, s##N, t##N; #define INIT(N) r##N = state->int_data[N] - N + 1 + 37420; \ r##N += (int64)(state->int_data[N] - N + 1 + 6)<<32; \ s##N = state->int_data[N] - N + 1 + 4; \ t##N = r##N * s##N * s##N * s##N * s##N * s##N * \ s##N * s##N * s##N * s##N * s##N - r##N; \ r##N += t##N; #define PREAMBLE(N) r##N -= t##N; #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(int64_mul) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = (s##N / r##N); #define DECLARE(N) register int64 r##N, s##N; #define INIT(N) r##N = state->int_data[N] - N + 37; \ r##N += r##N << 33; \ s##N = (r##N + 17) << 13; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(int64_div) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = (s##N % r##N) ^ r##N; #define DECLARE(N) register int64 r##N, s##N; #define INIT(N) r##N = (int64)state->int_data[N]; s##N = 0; #define PREAMBLE(N) s##N++; #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(int64_mod) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N += r##N; #define DECLARE(N) register float r##N, s##N; #define INIT(N) r##N = (float)state->double_data[N] + 1023.0; \ s##N = (float)state->K; #define PREAMBLE(N) r##N += s##N; #define SAVE(N) 
use_int((int)r##N); PARALLEL_BENCHMARKS(float_add) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N *= r##N; r##N *= s##N; #define DECLARE(N) register float r##N, s##N; #define INIT(N) r##N = 8.0f * (float)state->double_data[N]; \ s##N = 0.125 * (float)state->M * state->double_data[N] / 1000.0; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); use_int((int)s##N); PARALLEL_BENCHMARKS(float_mul) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = s##N / r##N; #define DECLARE(N) register float r##N, s##N; #define INIT(N) r##N = 1.41421356f * (float)state->double_data[N]; \ s##N = 3.14159265f * (float)(state->int_data[N] - N + 1); #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); use_int((int)s##N); PARALLEL_BENCHMARKS(float_div) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N += r##N; #define DECLARE(N) register double r##N, s##N; #define INIT(N) r##N = state->double_data[N] + 1023.; \ s##N = (double)state->K; #define PREAMBLE(N) r##N += s##N; #define SAVE(N) use_int((int)r##N); PARALLEL_BENCHMARKS(double_add) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N *= r##N; r##N *= s##N; #define DECLARE(N) register double r##N, s##N; #define INIT(N) r##N = 8.0f * state->double_data[N]; \ s##N = 0.125 * (double)state->M * state->double_data[N] / 1000.0; #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); use_int((int)s##N); PARALLEL_BENCHMARKS(double_mul) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE #define BODY(N) r##N = s##N / r##N; #define DECLARE(N) register double r##N, s##N; #define INIT(N) r##N = 1.41421356 * state->double_data[N]; \ s##N = 3.14159265 * (double)(state->int_data[N] - N + 1); #define PREAMBLE(N) #define SAVE(N) use_int((int)r##N); use_int((int)s##N); PARALLEL_BENCHMARKS(double_div) #undef BODY #undef DECLARE #undef INIT #undef PREAMBLE #undef SAVE void initialize(iter_t iterations, void* cookie) { struct _state *state = (struct _state*)cookie; register int i; if (iterations) return; for (i = 0; i < MAX_LOAD_PARALLELISM; ++i) { state->int_data[i] = i+1; state->double_data[i] = 1.; } } int main(int ac, char **av) { int c; int warmup = 0; int repetitions = (1000000 <= get_enough(0) ? 1 : TRIES); double par; struct _state state; char *usage = "[-W ] [-N ]\n"; state.N = 1; state.M = 1000; state.K = -1023; while (( c = getopt(ac, av, "W:N:")) != EOF) { switch(c) { case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } par = max_parallelism(integer_bit_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "integer bit parallelism: %.2f\n", par); par = max_parallelism(integer_add_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "integer add parallelism: %.2f\n", par); par = max_parallelism(integer_mul_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "integer mul parallelism: %.2f\n", par); par = max_parallelism(integer_div_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "integer div parallelism: %.2f\n", par); par = max_parallelism(integer_mod_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "integer mod parallelism: %.2f\n", par); par = max_parallelism(int64_bit_benchmarks, warmup, repetitions, &state); if (par > 0.) 
fprintf(stderr, "int64 bit parallelism: %.2f\n", par); par = max_parallelism(int64_add_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "int64 add parallelism: %.2f\n", par); par = max_parallelism(int64_mul_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "int64 mul parallelism: %.2f\n", par); par = max_parallelism(int64_div_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "int64 div parallelism: %.2f\n", par); par = max_parallelism(int64_mod_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "int64 mod parallelism: %.2f\n", par); par = max_parallelism(float_add_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "float add parallelism: %.2f\n", par); par = max_parallelism(float_mul_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "float mul parallelism: %.2f\n", par); par = max_parallelism(float_div_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "float div parallelism: %.2f\n", par); par = max_parallelism(double_add_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "double add parallelism: %.2f\n", par); par = max_parallelism(double_mul_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "double mul parallelism: %.2f\n", par); par = max_parallelism(double_div_benchmarks, warmup, repetitions, &state); if (par > 0.) fprintf(stderr, "double div parallelism: %.2f\n", par); return(0); } lmbench-3.0-a9/src/rhttp.c0000664000076400007640000000527007045412511015264 0ustar staelinstaelin/* * rhttp.c - simple HTTP transaction latency test * * usage: rhttp hostname [port] remote-clients -p file file * * This turns into a bunch of * rsh remote http hostname file file file [port] * with the results aggragated and reported. * * The program "http" must be in your path on the remote machine. * * XXX - the way this should work is like so: * parent process reading file names from stdin * multiple child processes connected to the parent process * while more file names * wait for a child process to be idle * feed it ~10 filenames * the child processes need to be able to tell the parent that they * want more work. They also need to pass back the results. * * Copyright (c) 1994-1997 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Silicon Graphics is gratefully acknowledged. 
*/ char *id = "$Id$\n"; #include "bench.h" int main(int ac, char **av) { char *name = av[0], *server, *prog; int i, j; uint64 total = 0; uint64 usecs = 0; char *args[1024]; if (ac < 5) { usage: fprintf(stderr, "Usage: %s hostname [port] remote-clients -p file ...\n", name); exit(1); } server = av[1]; av++, ac--; /* eat server */ if (atoi(av[1]) != 0) { prog = av[1]; av++, ac--; /* eat port */ } else { prog = "80"; /* http */ } for (i = 1; i < ac; ++i) { if (!strcmp("-p", av[i])) { i++; break; } } args[0] = "rsh"; args[2] = "http"; args[3] = server; j = 4; while (i < ac) { args[j++] = av[i++]; } args[j++] = prog; args[j] = 0; for (i = 1; i < ac; ++i) { if (!strcmp("-p", av[i])) { break; } args[1] = av[i]; for (j = 0; args[j]; j++) { printf("%s ", args[j]); } printf("\n"); if (fork() == 0) { char name[30]; sprintf(name, "/tmp/rhttp%d", i); creat(name, 0666); close(2); dup(1); execvp(args[0], args); perror(args[0]); exit(1); } } for (i = 1; i < ac; ++i) { if (!strcmp("-p", av[i])) { break; } wait(0); } system("cat /tmp/rhttp*; rm /tmp/rhttp*"); exit(1); for (i = 1; i < ac; ++i) { int fd, n, m = 0; float f1 = 0, f2 = 0; char buf[30]; if (!strcmp("-p", av[i])) { break; } sprintf(buf, "/tmp/http%d", i); fd = open(buf, 0); unlink(buf); /* * Avg xfer: 3.9KB, 235.0KB in 2038 millisecs, 115.31 KB/sec */ n = read(fd, buf, XFERSIZE); buf[n] = 0; sscanf(buf, "Avg xfer: %fKB, %fKB in %d millisecs,", &f1, &f2, &m); if (m > usecs) { usecs = m; } total += f2; } total <<= 10; usecs *= 1000; settime(usecs); latency((uint64)1, total); } lmbench-3.0-a9/src/seek.c0000664000076400007640000000203410450256150015044 0ustar staelinstaelinchar *id = "$Id$\n"; /* * Seek - calculate seeks as a function of distance. * * Usage: seek file size * * Copyright (c) 1994,1995,1996 Larry McVoy. All rights reserved. */ #include "bench.h" #define STRIDE 1024*1024 main(ac, av) int ac; char *av[]; { char buf[512]; int disk; off64_t size; off64_t begin, end; int usecs; if (ac != 3) { exit(1); } if ((disk = open(av[1], 0)) == -1) { exit(1); } size = bytes(av[2]); /* * We flip back and forth, in strides of 1MB. * If we have a 100MB disk, that means we do * 1, 99, 2, 98, etc. */ end = size; begin = 0; seekto(disk, begin, SEEK_SET); read(disk, buf, sizeof(buf)); while (end > begin) { end -= STRIDE; start(); seekto(disk, end, SEEK_SET); read(disk, buf, sizeof(buf)); usecs = stop(); printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); begin += STRIDE; start(); seekto(disk, begin, SEEK_SET); read(disk, buf, sizeof(buf)); usecs = stop(); printf("%.04f %.04f\n", (end - begin) / 1000000., usecs/1000.); } exit(0); } lmbench-3.0-a9/src/stats.h0000664000076400007640000000400407124166502015264 0ustar staelinstaelin#ifndef _STATS_H #define _STATS_H #include "bench.h" #include "timing.h" #define ABS(x) ((x) < 0 ? 
-(x) : (x)) int int_compare(const void *a, const void *b); int uint64_compare(const void *a, const void *b); int double_compare(const void *a, const void *b); typedef int (*int_stat)(int *values, int size); typedef uint64 (*uint64_stat)(uint64 *values, int size); typedef double (*double_stat)(double *values, int size); int int_median(int *values, int size); uint64 uint64_median(uint64 *values, int size); double double_median(double *values, int size); int int_mean(int *values, int size); uint64 uint64_mean(uint64 *values, int size); double double_mean(double *values, int size); int int_min(int *values, int size); uint64 uint64_min(uint64 *values, int size); double double_min(double *values, int size); int int_max(int *values, int size); uint64 uint64_max(uint64 *values, int size); double double_max(double *values, int size); double int_variance(int *values, int size); double uint64_variance(uint64 *values, int size); double double_variance(double *values, int size); double int_moment(int moment, int *values, int size); double uint64_moment(int moment, uint64 *values, int size); double double_moment(int moment, double *values, int size); double int_stderr(int *values, int size); double uint64_stderr(uint64 *values, int size); double double_stderr(double *values, int size); double int_skew(int *values, int size); double uint64_skew(uint64 *values, int size); double double_skew(double *values, int size); double int_kurtosis(int *values, int size); double uint64_kurtosis(uint64 *values, int size); double double_kurtosis(double *values, int size); double int_bootstrap_stderr(int *values, int size, int_stat f); double uint64_bootstrap_stderr(uint64 *values, int size, uint64_stat f); double double_bootstrap_stderr(double *values, int size, double_stat f); void regression(double *x, double *y, double *sig, int n, double *a, double *b, double *sig_a, double *sig_b, double *chi2); #endif /* _STATS_H */ lmbench-3.0-a9/src/stream.c0000664000076400007640000001556310721052253015422 0ustar staelinstaelin/* * steam.c - lmbench version of John McCalpin's STREAM benchmark * * usage: stream * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. 
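 *
 * Reported bandwidths count the words each kernel touches per element,
 * matching the mb() calls in main() below: 2 for copy and scale and 3
 * for add and triad in version 1; 1 for fill and sum, 2 for copy and 3
 * for daxpy in version 2.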
*/ char *id = "$Id$\n"; #include "bench.h" struct _state { double* a; double* b; double* c; double scalar; int len; }; void initialize(iter_t iterations, void* cookie); void cleanup(iter_t iterations, void* cookie); /* These are from STREAM version 1 */ void copy(iter_t iterations, void* cookie); void scale(iter_t iterations, void* cookie); void add(iter_t iterations, void* cookie); void triad(iter_t iterations, void* cookie); /* These are from STREAM version 2 */ void fill(iter_t iterations, void* cookie); /* NOTE: copy is the same as in version 1 */ void daxpy(iter_t iterations, void* cookie); void sum(iter_t iterations, void* cookie); /* * Assumptions: * * 1) Cache lines are a multiple of pointer-size words * 2) Cache lines are no larger than 1/4 a page size * 3) Pages are an even multiple of cache lines */ int main(int ac, char **av) { int version = 1; int parallel = 1; int warmup = 0; int repetitions = -1; int c; uint64 datasize; struct _state state; char *p; char *usage = "[-v ] [-M [K|M]] [-P ] [-W ] [-N ]\n"; state.len = 1000 * 1000 * 3 * sizeof(double); state.scalar = 3.0; while (( c = getopt(ac, av, "v:M:P:W:N:")) != EOF) { switch(c) { case 'v': version = atoi(optarg); if (version != 1 && version != 2) lmbench_usage(ac, av, usage); break; case 'P': parallel = atoi(optarg); if (parallel <= 0) lmbench_usage(ac, av, usage); break; case 'M': state.len = bytes(optarg); break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } /* ensure that we can malloc the desired space */ while (!(p = malloc(state.len))) state.len /= 2; free(p); /* convert from bytes to array length */ state.len /= 3 * sizeof(double); datasize = sizeof(double) * state.len * parallel; if (version == 1) { benchmp(initialize, copy, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM copy latency", state.len * get_n()); fprintf(stderr, "STREAM copy bandwidth: "); mb(2 * datasize * get_n()); } benchmp(initialize, scale, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM scale latency", state.len * get_n()); fprintf(stderr, "STREAM scale bandwidth: "); mb(2 * datasize * get_n()); } benchmp(initialize, add, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM add latency", state.len * get_n()); fprintf(stderr, "STREAM add bandwidth: "); mb(3 * datasize * get_n()); } benchmp(initialize, triad, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM triad latency", state.len * get_n()); fprintf(stderr, "STREAM triad bandwidth: "); mb(3 * datasize * get_n()); } } else { benchmp(initialize, fill, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM2 fill latency", state.len * get_n()); fprintf(stderr, "STREAM2 fill bandwidth: "); mb(datasize * get_n()); } benchmp(initialize, copy, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM2 copy latency", state.len * get_n()); fprintf(stderr, "STREAM2 copy bandwidth: "); mb(2 * datasize * get_n()); } benchmp(initialize, daxpy, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM2 daxpy latency", state.len * get_n()); fprintf(stderr, 
"STREAM2 daxpy bandwidth: "); mb(3 * datasize * get_n()); } benchmp(initialize, sum, cleanup, 0, parallel, warmup, repetitions, &state); if (gettime() > 0) { if (parallel <= 1) save_minimum(); nano("STREAM2 sum latency", state.len * get_n()); fprintf(stderr, "STREAM2 sum bandwidth: "); mb(datasize * get_n()); } } return(0); } void initialize(iter_t iterations, void* cookie) { int i; struct _state* state = (struct _state*)cookie; if (iterations) return; state->a = (double*)malloc(sizeof(double) * state->len); state->b = (double*)malloc(sizeof(double) * state->len); state->c = (double*)malloc(sizeof(double) * state->len); if (state->a == NULL || state->b == NULL || state->c == NULL) { exit(1); } for (i = 0; i < state->len; ++i) { state->a[i] = 1.; state->b[i] = 2.; state->c[i] = 0.; } } #define BODY(expr) \ { \ register int i; \ register int N = state->len; \ register double* a = state->a; \ register double* b = state->b; \ register double* c = state->c; \ register double scalar = state->scalar; \ \ state->a = state->b; \ state->b = state->c; \ state->c = a; \ \ for (i = 0; i < N; ++i) { \ expr; \ } \ } void copy(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(c[i] = a[i];) } } void scale(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(b[i] = scalar * c[i];) } } void add(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(c[i] = a[i] + b[i];) } } void triad(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(a[i] = b[i] + scalar * c[i];) } } /* * STREAM version 2 benchmark kernels * * NOTE: copy is the same as version 1's benchmark */ void fill(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(a[i] = 0;) } } void daxpy(iter_t iterations, void *cookie) { struct _state* state = (struct _state*)cookie; while (iterations-- > 0) { BODY(a[i] = a[i] + scalar * b[i];) } } void sum(iter_t iterations, void *cookie) { register double s; struct _state* state = (struct _state*)cookie; s = 0.0; while (iterations-- > 0) { BODY(s += a[i];) } use_int((int)s); } void cleanup(iter_t iterations, void* cookie) { struct _state* state = (struct _state*)cookie; if (iterations) return; free(state->a); free(state->b); free(state->c); } lmbench-3.0-a9/src/timing.h0000664000076400007640000000247310432651725015427 0ustar staelinstaelin/* * $Id$ */ #ifndef _TIMING_H #define _TIMING_H char *p64(uint64 big); char *p64sz(uint64 big); double Delta(void); double Now(void); void adjust(int usec); void bandwidth(uint64 bytes, uint64 times, int verbose); uint64 bytes(char *s); void context(uint64 xfers); uint64 delta(void); int get_enough(int); uint64 get_n(void); void kb(uint64 bytes); double l_overhead(void); char last(char *s); void latency(uint64 xfers, uint64 size); void mb(uint64 bytes); void micro(char *s, uint64 n); void micromb(uint64 mb, uint64 n); void milli(char *s, uint64 n); void morefds(void); void nano(char *s, uint64 n); uint64 now(void); void ptime(uint64 n); void rusage(void); void save_n(uint64); void settime(uint64 usecs); void start(struct timeval *tv); uint64 stop(struct timeval *begin, struct timeval *end); uint64 t_overhead(void); double timespent(void); void timing(FILE *out); uint64 tvdelta(struct timeval *, struct timeval *); void tvsub(struct timeval *tdiff, struct timeval *t1, struct 
timeval *t0); void use_int(int result); void use_pointer(void *result); uint64 usecs_spent(void); void touch(char *buf, size_t size); size_t* permutation(size_t max, size_t scale); int cp(char* src, char* dst, mode_t mode); long bread(void* src, long count); #if defined(hpux) || defined(__hpux) int getpagesize(); #endif #endif /* _TIMING_H */ lmbench-3.0-a9/src/timing_o.c0000664000076400007640000000021107045412511015716 0ustar staelinstaelin#include #include "bench.h" int main() { putenv("LOOP_O=0.0"); printf("%lu\n", (unsigned long)t_overhead()); return (0); } lmbench-3.0-a9/src/tlb.c0000664000076400007640000001054010715547567014722 0ustar staelinstaelin/* * tlb.c - guess the cache line size * * usage: tlb [-c] [-L ] [-M len[K|M]] [-W ] [-N ] * * Copyright (c) 2000 Carl Staelin. * Copyright (c) 1994 Larry McVoy. Distributed under the FSF GPL with * additional restriction that results may published only if * (1) the benchmark is unmodified, and * (2) the version in the sccsid below is included in the report. * Support for this development by Sun Microsystems is gratefully acknowledged. */ char *id = "$Id$\n"; #include "bench.h" int find_tlb(int start, int maxpages, int warmup, int repetitions, double* tlb_time, double* cache_time, struct mem_state* state); void compute_times(int pages, int warmup, int repetitions, double* tlb_time, double* cache_time, struct mem_state* state); #define THRESHOLD 1.15 /* * Assumptions: * * 1) Cache lines are a multiple of pointer-size words * 2) Cache lines no larger than 1/8 a page size * 3) Pages are an even multiple of cache lines */ int main(int ac, char **av) { int tlb, maxpages; int c; int print_cost = 0; int warmup = 0; int repetitions = (1000000 <= get_enough(0) ? 1 : TRIES); double tlb_time, cache_time; struct mem_state state; char *usage = "[-c] [-L ] [-M len[K|M]] [-W ] [-N ]\n"; maxpages = 16 * 1024; state.width = 1; state.pagesize = getpagesize(); state.line = sizeof(char*); tlb = 2; while (( c = getopt(ac, av, "cL:M:W:N:")) != EOF) { switch(c) { case 'c': print_cost = 1; break; case 'L': state.line = atoi(optarg); break; case 'M': maxpages = bytes(optarg); /* max in bytes */ maxpages /= getpagesize(); /* max in pages */ break; case 'W': warmup = atoi(optarg); break; case 'N': repetitions = atoi(optarg); break; default: lmbench_usage(ac, av, usage); break; } } /* assumption: no TLB will have less than 16 entries */ tlb = find_tlb(8, maxpages, warmup, repetitions, &tlb_time, &cache_time, &state); if (tlb > 0) { if (print_cost) { compute_times(tlb * 2, warmup, repetitions, &tlb_time, &cache_time, &state); fprintf(stderr, "tlb: %d pages %.5f nanoseconds\n", tlb, tlb_time - cache_time); } else { fprintf(stderr, "tlb: %d pages\n", tlb); } } /* for (i = tlb<<1; i <= maxpages; i<<=1) { compute_times(i, warmup, repetitions, &tlb_time, &cache_time, &state); } /**/ return(0); } int find_tlb(int start, int maxpages, int warmup, int repetitions, double* tlb_time, double* cache_time, struct mem_state* state) { int i, lower, upper; for (i = start; i <= maxpages; i<<=1) { compute_times(i, warmup, repetitions, tlb_time, cache_time, state); if (*tlb_time / *cache_time > THRESHOLD) { lower = i>>1; upper = i; i = lower + (upper - lower) / 2; break; } } /* we can't find any tlb effect */ if (i >= maxpages) { state->len = 0; return (0); } /* use a binary search to locate point at which TLB effects start */ while (lower + 1 < upper) { compute_times(i, warmup, repetitions, tlb_time, cache_time, state); if (*tlb_time / *cache_time > THRESHOLD) { upper = i; } else { 
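/* ratio still below THRESHOLD: no TLB pressure at this page count, so move the lower bound up */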
			lower = i;
		}
		i = lower + (upper - lower) / 2;
	}
	return (lower);
}

void
compute_times(int pages, int warmup, int repetitions,
	      double* tlb_time, double* cache_time, struct mem_state* state)
{
	int	i;
	result_t tlb_results, cache_results, *r_save;

	r_save = get_results();
	insertinit(&tlb_results);
	insertinit(&cache_results);

	state->len = pages * state->pagesize;
	state->maxlen = pages * state->pagesize;
	tlb_initialize(0, state);
	if (state->initialized) {
		for (i = 0; i < TRIES; ++i) {
			BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0);
			insertsort(gettime(), get_n(), &tlb_results);
		}
	}
	tlb_cleanup(0, state);

	state->len = pages * state->line;
	state->maxlen = pages * state->line;
	mem_initialize(0, state);
	if (state->initialized) {
		for (i = 0; i < TRIES; ++i) {
			BENCH1(mem_benchmark_0(__n, state); __n = 1;, 0);
			insertsort(gettime(), get_n(), &cache_results);
		}
	}
	mem_cleanup(0, state);

	/* We want nanoseconds / load. */
	set_results(&tlb_results);
	*tlb_time = (1000. * (double)gettime()) / (100. * (double)get_n());

	/* We want nanoseconds / load. */
	set_results(&cache_results);
	*cache_time = (1000. * (double)gettime()) / (100. * (double)get_n());
	set_results(r_save);

	/*
	fprintf(stderr, "%d %.5f %.5f\n", pages, *tlb_time, *cache_time);
	/**/
}
lmbench-3.0-a9/src/version.h0000664000076400007640000000011110723011605015606 0ustar staelinstaelin#define	MAJOR	3
#define	MINOR	-9	/* negative is alpha, it "increases" */
lmbench-3.0-a9/src/webpage-lm.tar0000664000076400007640000017000007274037675016513 0ustar staelinstaelinwebpage-lm/ 40775 6732 143 0 6426334120 11170 5ustar lmnobody
webpage-lm/pictures/ 40775 6732 143 0 6426333303 13030 5ustar lmnobody
[binary image data omitted: webpage-lm/pictures/me-small.jpg and webpage-lm/gifs/*.gif]
webpage-lm/index.html100664 6732 143 25005 6426333302 13305 0ustar lmnobody

Larry McVoy's home page

Notice: I'm moving to a new job. New email is lm@who.net.

Who am I?

I'm an engineer for Silicon Graphics, working in the networking group. I spend most of my time waving my hands and convincing other people they want to work on stuff that I think is important. The name server is an example, I got John Schimmel to work on that. I'm constantly trying to figure out how to make things go fast, which is why I wrote the lmbench benchmark suite. Lmbench measures the basic building blocks of a computer system. Occasionally, I have to do real work, like the BDS stuff mentioned below.

I live in San Francisco and divide my time there between my girlfriend, woodworking, playing pool, and riding motorcycles.

Current stuff I'm working on (slides)

Papers I've written

Personal stuff (lots of pictures)

Me, my relatives, friends, etc.

  • Me and my nephew Jacob at Ocean Beach in San Francisco. He was about 2 years old and still hadn't hit the terrible twos, I think his Mom must have done a good job. Here he is with his Mom about 7 months pregnant. The next one turned out to be a boy named Zeke.
  • My brother Chris trying to look smart.
  • I used to be even more crazy than I am now; here's a picture of me doing some stupid rollerblading tricks.
  • My favorite picture of me.
  • I work at home a lot and this is what that is like. My cat was pretty sick in that picture, but I nursed her back to the land of the living.
  • Me studying.
  • A really old picture of me in Mexico, with really long hair juggling.

My cats

  • I like cats and I have had two over the last 18 years (whew) or so. Here's Zoey after she's had a few. Looks possessed, doesn't she? Here's a better picture of her. Until she died around Christmas of 1994, she had outlasted all of my girlfriends - I had her for almost 14 years. I still miss her and sometimes look for her when I go into the kitchen - it's weird to think she's gone. I eventually decided not to mope over her forever and went and found Mama cat at the pound. That's Linux running on the PC next to her, she fixes a lot of mouse driver bugs. Here's another picture of Mama cat on the workbench. And one more of her in my van - she likes to travel, no kidding. One last shot of her.
  • November '96: Mama cat is missing. We're still looking for her, but it has been two weeks and it isn't looking very hopeful.
  • January '97: Mama cat is still missing. I go to the pound about once a week with no luck. It sucks.

Fishing

  • I like to fly fish (yeah, I tie my own, ooh, wow) and I took a trip with my friend John Weitz. John is a hot shot photographer and here he is at work. Here's John catching a trout in the Trinity Alps. This is me fishing in the upper Sacramento River. John was taking some shots of a cool old shed, so I took one too. Here's a shot that John took of me sitting in the doorway of that shed (warning: it's ~60Kb).
  • This is the ultimate in fishing tall tails, except I have pictures to prove it happened. I was fishing in Canada and thought I had hooked some weeds. I was reeling it in when all at once it took off. Funny sort of fish, it felt weird. When I got closer, I saw that I had two fish - a little one that had hit the lure, and a big Northern Pike that had hit the little pike. I thought for sure he would let go when he saw me, but I guess he was hungry, because I picked him up. Pretty wild, huh?

Wilderness

  • I like to backpack a lot and I have some friends that go with me. Here's Neil with his dog Elvis and here they are again hard at work.
  • Me cross country skiing in the Sierra back country. It was a weekend trip to Ostrander Hut/Lake (cool place). I think that is Yosemite Valley in the background, doesn't that look like half dome to you? Here's the same view about 5 years earlier with my friends John G., Bernd N., and Andy A.
  • My Dad's Mad River canoe with a cover that my sister made (pretty cool cover, if you ask me, it kept us dry). We go canoeing in Canada quite a bit.

Woodworking

  • I am not just a computer nerd, I'm also a woodworking nerd, and I'm especially nerdy about hand planes. Many of those are a hundred years old, some are more than that ("they don't make 'em like they used to" definitely applies to tools). Here's my first effort at a real woodworking project, what else, a toolbox. Here's a view with the drawers open. The little box on top is a jewelry box (or whatever) I made for an old girlfriend. I live in San Francisco, in a flat, so my workshop is out on my back deck. That's a small jointer in the foreground and a table saw clamped to the rails in the background. It's a bit cramped, but it has a nice view. I finally decided to build a workbench. Here's the benchtop in the process of being hand planed flat (lotso shavings, huh?).
  • I do stuff on commission sometimes, this is my last girlfriend with a bookshelf I built for a friend at work. It was pretty simple since it was a first try, but he liked it. Here's another picture of the bookshelf.
  • Here I am proudly showing off a little TV cabinet made out of pine with some really interesting grain. That's the heartwood of the pine. Here's a closeup picture of the cabinet.
  • Because space is tight in San Francisco, I think my next project will be a tall, thin chest of drawers sort of like a lingerie chest, only sized for guy's clothes. It's about six feet tall by 18 inches square, which I think is about right. This was drawn in James Clark's implementation of pic, in the groff tool suite. Perverse, I know.
  • Here is a document on flattening hand planes, something that is frequently required for good performance.

Amusements

  • A song composed in my honor. No kidding. It's pretty cute but you might need to know a little about Sun's internal politics to completely get it.

  • A letter that Sun's lawyers recently sent. It's amazing how frigging self centered people can be. I got yer Java right here, buddy.

    A few days later, the net responds.

  • Here are a bunch of quotes that I either liked or were attributed to me. A lot of these are pretty nerdy engineer inside jokes, you've been warned.


Larry McVoy, lm@sgi.com

Page accesses since Wed Jun 26 1996: [Sorry, counter is a GIF image!]

webpage-lm/URLS100664 6732 143 370 6426334120 11755 0ustar lmnobody./pictures/me-small.jpg
./gifs/snow-bg2.jpg
./gifs/rib_bar_wh.gif
./gifs/spam-not.gif
./gifs/pookline.gif
./gifs/blueline
./gifs/eyes.gif
./gifs/eyesleft.gif
./gifs/new.gif
./gifs/line1.gif
./gifs/cclip3.gif
./gifs/sgi_logo.gif
./index.html
./URLS
lmbench-3.0-a9/ACKNOWLEDGEMENTS0000664000076400007640000000346510716240255015452 0ustar staelinstaelinLMbench was originally developed by Larry McVoy while he worked at Sun Microsystems.  Larry continued development while working at Silicon Graphics, and was joined by Carl Staelin, who works for Hewlett-Packard Laboratories.

LMbench would not be the successful cross-platform benchmark that it is today without the efforts and assistance of a wide range of people: from volunteers who run it on various hardware and report bugs, to managers who provide financial and other support, to peers and colleagues who request features or provide feedback on design elements.  All such help has been critical to making LMbench a success.

Below is a partial list of all those people who helped support the development of LMbench in one form or other, such as benchmark suggestions, bug reports, and so forth.  All omissions are accidental, and if your name was not included, please accept our humble apologies.

The people who have helped LMbench include, in alphabetic order:

Ralf Baechle, Christian Bau, Nelson H. F. Beebe, Anton Blanchard, Joel Berman, Paul Borrill, Ed Bradford, Len Brown, Robert G. Brown, Bruce Chapman, Tim Chen, Mark Culotta, Fred Douglis, Lars-Eke Eriksson, Josh Fisher, Marc Fleischmann, John Fort, Andy Glew, Achim Gratz, Richard Henderson, Rick Jones, Lev Iserovich, Michael A. Julier, Frans Kaashoek, Brad Knowles, Richard Littin, Bil Long, Udi Manber, John Mashey, David Miller, Dejan Milojicic, Ingo Molnar, David Mosberger, Will Newton, Satya Nishtala, Kevin Normoyle, Neal Nuckolls, Steve Piatz, Tim Prince, James Riden, Sam Roberts, Philip Roth, Chris Ruemmler, Olli Savia, Scott Schwartz, Wayne Scott, David Singleton, Mike Slifcak, Stephan Somogyi, Ratnakar Tiwari, Linus Torvalds, Dan Truong, Dirk Twiehaus, Duc Vianney, Ramya Vijay, Hai Vo-Ba, David T. Wang, Brian Whitney, David Wilson, Mitch Wright.
lmbench-3.0-a9/CHANGES0000664000076400007640000000715707313653256014174 0ustar staelinstaelinlmbench3-alpha1

Added new benchmark line, which determines the cache line size.

Added new benchmark tlb, which determines the effective TLB size.  Note that this may differ from the hardware TLB size due to OS TLB entries and super-pages.

Added new benchmark par_mem, which determines the possible speedup due to multiple memory reads progressing in parallel.  This number usually depends highly on the portion of the memory hierarchy being probed, with higher caches generally having greater parallelism.

Added new benchmark cache, which determines the number of caches, their sizes, latency, and available parallelism.  It also reports the latency and available parallelism for main memory.

Added new benchmark lat_ops, which attempts to determine the latency of basic operations, such as add, multiply and divide, for a variety of data types, such as int, int64, float and double.

Added new benchmark par_ops, which attempts to determine the available scaling of the various basic operations for various data types.

Added new benchmark stream, which reports memory bandwidth numbers using benchmark kernels from John McCalpin's STREAM and STREAM version 2 benchmarks.

Added new benchmark lat_sem, which reports SysV semaphore latency.

Added getopt() command line parsing to most benchmarks.

Added a new benchmark timing harness, benchmp(), which makes it relatively easy to design and build benchmarks which measure system performance under a fixed load.  It takes a few parameters:

- initialize: a function pointer.  If this is non-NULL the function is called in the child processes after the fork but before any benchmark-related work is done.  The function is passed a cookie from the benchmp() call.  This can be a pointer to a data structure which lets the function know what it needs to do.

- benchmark: a function pointer.  This function takes two parameters, an iteration count "iters", and a cookie.  The benchmarked activity must be run "iters" times (or some integer multiple of "iters").  This function must be idempotent; ie., the benchmark harness must be able to call it as many times as necessary.

- cleanup: a function pointer.  If this is non-NULL the function is called after all benchmarking is completed to cleanup any resources that may have been allocated.

- enough: If this is non-zero then it is the minimum amount of time, in micro-seconds, that the benchmark must be run to provide reliable results.  In most cases this is left to zero to allow the harness to autoscale the timing intervals to the system clock's resolution/accuracy.

- parallel: this is the number of child processes running the benchmark that should be run in parallel.  This is really the load factor.

- warmup: a time period in micro-seconds that each child process must run the benchmarked process before any timing intervals can begin.  This is to allow the system scheduler time to settle in a parallel/distributed system before we begin measurements.  (If so desired)

- repetitions: If non-zero this is the number of times we need to repeat each measurement.  The default is 11.

- cookie: An opaque value which can be used to pass information to the initialize(), benchmark(), and cleanup() routines.

This new harness is now used by: bw_file_rd, bw_mem, bw_mmap_rd, bw_pipe, bw_tcp, bw_unix, lat_connect, lat_ctx, lat_fcntl, lat_fifo, lat_mem_rd, lat_mmap, lat_ops, lat_pagefault, lat_pipe, lat_proc, lat_rpc, lat_select, lat_sem, lat_sig, lat_syscall, lat_tcp, lat_udp, lat_unix, lat_unix_connect, and stream.
lmbench-3.0-a9/COPYING0000664000076400007640000004307607045412511014231 0ustar staelinstaelin		    GNU GENERAL PUBLIC LICENSE
		       Version 2, June 1991

 Copyright (C) 1989, 1991 Free Software Foundation, Inc.
 675 Mass Ave, Cambridge, MA 02139, USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

			    Preamble

  The licenses for most software are designed to take away your freedom to share and change it.  By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users.  This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it.  (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.)  You can apply it to your programs, too.

  When we speak of free software, we are referring to freedom, not price.
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS Appendix: How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) 19yy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19yy name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. lmbench-3.0-a9/COPYING-20000664000076400007640000001314007130617074014363 0ustar staelinstaelin%M% %I% %E% The set of programs and documentation known as "lmbench" are distributed under the Free Software Foundation's General Public License with the following additional restrictions (which override any conflicting restrictions in the GPL): 1. You may not distribute results in any public forum, in any publication, or in any other way if you have modified the benchmarks. 2. You may not distribute the results for a fee of any kind. This includes web sites which generate revenue from advertising. If you have modifications or enhancements that you wish included in future versions, please mail those to me, Larry McVoy, at lm@bitmover.com. 
=========================================================================

Rationale for the publication restrictions:

In summary:

    a) LMbench is designed to measure enough of an OS that if you do well in all categories, you've covered latency and bandwidth in networking, disks, file systems, VM systems, and memory systems.

    b) Multiple times in the past people have wanted to report partial results.  Without exception, they were doing so to show a skewed view of whatever it was they were measuring (for example, one OS fit small processes into segments and used the segment register to switch them, getting good results, but did not want to report large process context switches because those didn't look as good).

    c) We insist that if you formally report LMbench results, you have to report all of them and make the raw results file easily available.  Reporting all of them means in that same publication, a pointer does not count.  Formally, in this context, means in a paper, on a web site, etc., but does not mean the exchange of results between OS developers who are tuning a particular subsystem.

We have a lot of history with benchmarking and feel strongly that there is little to be gained and a lot to be lost if we allowed the results to be published in isolation, without the complete story being told.

There has been a lot of discussion about this, with people not liking this restriction, more or less on the freedom principle as far as I can tell.  We're not swayed by that, our position is that we are doing the right thing for the OS community and will stick to our guns on this one.

It would be a different matter if there were 3 other competing benchmarking systems out there that did what LMbench does and didn't have the same reporting rules.  There aren't and as long as that is the case, I see no reason to change my mind and lots of reasons not to do so.  I'm sorry if I'm a pain in the ass on this topic, but I'm doing the right thing for you and the sooner people realize that the sooner we can get on to real work.

Operating system design is largely an art of balancing tradeoffs.  In many cases improving one part of the system has negative effects on other parts of the system.  The art is choosing which parts to optimize and which to not optimize.  Just like in computer architecture, you can optimize the common instructions (RISC) or the uncommon instructions (CISC), but in either case there is usually a cost to pay (in RISC uncommon instructions are more expensive than common instructions, and in CISC common instructions are more expensive than required).  The art lies in knowing which operations are important and optimizing those while minimizing the impact on the rest of the system.

Since lmbench gives a good overview of many important system features, users may see the performance of the system as a whole, and can see where tradeoffs may have been made.  This is the driving force behind the publication restriction: any idiot can optimize certain subsystems while completely destroying overall system performance.  If said idiot publishes *only* the numbers relating to the optimized subsystem, then the costs of the optimization are hidden and readers will mistakenly believe that the optimization is a good idea.  By including the publication restriction readers would be able to detect that the optimization improved the subsystem performance while damaging the rest of the system performance and would be able to make an informed decision as to the merits of the optimization.

Note that these restrictions only apply to *publications*.  We intend and encourage lmbench's use during design, development, and tweaking of systems and applications.  If you are tuning the linux or BSD TCP stack, then by all means, use the networking benchmarks to evaluate the performance effects of various modifications; Swap results with other developers; use the networking numbers in isolation.  The restrictions only kick in when you go to *publish* the results.

If you sped up the TCP stack by a factor of 2 and want to publish a paper with the various tweaks or algorithms used to accomplish this goal, then you can publish the networking numbers to show the improvement.  However, the paper *must* also include the rest of the standard lmbench numbers to show how your tweaks may (or may not) have impacted the rest of the system.  The full set of numbers may be included in an appendix, but they *must* be included in the paper.

This helps protect the community from adopting flawed technologies based on incomplete data.  It also helps protect the community from misleading marketing which tries to sell systems based on partial (skewed) lmbench performance results.  We have seen many cases in the past where partial or misleading benchmark results have caused great harm to the community, and we want to ensure that our benchmark is not used to perpetrate further harm and support false or misleading claims.
lmbench-3.0-a9/ChangeSet0000664000076400007640000000000010236067407014744 0ustar staelinstaelinlmbench-3.0-a9/Makefile0000664000076400007640000000307710306552274014634 0ustar staelinstaelin# Makefile for top level of lmbench
# $Id$
# Possible things to $(MAKE):
#
# build		(default) go to the source directory and build the benchmark
# results	go to the source directory and build and run the benchmark
# rerun		run the benchmark again
# see		see the results that came with this release
#		Go to the results directory and read the Makefile.
# doc.lpr	print the documentation
# doc.x		preview the documentation (needs X, groff, pic, etc)
# clean		go to the subdirs and $(MAKE) clean
# get		$(MAKE) sure all files are checked out
# shar		build a shippable shar archive

SHELL=/bin/sh

build:
	cd src && $(MAKE)

results: FRC
	cd src && $(MAKE) results

rerun:
	cd src && $(MAKE) rerun

see:
	cd results && $(MAKE) summary >summary.out 2>summary.errs
	cd results && $(MAKE) percent >percent.out 2>percent.errs

doc.lpr:
	cd doc && $(MAKE) PS && lpr *.PS

doc.x:
	cd doc && $(MAKE) x

clobber clean:
	for i in doc src results scripts; do \
		echo ===== $$i =====; \
		(cd $$i && $(MAKE) clean); \
	done
	/bin/rm -rf bin/*
	-bk clean

get:
	for i in doc src results scripts; do \
		echo ===== $$i =====; \
		(cd $$i && bk get -q); \
	done
	@co -q

info:
	for i in doc src results scripts; do \
		echo ===== $$i =====; \
		(cd $$i && info); \
	done

release: scripts/mkrelease
	scripts/mkrelease

scripts/mkrelease:
	cd scripts && co mkrelease

# XXX - . must be named lmbench for this to work
shar:
	$(MAKE) clean
	co -q Makefile
	$(MAKE) get
	cd .. && \
	find lmbench -type f -print | egrep -v 'noship|RCS' > /tmp/FILES
	cd .. && shar -S -a -n lmbench1.0 -L 50K < /tmp/FILES

FRC:
lmbench-3.0-a9/README0000664000076400007640000000106107045412511014052 0ustar staelinstaelinREADME for lmbench 2alpha8 net release.

To run the benchmark, you should be able to say:

	cd src
	make results

If you want to see how you did compared to the other system results included here, say

	make see

Be warned that many of these benchmarks are sensitive to other things being run on the system, mainly from CPU cache and CPU cycle effects.  So make sure your screen saver is not running, etc.

It's a good idea to do several runs and compare the output like so

	make results
	make rerun
	make rerun
	make rerun
	cd Results && make LIST=<your OS>/*
lmbench-3.0-a9/hbench-REBUTTAL0000664000076400007640000002331707045412511015564 0ustar staelinstaelinIn June of 1997, Margo Seltzer and Aaron Brown published a paper in Sigmetrics called "Operating System Benchmarking in the Wake of Lmbench: A Case Study of the Performance of NetBSD on the Intel x86 Architecture".  This paper claims to have found flaws in the original lmbench work.  With the exception of one bug, which we have of course fixed, we find the claims inaccurate, misleading, and petty.  We don't understand what appears to be a pointless attack on something that has obviously helped many researchers and industry people alike.  lmbench was warmly received and is widely used and referenced.

We stand firmly behind the work and results of the original benchmark.  We continue to improve and extend the benchmark.  Our focus continues to be on providing a useful, accurate, portable benchmark suite that is widely used.  As always, we welcome constructive feedback.

To ease the concerns of gentle benchmarkers around the world, we have spent at least 4 weeks reverifying the results.  We modified lmbench to eliminate any effects of

	. clock resolution
	. loop overhead
	. timing interface overhead

Our prediction was that this would not make any difference and our prediction was correct.  All of the results reported in lmbench 1.x are valid except the file reread benchmark which may be 20% optimistic on some platforms.

We've spent a great deal of time and energy, for free, at the expense of our full time jobs, to address the issues raised by hbench.  We feel that we were needlessly forced into a lose/lose situation of arguing with a fellow researcher.  We intend no disrespect towards their work, but did not feel that it was appropriate for what we see as incorrect and misleading claims to go unanswered.  We wish to move on to the more interesting and fruitful work of extending lmbench in substantial ways.

Larry McVoy & Carl Staelin, June 1997

--------------------------------------------------------------------------

Detailed responses to their claims:

Claim 1: "it did not have the statistical rigor and self-consistency needed for detailed architectural studies"

Reply: This is an unsubstantiated claim.  There are no numbers which back up this claim.

Claim 2: "with a reasonable compiler, the test designed to read and touch data from the file system buffer cache never actually touched the data"

Reply: Yes, this was a bug in lmbench 1.0.  It has been fixed.  On platforms such as a 120 Mhz Pentium, we see a change of 20% in the results, i.e., without the bug fix it is about 20% faster.

Claim 3: This is a multi-part claim:

a) gettimeofday() is too coarse.

Reply: The implication is that there are a number of benchmarks in lmbench that finish in less time than the clock resolution with correspondingly incorrect results.  There is exactly one benchmark, TCP connection latency, where this is true and that is by design, not by mistake.
	All other tests run long enough to overcome 10ms clocks (most modern
	clocks have microsecond resolution).

	Seltzer/Brown point out that lmbench 1.x couldn't accurately measure
	the L1/L2 cache bandwidths.  lmbench 1.x didn't attempt to report
	L1/L2 cache bandwidths, so it would seem a little unreasonable to
	imply inaccuracy in something the benchmark didn't measure.  It's
	not hard to get this right, by the way; we do so handily in
	lmbench 2.0.

	b) TCP connection latency is reported as 0 on the DEC Alpha.

Reply:	We could have easily run the TCP connection latency benchmark in a
	loop long enough to overcome the clock resolution.  We were, and
	are, well aware of the problem on DEC Alpha boxes.

	We run only a few iterations of this benchmark because the benchmark
	causes a large number of sockets to get stuck in TIME_WAIT, part of
	the TCP shutdown protocol.  Almost all protocol stacks degrade
	somewhat in performance when there are large numbers of old sockets
	in their queues.  We felt that showing the degraded performance was
	not representative of what users would see.  So we run only a small
	number (about 1000) of iterations and report the result.

	We do not consider changing the benchmark to be the correct answer;
	DEC needs to fix its clocks if it wishes to see accurate results for
	this test.  We would welcome a portable solution to this problem.
	Reading hardware-specific cycle counters is not portable.

Claim 4: "lmbench [..] was inconsistent in its statistical treatment of the
	data" ... "The most-used statistical policy in lmbench is to take
	the minimum of a few repetitions of the measurement"

Reply:	Both of these claims are false, as can be seen by a quick inspection
	of the code.  The most commonly used timing method (16/19 tests use
	this) is

		start_timing
		do the test N times
		stop_timing
		report results in terms of duration / N

	In fact, the /only/ case where a minimum is used is in the context
	switch test.

	The claim goes on to argue that taking the minimum causes incorrect
	results in the case of the context switch test.  This is another
	unsupportable claim, and one that shows a clear lack of understanding
	of the context switch test.  The real issue is cache conflicts due to
	page placement in the cache.  Page placement is not under our
	control; it is under the control of the operating system.  We did
	not, and do not, subscribe to the theory that one should use better
	``statistical methods'' to eliminate the variance in the context
	switch benchmark.  The variance is what actually happened, and it is
	what happens to real applications.

	The authors also claim "if the virtually-contiguous pages of the
	buffer are randomly assigned to physical addresses, as they are in
	many systems, ... then there is a good probability that pages of the
	buffer will conflict in the cache".  We agree with the second part
	but heartily disagree with the first.  It's true that NetBSD doesn't
	solve this problem.  It doesn't follow that others don't.  Any
	vendor-supplied operating system that didn't do careful page
	placement on a direct-mapped L2 cache would suffer dramatically
	compared to its competition.  We know for a fact that Solaris, IRIX,
	and HPUX do this.

	A final claim is that they produced a modified version of the context
	switch benchmark that does not have the variance of the lmbench
	version.  We could not confirm this.  We ran that benchmark on an
	SGI MP and saw the same variance as the original benchmark.
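	As an illustration of the timing method described above (and of why
	a coarse gettimeofday() clock is not a problem for tests that loop),
	here is a minimal, hedged sketch of the start/loop/stop pattern.
	This is not the lmbench source; the use of getpid() as the measured
	operation and the iteration count N are placeholders chosen only for
	this example.

		/*
		 * Sketch of the timing pattern: time N iterations of a
		 * cheap operation and report duration / N.  getpid() is a
		 * stand-in for whatever is being measured.
		 */
		#include <stdio.h>
		#include <sys/time.h>
		#include <unistd.h>

		int
		main(void)
		{
			struct timeval	start, stop;
			int		i, N = 1000000;
			double		usecs;

			gettimeofday(&start, (struct timezone *)0);
			for (i = 0; i < N; i++) {
				getpid();	/* the measured operation */
			}
			gettimeofday(&stop, (struct timezone *)0);
			usecs = (stop.tv_sec - start.tv_sec) * 1000000.0
			    + (stop.tv_usec - start.tv_usec);

			/* report results in terms of duration / N */
			printf("%.4f usecs per iteration\n", usecs / N);
			return 0;
		}

	Because the loop runs long enough that the total elapsed time is
	many times the clock resolution, even a 10ms clock yields an
	accurate per-iteration result.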
Claim 5: "The lmbench bandwidth tests use inconsistent methods of accessing
	memory, making it hard to directly compare the results of, say
	memory read bandwidth with memory write bandwidth, or file reread
	bandwidth with memory copy bandwidth" ... "On the Alpha processor,
	memory read bandwidth via array indexing is 26% faster than via
	pointer indirection; the Pentium Pro is 67% faster when reading with
	array indexing, and an unpipelined i386 is about 10% slower when
	writing with pointer indirection"

Reply:	In reading that, it would appear that they are suggesting that their
	numbers are up to 67% different from the lmbench numbers.  We can
	only assume that this was deliberately misleading.  Our results are
	identical to theirs.  How can this be?

	. We used array indexing for reads; so did they.  They /implied/
	  that we did it differently, when in fact we use exactly the same
	  technique.  They get about 87MB/sec on reads on a P6; so do we.
	  We challenge the authors to demonstrate the implied 67% difference
	  between their numbers and ours.  In fact, we challenge them to
	  demonstrate a 1% difference.

	. We use pointers for writes exactly because we wanted comparable
	  numbers.  The read case is a load and an integer add per word.
	  If we used array indexing for the stores, it would be only a store
	  per word.  On older systems, the stores can appear to go faster
	  because the load/add is slower than a single store.

	While the authors did their best to confuse the issue, the results
	speak for themselves.  We coded up the write benchmark our way and
	their way.  Results for an Intel P6:

			pointer	array	difference
		L1 $	587	710	18%
		L2 $	414	398	4%
		memory	53	53	0%

Claim 5a: The harmonic mean stuff.

Reply:	They just don't understand modern architectures.  The harmonic mean
	theory is fine if and only if the processor can't do two things at
	once.  Many modern processors can indeed do more than one thing at
	once (the concept is known as superscalar execution), and that
	capability extends to the load/store units.  If the processor
	supports both outstanding loads and outstanding stores, the harmonic
	mean theory fails.

Claim 6: "we modified the memory copy bandwidth to use the same size data
	types as the memory read and write benchmark (which use the
	machine's native word size); originally, on 32-bit machines, the
	copy benchmark used 64-bit types whereas the memory read/write
	bandwidth tests used 32-bit types"

Reply:	The change was to use 32-bit types for bcopy.  On even relatively
	modern systems, such as a 586, this change has no impact; the
	benchmark is bound by the memory subsystem.  On older systems, the
	use of multiple load/store instructions, as required for the smaller
	types, produced lower results than the memory system could deliver.
	The processor cycles required actually slow down the results.  This
	is still true today for in-cache numbers.  For example, an R10K
	shows L1 cache bandwidths of 750MB/sec and 377MB/sec with 64-bit vs.
	32-bit loads.  It was our intention to show the larger number, and
	that requires the larger types.

	Perhaps because the authors have not ported their benchmark to
	non-Intel platforms, they have not noticed this.  The Intel platform
	does not have native 64-bit types, so it does two load/stores for
	what C says is a 64-bit type.  Just because it makes no difference
	on Intel does not mean it makes no difference elsewhere.
lmbench-3.0-a9/results/0000775000076400007640000000000010723011655014666 5ustar staelinstaelinlmbench-3.0-a9/results/Makefile0000664000076400007640000002321010306552274016330 0ustar staelinstaelin# Makefile for lmbench results.
# $Id$
#
# Usage: make [ LIST="aix/* sunos/* ..." ] [ what ]
#
# What to make:
#	print		Prints the results 1 per page.
#	ps		Saves the postscript of 1 per page in PS/PS
#	4.ps		Saves the postscript of 4 per page in PS/PS4
#	8.ps		Saves the postscript of 8 per page in PS/PS8
#	x		Previews 1 per page using groff -X
#	summary		[default] Ascii summary of the results
#	stats		Do statistics over a set of results
#	roff		Print the ascii summaries into a roff file
#	slides		Makes the pic for inclusion in slides
#
# This Makefile requires groff, gpic, and perl.  You could try it with
# other *roff processors; I have no idea if it works.
#
# XXX - this is all out of date.
#
# There are three sorts of graphical results:
#
# 1. Bargraphs comparing each system in the LIST on the measurements listed
#    in the BG list below (pretty much everything).
# 2. A 2-D graph for each system in LIST, displaying context switch times
#    as a function of (# of processes, size of each process).
# 3. A 2-D graph for each system in LIST, displaying memory read times as
#    a function of (stride size, memory size).
#
# The bargraphs are in a format of my own - the perl script in scripts
# called bargraph takes them as input and produces pic as output.
# It is a pretty straightforward format; you could probably incorporate it
# into some Windows spreadsheet if you wanted to.  See tmp/*.bg after
# running make in this directory.
#
# The 2-D graphs are in a format that can (probably) be read by Xgraph.
# I've added a few extensions for titles, etc., that you could just
# take out.  See tmp/mem.* after running a make in this directory.
#
# This Makefile is of marginal usefulness to a site with just one machine.
# I intend to make results available so that people can compare, and to
# provide a service where you can compare your results against the "best
# of the breed" for each vendor as well as against the best of the lot.

# List of result files to process.  Defaults to everything.
LIST= `$(SCRIPTS)getlist $(LST)` # Grrrrr SHELL=/bin/sh SCRIPTS=../scripts/ SRCS= ../scripts/allctx ../scripts/allmem ../scripts/bargraph \ ../scripts/bghtml ../scripts/getbg ../scripts/getbw \ ../scripts/getctx ../scripts/getdisk ../scripts/getlist \ ../scripts/getmax ../scripts/getmem ../scripts/getpercent \ ../scripts/getresults ../scripts/getsummary ../scripts/gifs \ ../scripts/graph ../scripts/html-list ../scripts/html-man \ ../scripts/os ../scripts/percent ../scripts/save \ ../scripts/stats ../scripts/xroff MISC= tmp/misc_mhz.bg \ tmp/lat_ctx.bg \ tmp/lat_ctx8.bg \ tmp/lat_nullsys.bg \ tmp/lat_signal.bg \ tmp/lat_pagefault.bg \ tmp/lat_mappings.bg \ tmp/lat_fs_create.bg PROC= tmp/lat_nullproc.bg \ tmp/lat_simpleproc.bg \ tmp/lat_shproc.bg LATENCY= \ tmp/lat_pipe.bg \ tmp/lat_connect.bg \ tmp/lat_udp_local.bg \ tmp/lat_rpc_udp_local.bg \ tmp/lat_tcp_local.bg \ tmp/lat_rpc_tcp_local.bg BANDWIDTH= \ tmp/bw_pipe.bg \ tmp/bw_tcp_local.bg \ tmp/bw_file.bg \ tmp/bw_reread.bg \ tmp/bw_mmap.bg \ tmp/bw_bcopy_libc.bg \ tmp/bw_bcopy_unrolled.bg \ tmp/bw_mem_rdsum.bg \ tmp/bw_mem_wr.bg BG= $(MISC) $(PROC) $(LATENCY) $(BANDWIDTH) MK=@$(MAKE) -s PRINT=groff -p | lpr -h PS=groff -p | $(SCRIPTS)save PS/PS PS8UP=groff -p | mpage -P- -8 -a | $(SCRIPTS)save PS/PS8 PS4UP=groff -p | mpage -P- -4 -a | $(SCRIPTS)save PS/PS4 SIZE=-big IMAGE=pbm CLOSE= GMEM=$(CLOSE) -grid -logx -xm -below GCTX=$(CLOSE) -grid -below GDISK=-below -close -grid -nolines #IMAGE=gifmono summary: $(SRCS) @$(SCRIPTS)getsummary $(LIST) percent: $(SRCS) @$(SCRIPTS)getpercent $(LIST) stats: $(SRCS) $(SCRIPTS)getsummary $(LIST) | $(SCRIPTS)percent roff: echo .nf > summary.roff echo .ft CB >> summary.roff echo .ps 12 >> summary.roff echo .po .35i >> summary.roff echo .sp .5i >> summary.roff make LIST="$(LIST)" summary >> summary.roff echo .bp >> summary.roff echo .sp .5i >> summary.roff make LIST="$(LIST)" percent >> summary.roff list: @echo $(LIST) print: ctx mem disk bwfile bwmem 8: $(MK) LIST="$(LIST)" PRINT="groff -p | mpage -P -8 -a | lpr -h" print 8.ps 8ps 8up: $(MK) LIST="$(LIST)" PRINT="$(PS8UP)" print 4.ps 4ps 4up: $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" print ps: $(MK) LIST="$(LIST)" PRINT="$(PS)" print smallps: $(MK) LIST="$(LIST)" SIZE= PRINT="groff -p | $(SCRIPTS)save PS/smallPS" print x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" print ctx.x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" ctx ctx.ps4: $(MK) LIST="$(LIST)" PRINT="$(PS4UP)" ctx mem.x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" mem disk.x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" disk bwfile.ps: $(MK) LIST="$(LIST)" PRINT="$(PS)" bwfile bwfile.x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwfile bwmem.ps: $(MK) LIST="$(LIST)" PRINT="$(PS)" bwmem bwmem.x: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" bwmem smallx: $(MK) LIST="$(LIST)" PRINT="$(SCRIPTS)xroff -p" SIZE= print slides: $(MK) LIST="$(LIST)" SIZE=-slide bargraphs.slides ctx.slides mem.slides paper: $(MK) LIST="$(LIST)" tbl.paper ctx.paper mem.paper # XXX - this has to be made incremental, doing everything over from # scratch makes you want a Ghz machine. 
html: dirs -make clean #$(SCRIPTS)bghtml $(BG) $(SCRIPTS)html-list $(LIST) $(MK) LIST="$(LIST)" summary > HTML/summary.out 2> HTML/summary.errs #make LIST="$(LIST)" percent > HTML/percent.out 2> HTML/percent.errs $(MK) LIST="$(LIST)" SIZE= PRINT="$(PS)" \ GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print $(MK) LIST="$(LIST)" SIZE= NOOP=-noop PRINT="$(PS)" \ GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print gs -sOutputFile=HTML/ctx%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS < /dev/null gs -sOutputFile=HTML/mem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.1 < /dev/null gs -sOutputFile=HTML/disk%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.2 < /dev/null gs -sOutputFile=HTML/bwfile%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.3 < /dev/null gs -sOutputFile=HTML/bwmem%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.4 < /dev/null gs -sOutputFile=HTML/ctx-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.5 < /dev/null gs -sOutputFile=HTML/mem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.6 < /dev/null gs -sOutputFile=HTML/bwfile-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.7 < /dev/null gs -sOutputFile=HTML/bwmem-unscaled%02d.$(IMAGE) -sDEVICE=$(IMAGE) -q -dNOPAUSE PS/PS.8 < /dev/null $(SCRIPTS)/gifs rm HTML/*.pbm HTML/___tmp* htmltest: dirs -make clean #$(SCRIPTS)bghtml $(BG) $(SCRIPTS)html-list $(LIST) $(MK) LIST="$(LIST)" summary > HTML/summary.out 2> HTML/summary.errs #make LIST="$(LIST)" percent > HTML/percent.out 2> HTML/percent.errs $(MK) LIST="$(LIST)" SIZE= PRINT="$(PS)" \ GMEM="$(GMEM) -cut -gthk1" GCTX="$(GCTX) -cut -gthk1" print bghtml: $(SCRIPTS)bghtml $(BG) html-list: $(SCRIPTS)html-list $(LIST) ctx: dirs $(SCRIPTS)getctx $(LIST) > tmp/FILES @if [ -s tmp/FILES ]; \ then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ for i in `cat tmp/FILES`; \ do $(SCRIPTS)graph $(SIZE) $(GCTX) $$i; \ echo .bp; \ done | sed '$$d' | $(PRINT); \ else echo No context switch data in $(LIST); \ fi disk: dirs if [ X$(NOOP) = X ]; then \ $(SCRIPTS)getdisk $(LIST) > tmp/FILES; \ if [ -s tmp/FILES ]; \ then for i in `cat tmp/FILES`; \ do $(SCRIPTS)graph $(SIZE) $(GDISK) $$i; \ echo .bp; \ done | sed '$$d' | $(PRINT); \ else echo No disk data in $(LIST); \ fi; \ fi mem: dirs $(SCRIPTS)getmem $(LIST) > tmp/FILES if [ -s tmp/FILES ]; \ then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ for i in `cat tmp/FILES`; \ do $(SCRIPTS)graph $(SIZE) $(GMEM) -nomarks $$i; \ echo .bp; \ done | sed '$$d' | $(PRINT); \ else echo No memory latency data in $(LIST); \ fi bwfile: dirs $(SCRIPTS)getbw $(LIST) > tmp/FILES if [ -s tmp/FILES ]; \ then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ for i in `cat tmp/FILES`; \ do $(SCRIPTS)graph $(SIZE) $(GMEM) -logy $$i; \ echo .bp; \ done | sed '$$d' | $(PRINT); \ else echo No file bandwidth data in $(LIST); \ fi bwmem: dirs $(SCRIPTS)getbw -all $(LIST) > tmp/FILES if [ -s tmp/FILES ]; \ then $(SCRIPTS)getmax $(NOOP) -graph `cat tmp/FILES`; \ for i in `cat tmp/FILES`; \ do $(SCRIPTS)graph -halfgrid -gthk_5 -thk2 -medium \ -nomarks -nolabels -grapheach $(GMEM) \ -logy %P="'`basename $$i`'" $$i; \ echo .bp; \ done | sed '$$d' | $(PRINT); \ else echo No memory bandwidth data in $(LIST); \ fi tbl.paper: $(SCRIPTS)getbg -paper $(LIST) bargraphs.1st: dirs $(SCRIPTS)getbg -nosort $(LIST) #$(SCRIPTS)getmax -v $(PROC) #$(SCRIPTS)getmax -v $(LATENCY) #$(SCRIPTS)getmax -v -half $(BANDWIDTH) bargraphs: bargraphs.1st for i in $(BG); \ do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i; \ echo .bp; \ done 
| sed '$$d' | $(PRINT) bargraphs.slides: bargraphs.1st for i in $(BG); \ do $(SCRIPTS)bargraph $(SIZE) -nobox -sideways $$i > $${i}.pic; \ done bargraphs.8up: bargraphs.1st for i in $(BG); \ do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ echo .bp; \ done | sed '$$d' | $(PS8UP) latency.8up: bargraphs.1st for i in $(LATENCY); \ do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ echo .bp; \ done | sed '$$d' | $(PS8UP) bw.8up: bargraphs.1st for i in $(BANDWIDTH); \ do $(SCRIPTS)bargraph -sideways $(SIZE) -nobox $$i; \ echo .bp; \ done | sed '$$d' | $(PS8UP) get: # nothing to do clean: /bin/rm -f PS/* GIF/* HTML/* tmp/* summary.roff -bk clean distclean: /bin/rm -fr PS GIF HTML tmp summary.roff dirs: @if [ ! -d tmp ]; then mkdir tmp; fi @if [ ! -d PS ]; then mkdir PS; fi @if [ ! -d HTML ]; then mkdir HTML; fi lmbench-3.0-a9/scripts/0000775000076400007640000000000010723011657014656 5ustar staelinstaelinlmbench-3.0-a9/scripts/Makefile0000775000076400007640000000010410306552274016316 0ustar staelinstaelin# Makefile for lmbench scripts subdir. #$Id$ get: get -s clean: lmbench-3.0-a9/scripts/README0000775000076400007640000000054407045412511015541 0ustar staelinstaelin$Id$ This directory contains scripts used to generate or post process lmbench output. You probably do not want to be here or run these by hand, the Makefiles in ../src and ../results invoke these. There are some useful scripts here, however, in particular the graphing scripts. If you are interested in groff graphing tools, check out ../doc/*graph.1. lmbench-3.0-a9/scripts/SHIT0000775000076400007640000003747307045412511015366 0ustar staelinstaelin # Go find perl if we are running this as a shell script. eval 'exec perl -Ssw $0 "$@"' if 0; # Mimic the BSD tool, sccs, for RCS. # $Id$ # # Note - this reflects a lot of my personal taste. I'll try and list the # important differences here: # # A bunch of unused commands are not implemented. It is easy to add them, # mail me if you want me to add something. Please include a spec of what # you want the command to do. Mail lm@engr.sgi.com. # # I look at RCS file internals and know about certain fields as of revision # 5.x. # # This interface does not require a list of files/directories for most # commands; the implied list is *,v and/or RCS/*,v. Destructive commands, # such as clean -f, unedit, unget, do *not* have an implied list. In # other words, # rccs diffs is the same as rccs diffs RCS # but # rccs unedit is not the same as rccs unedit RCS # # If you add (potentially) destructive commands, please check for # them in main() and make sure that the autoexpand does not happen. # # TODO: # Make it so that you can pass a list of files/dirs via stdin. # # It might be nice to have all the "system" args printed out in # verbose and/or learn mode. Depends on whether you want people # to learn RCS or not. &init; &main; sub init { $0 =~ s|.*/||; # Add commands here so that -w shuts up. $lint = 0; &clean() && &create() && &example() && &get() && &edit() && &unedit() && &unget() && &diffs() && &delta() && &help() && &prs() && &prt() && &deledit() && &delget() && &enter() && &info() && &ci() && &co() && &fix() && &print() if $lint; } sub help { if ($#_ == -1) { &usage; } # Handle all the aliases. 
if ($_[0] eq "unedit" || $_[0] eq "unget") { &help("clean"); } elsif ($_[0] eq "clean") { } warn "Extended help on @_ not available yet.\n"; } sub usage { print < use as the description message (aka -d) delta - check in a revision -y use as the log message (aka -d) -s diffs - diff the working file against the RCS file fix - redit the last revision get - get the working file[s] (possibly for editing) history - print history of the files print - print the history and the latest contents Alias Real command Effect ----- ------------ ------ ci - delta check in a revision co - get check out a revision enter - create -g initialize a file without a get afterward unedit - clean -f remove working file even if modified unget - clean -f remove working file even if modified edit - get -e check out the file for editing prs - history print change log history prt - history print change log history An implied list of *,v and/or RCS/*,v is implied for most commands. The exceptions are commands that are potentially destructive, such as unedit. EOF exit 0; } sub main { local($cmd); local(@args); local(@comma_v); $cmd = "oops"; $cmd = shift(@ARGV) if $#ARGV > -1; &help(@ARGV) if $cmd eq "help" || $cmd eq "oops"; $dir_specified = $file_specified = 0; foreach $_ (@ARGV) { # If it is an option, just pass it through. if (/^-/) { push(@args, $_); } # If they specified an RCS directory, explode it into ,v files. elsif (-d $_) { $dir_specified = 1; warn "Exploding $_\n" if $debug; push(@args, grep(/,v$/, &filelist($_))); push(@args, grep(/,v$/, &filelist("$_/RCS"))); } # If it is a file, make it be the ,v file. else { if (!/,v$/) { # XXX - what if both ./xxx,v and ./RCS/xxx,v? if (-f "$_,v") { $_ .= ",v"; } else { if (m|/|) { m|(.*)/(.*)|; $f = "$1/RCS/$2,v"; } else { $f = "RCS/$_,v"; } if (-f $f) { $_ = $f; } } } if (-f $_) { $file_specified = 1; warn "Adding $_\n" if $debug; push(@args, $_); } else { warn "$0: skipping $_, no RCS file.\n"; } } } # Figure out if it is a potentially destructive command. These # commands do not automagically expand *,v and RCS/*,v. $destructive = ($cmd eq "clean" && $args[0] eq "-f") || $cmd eq "unedit" || $cmd eq "unget"; # If they didn't specify a file or a directory, generate a list # of all ./*,v and ./RCS/*,v files. unless ($destructive || $dir_specified || $file_specified) { warn "Exploding . && ./RCS\n" if $debug; push(@args, grep(/,v$/, &filelist("."))); push(@args, grep(/,v$/, &filelist("RCS"))); } unless ($cmd =~ /^create$/) { @comma_v = grep(/,v$/, @args); if ($#comma_v == -1) { ($s = "$cmd @ARGV") =~ s/\s+$//; die "$0 $s: No RCS files specified.\n"; } } # Exit codes: # 0 - it worked # 1 - unspecified error # 2 - command unknown $exit = 2; warn "Trying &$cmd(@args)\n" if $debug; eval(&$cmd(@args)); if ($exit == 2) { warn "Possible unknown/unimplemented command: $cmd\n"; &usage; } else { exit $exit; } } # Read the directory and return a list of files. # XXX - isn't there a builtin that does this? sub filelist { local(@entries) = (); local($ent); opendir(DFD, $_[0]) || return (); foreach $ent (readdir(DFD)) { $ent = "$_[0]/$ent"; next unless -f $ent; push(@entries, $ent); } warn "filelist($_[0]): @entries\n" if $debug; @entries; } # Take a list of ,v files and return a list of associated working files. sub working { local(@working, $working) = (); foreach $comma_v (@_) { # Strip the ,v. # Strip the RCS specification. 
($working = $comma_v) =~ s|,v$||; $working =~ s|RCS/||; push(@working, $working); } @working; } # Same as "clean -f" - throw away all changes sub unedit { &clean("-f", @_); } sub unget { &clean("-f", @_); } # Get rid of everything that isn't edited and has an associated RCS file. # -e remove edited files that have not been changed. # -f remove files that are edited with changes (CAREFUL!) # This implies the -e opt. # -d Check in files that have been modified. If no message, prompt # on each file. This implies -e. # -y Like -d for people that are used to SCCS. # -m Like -d for people that are used to RCS. # # Note: this does not use rcsclean; I don't know when that showed up. And # the 5.x release of RCS I have does not install it. sub clean { local(@working); local($e_opt, $f_opt, $d_opt, $s_opt) = (0,0,0,0); local($msg); local(@checkins) = (); while ($_[0] =~ /^-/) { if ($_[0] eq "-s") { $s_opt = 1; shift(@_); } elsif ($_[0] eq "-e") { $e_opt = 1; shift(@_); } elsif ($_[0] eq "-f") { $f_opt = $e_opt = 1; shift(@_); } elsif ($_[0] =~ /^-[dym]/) { $d_opt = $e_opt = 1; if ($_[0] =~ /^-[dym]$/) { $msg = $_[0]; } else { ($msg = $_[0]) =~ s/-[ydm]//; $msg = "-m'" . $msg . "'"; } shift(@_); } else { die "$0 clean: unknown option: $_[0]\n"; } } @working = &working(@_); for ($i = 0; $i <= $#_; ++$i) { # No working file? if (!-f $working[$i]) { warn "No working file $working[$i] for $_[$i]\n" if $debug; next; } # Read only? Unlink. if (!-w $working[$i]) { warn "rm $working[$i]\n" unless $s_opt; # Make sure there is an RCS file if (-f $_[$i]) { # XXX - what if ro and edited? unlink($working[$i]) unless $n; } else { warn "clean: no RCS file for $working[$i]\n"; } next; } # If they just want to know about it, tell them. if ($e_opt == 0) { open(RCS, $_[$i]); while ($r = ) { last if $r =~ /locks/; } @locks = (); while ($r = ) { # XXX - I use "comment" a delimiter. last if $r =~ /comment/; $r =~ s/^\s+//; chop($r); push(@locks, $r); } close(RCS); if ($#locks > -1) { warn "$working[$i]: being edited: @locks\n"; } else { warn "$working[$i]: " . "writeable but not edited?!?\n"; } next; } # See if there have actually been any changes. # Notice that this is cmp(1) in about 10 lines of perl! open(RCS, "co -q -p -kkvl $_[$i] |"); open(WORK, $working[$i]); $diff = 0; while ($r = ) { unless (($w = ) && ($r eq $w)) { $diff = 1; last; } } if ($w = ) { $diff = 1; } close(RCS); close(WORK); if ($diff) { if ($f_opt) { warn "Clean modified $working[$i]\n" unless $s_opt; unless ($n) { unlink($working[$i]); system "rcs -q -u $_[$i]"; } } elsif ($d_opt) { push(@checkins, $_[$i]); } else { warn "Can't clean modified $working[$i]\n"; } next; } else { warn "rm $working[$i]\n" unless $s_opt; unless ($n) { unlink($working[$i]); system "rcs -q -u $_[$i]"; } } } # Handle files that needed deltas. if ($#checkins > -1) { warn "ci -q $msg @checkins\n" if $verbose; system "ci -q $msg @checkins"; } $exit = 0; } # Create - initialize the RCS file # -y - use as the description message for all files. # -d - use as the description message for all files. # -g - don't do the get # # Differs from sccs in that it does not preserve the original # files (I never found that very useful). sub create { local($arg, $noget, $description, $cmd) = ("", "", ""); foreach $arg (@_) { # Options... if ($arg =~ /^-[yd]/) { ($description = $arg) =~ s/^-[yd]//; $arg = ""; warn "Desc: $description\n" if $debug; next; } if ($arg eq "-g") { $noget = "yes"; $arg = ""; next; } next if ($arg =~ /^-/); # If no RCS subdir, make one. 
if ($arg =~ m|/|) { # full path ($dir = $arg) =~ s|/[^/]+$||; mkdir("$dir/RCS", 0775); } else { # in $CWD mkdir("RCS", 0775); } } $exit = 0; if ($description ne "") { $cmd = "ci -t-'$description' @_"; } else { $cmd = "ci @_"; } warn "$cmd\n" if $verbose; system "$cmd"; system "co @_" unless $noget; } # Like create without the get. sub enter { &create("-g", @_); } # Edit - get the working file editable sub edit { &get("-e", @_); } # co - normal RCS sub co { &get(@_); } # Get - get the working file # -e Retrieve a version for editing. # Same as co -l. # -p Print the file to stdout. # -k Suppress expansion of ID keywords. # Like co -kk. # -s Suppress all output. # # Note that all other options are passed to co(1). sub get { local($arg, $working, $f, $p); $f = $p = 0; foreach $arg (@_) { # Options... $arg = "-l" if ($arg eq "-e"); $arg = "-kk" if ($arg eq "-k"); $arg = "-q" if ($arg eq "-s"); $f = 1 if ($arg eq "-f"); $p = 1 if ($arg eq "-p"); # XXX - what if -sp? next if $arg =~ /^-/ || $p; # Check for writable files and skip them unless someone asked # for co's -f option. ($working = $arg) =~ s|,v$||; $working =~ s|RCS/||; if ((-w $working) && $f == 0) { warn "ERROR [$arg]: writable `$working' exists.\n"; $arg = ""; } } @files = grep(/,v/, @_); if ($#files == -1) { warn "$0 $cmd: no files to get. @_\n"; $exit = 1; } else { system "co @_"; $exit = 0; } } # Aliases for history. sub prt { &history(@_); } sub prs { &history(@_); } # History - change history sub command sub history { local(@history); open(RL, "rlog @_|"); # Read the whole history while ($r = ) { # Read the history for one file. if ($r !~ /^[=]+$/) { push(@history, $r); next; } &print_history(@history); @history = (); } close(RL); print "+-----------------------------------\n"; $exit = 0; } sub print_history { for ($i = 0; $i <= $#_; ++$i) { # Get the one time stuff if ($_[$i] =~ /^RCS file:/) { $_[$i] =~ s/RCS file:\s*//; chop($_[$i]); print "+------ $_[$i] -------\n|\n"; } # Get the history if ($_[$i] =~ /^----------------------------/) { local($rev, $date, $author, $lines) = ("", "", "", ""); $i++; die "Bad format\n" unless $_[$i] =~ /revision/; $_[$i] =~ s/revision\s+//; chop($_[$i]); $rev = $_[$i]; $i++; die "Bad format\n" unless $_[$i] =~ /date/; @parts = split(/[\s\n;]+/, $_[$i]); for ($j = 0; $j <= $#parts; $j++) { if ($parts[$j] =~ /date/) { $j++; $date = "$parts[$j] "; $j++; $date .= "$parts[$j]"; } if ($parts[$j] =~ /author/) { $j++; $author = $parts[$j]; } if ($parts[$j] =~ /lines/) { $j++; $lines = "$parts[$j] "; $j++; $lines .= "$parts[$j]"; } } print "| $rev $date $author $lines\n"; while ($_[++$i] && $_[$i] !~ /^----------------------------/) { print "| $_[$i]"; ### unless $rev =~ /^1\.1$/; } print "|\n"; $i--; } } } # Show changes between working file and RCS file # # -C -> -c for compat with sccs (not sure if this is needed...). sub diffs { local(@working); local($diff) = "diff"; local($rev) = ""; while ($_[0] =~ /^-/) { if ($_[0] eq "-C") { $diff .= " -c"; shift(@_); } elsif ($_[0] =~ /^-r/) { $rev = $_[0]; shift(@_); } elsif ($_[0] eq "-sdiff") { # XXX - screen size $diff = "sdiff -w80"; shift(@_); } else { $diff .= " $_[0]"; shift(@_); } } @working = &working(@_); for ($i = 0; $i <= $#_; ++$i) { # No working file? if (!-f $working[$i]) { warn "No working file $working[$i] for $_[$i]\n" if $debug; next; } # Read only? Skip. next unless (-w $working[$i]); # Show the changes print "\n------ $working[$i]$rev ------\n"; fflush(stdout); # XXX - flush stdout. 
if ($diff =~ /^sdiff/) { system "co -q -p -kkvl $rev $_[$i] > /tmp/sdiff.$$" . "&& $diff /tmp/sdiff.$$ $working[$i]"; # XXX - interrupts? unlink("/tmp/sdiff.$$"); } else { system "co -q -p -kkvl $rev $_[$i] |" . " $diff - $working[$i]"; } } $exit = 0; } # delta - check in the files sub delta { local($description) = (""); local($i, @working); @working = &working(@_); for ($i = 0; $i <= $#_; ++$i) { # Options... if ($_[$i] =~ /^-[yd]/) { ($description = $_[$i]) =~ s/^-[yd]/-m/; $description = "'" . $description . "'"; $_[$i] = ""; next; } $_[$i] = "-q" if $_[$i] eq "-s"; $_[$i] = "" unless -f $working[$i]; } $exit = 0; warn "ci $description @_\n" if $verbose; system "ci $description @_"; } # Allow RCS interface ci sub ci { &delta(@_); } # delget sub delget { &delta(@_); &get(@_); # If there was a description, delta nuked it... } # deledit sub deledit { &delta(@_); &get("-e", @_); # If there was a description, delta nuked it... } # info - who is editing what sub info { local(@working); @working = &working(@_); for ($i = 0; $i <= $#_; $i++) { open(RCS, $_[$i]); while ($r = ) { last if $r =~ /locks/; } @locks = (); while ($r = ) { # XXX - I use "comment" a delimter. last if $r =~ /comment/; $r =~ s/^\s+//; chop($r); push(@locks, $r); } close(RCS); if ($#locks > -1) { warn "$working[$i]: being edited: @locks\n"; } } $exit = 0; } # Fix - fix the last change to a file sub fix { foreach $f (@_) { next unless -f $f; open(F, $f); while () { last if /head\s\d/; } close(F); unless ($_ && /head/) { warn "$0 $cmd: No head node found in $f\n"; next; } s/head\s+//; chop; chop; $rev = $_; ($working = $f) =~ s/,v//; $working =~ s|RCS/||; system "co -q $f && rcs -o$rev $f && rcs -l $f && chmod +w $working"; } $exit = 0; } # print - print the history and the latest revision of the file sub print { local($file); foreach $file (@_) { &history($file); &get("-s", "-p", $file); } $exit = 0; } # Example - example sub command # -Q change this option to -q just to show how. sub example { local($arg, $working); foreach $arg (@_) { # Options... $arg = "-Q" if ($arg eq "-q"); } warn "rlog @_\n" if $verbose; system "rlog @_"; $exit = 0; } RCS bghtml html-list man2html lmbench-3.0-a9/scripts/TODO0000775000076400007640000000013507045412511015345 0ustar staelinstaelinMake graph take a %T and %T2 and put %T above %T2 Or make it take \n in the title and deal. lmbench-3.0-a9/scripts/allctx0000775000076400007640000000255010425064337016077 0ustar staelinstaelin # Extract the context switching information from lmbench result files. # Usage: getctx file file.... # # Hacked into existence by Larry McVoy (lm@sun.com now lm@sgi.com). # Copyright (c) 1994 Larry McVoy. GPLed software. 
# $Id$ eval 'exec perl -Ss $0 "$@"' if 0; $first = 1; foreach $file (@ARGV) { open(FD, $file); $file =~ s|.*/||; $file =~ s/\.\d+//; while () { chop; if (/^\[lmbench/) { @_ = split; if ($_[3] eq "SunOS") { $_[3] .= "-$_[5]"; } $uname = "@_"; } if (/Mhz/) { $mhz = $_; } if (/^.size=/) { s/size/Process size/; s/ ovr/\toverhead/; @info = &getinfo($uname, $mhz); ($f = $file) =~ s|.*/||; print "\n" unless $first; $first = 0; print "%T $info[3] $info[$#info]Mhz\n"; print "$_\n"; while () { last if /^Null/ || /^Pipe/ || /^Memor/; next if /\$Id/; s/ ovr/\toverhead/; s/size/Process size/; print ; } last; } } } exit 0; # Try and create sensible names from uname -a output sub getinfo { local(@info); local($name); local($mhz) = sprintf("%.0f", $_[1]); @info = split(/\s+/, $_[0]); $name = pop(@info); chop($name); if ($name eq "mips") { $name = "$info[$#info]@$mhz"; } elsif ($_[0] =~ /HP-UX/) { $name = "$info[7]@$mhz"; } elsif ($_[0] =~ /SunOS/) { $name = "$info[7]@$mhz"; } else { $name .= "@$mhz"; } push(@info, $name); @info; } lmbench-3.0-a9/scripts/allmem0000775000076400007640000000243110425064337016055 0ustar staelinstaelin # Extract the memory latency graph data from lmbench result files. # # Hacked into existence by Larry McVoy (lm@sun.com now lm@sgi.com). # Copyright (c) 1994 Larry McVoy. GPLed software. # $Id$ eval 'exec perl -Ss $0 "$@"' if 0; # Uses a stride of 128 #print "\"%X Array size\n\"%Y Latency in nanoseconds\n"; foreach $file (@ARGV) { open(FD, $file); $file =~ s|.*/||; while () { chop; if (/^\[lmbench/) { @_ = split; if ($_[3] eq "SunOS") { $_[3] .= "-$_[5]"; } $uname = "@_"; } if (/Mhz/) { $mhz = $_; } if (/^Memory load latency/) { @info = &getinfo($uname, $mhz); ($f = $file) =~ s|.*/||; print "\"$file $info[3] $info[$#info]\n"; while () { next unless /^"stride=128/; last; } while () { if (/^\s*$/) { print "\n"; last; } print; } last; } } } exit 0; # Try and create sensible names from uname -a output sub getinfo { local(@info); local($name); local($mhz) = sprintf("%.0f", $_[1]); @info = split(/\s+/, $_[0]); $name = pop(@info); chop($name); if ($name eq "mips") { $name = "$info[$#info]@$mhz"; } elsif ($_[0] =~ /HP-UX/) { $name = "$info[7]@$mhz"; } elsif ($_[0] =~ /SunOS/) { $name = "$info[7]@$mhz"; } else { $name .= "@$mhz"; } push(@info, $name); @info; } lmbench-3.0-a9/scripts/bargraph0000775000076400007640000002141007045424353016374 0ustar staelinstaelin# $Id$ eval 'exec perl -Ss $0 "$@"' if 0; # A simple bargraph preprocessor for GNU pic / troff package. # Hacked into existence by Larry McVoy (lm@sun.com now lm@sgi.com). # Copyright (c) 1994 Larry McVoy. GPLed software. # # TODO # Make this work with sideways graphs. # # Input format is: # # 3 foo bar # 9 bigger foo # "Silly example # # and output is # # bigger # foo # +----------+ # | | # foo | | # bar | | # +----------+ | | # | | | | # +----------+ +----------+ # ------------------------------- # 3 9 # # Silly example # # Input options: # specifier value default # %ps 10 # %ft HB # %labelgap 1.5 # %xsize 7 # %ysize 6 # %Title n|s none # %titleplus 0 # %label%d