pax_global_header00006660000000000000000000000064150412344400014507gustar00rootroot0000000000000052 comment=512f63021f6f0cec8e5e13be744dba0b4704cfae wtarreau-mhz-512f630/000077500000000000000000000000001504123444000144055ustar00rootroot00000000000000wtarreau-mhz-512f630/LICENSE000066400000000000000000000020611504123444000154110ustar00rootroot00000000000000Copyright (C) 2016-2023 Willy Tarreau Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. wtarreau-mhz-512f630/Makefile000066400000000000000000000002751504123444000160510ustar00rootroot00000000000000CC := gcc CFLAGS := -O3 -Wall -fomit-frame-pointer all: mhz mhz: %: %.o $(CC) $(LDFLAGS) -o $@ $^ %.o: %.c $(CC) $(CFLAGS) $(INCLUDE) -o $@ -c $< clean: rm -f mhz *.o *~ wtarreau-mhz-512f630/README.md000066400000000000000000000144631504123444000156740ustar00rootroot00000000000000# MHz - CPU frequency measurement utility It's fairly common, especially in embedded systems, to be fooled by a CPU frequency not matching the believed one. 
Most of the time the reason stems from missing frequency operation points, but it can also be caused by bogus clock drivers, device tree issues, hardware issues, boot loader issues, firmware issues, different silicon quality, thermal throttling and even (long ago) vendors cheating by only reporting high values in cpufreq. This tool runs long loops of inexpensive arithmetic operations (XOR) that are not optimizable so that we're almost certain that most CPU cores will run them at a rate of one operation per clock cycle, not less, not more. This has been tested on various cores including old 180 MHz MIPS and less old MIPS32/MIPS64, many 32- and 64-bit Arm cores of v5/v7/v8/v9 architectures, various i386 and x86_64 up to 6 GHz, RISC-V, PowerPC, Sparc64 and Alpha EV6, all of them reporting accurate values. The program first runs a self-calibration so that the short loop takes at least 20 milliseconds. Each measure is run 5 times and the lowest measure is returned in order to hide as much as possible latency spikes caused by interrupt processing and context switches. This delay of 20ms is chosen so that it remains accurate on systems with clocks of 100, 250 and 1000 Hz. The longer loop executes 5 times more instructions, so a full sample cannot take less than 120 ms and since 5 samples are needed, a full test takes at least 600ms. Various numbers are reported by default, including the timings (in microseconds) of the 50- and 250-operation loops, and the measured TSC (time stamp counter) frequency, when available. Most modern processors have their TSC running at a fixed frequency which is often higher than the base frequency but lower than the maximum frequency. Knowing the exact frequency is useful when performing clock measurements on code, as the frequencies of the instructions and the TSC rarely match nowadays. 
## Usage The usage message can be obtained with `-h` on the command line: ``` $ ./mhz -h Usage: ./mhz [-h|-c|-t]* [lines [heat [count]]] -h show this help -c show CPU freq only (in MHz) -i report integral frequencies only -t show TSC freq only (in MHz) lines number of measurements (one line per measurement). Def: 1 heat pre-heat time in microseconds. Def: 0 count calibration value, higher is slower but more accurate. Def: auto ``` The currently supported arguments are: - `-c` : will only report the CPU frequency in MHz, not the rest of the fields. - `-t` : will only report the TSC frequency in MHz, not the rest of the fields. - `-i` : round the frequency to the nearest integer value. This is mainly used with `-c` or `-t` for use in scripts. - `lines` : this is the number of measurements to perform before quitting. By default a single measurement is run. Increasing this value makes it possible to collect more values to observe variations, or simply to monitor the CPU frequency in real time. - `heat` : forces the CPU to run a busy loop of that many microseconds before starting measurements. This can be needed on modern CPUs which switch to low-power mode after a few milliseconds of idle, so as not to affect the first measurement by the time it takes to wake from low-power. - `count` : this is the loop count value to be used as a starting point. There is little point in setting it, as it will automatically be adjusted if not sufficient. However it can sometimes be helpful for running multiple tests under the exact same conditions, e.g. by copying the value from a previous run, or by forcing a large value. 
/*
 * NOTE: in this flattened archive dump the end of README.md shares a line
 * with the beginning of mhz.c. The README text is kept verbatim below so that
 * nothing is lost; the C source follows this comment.
 *
 * ## Examples Run 3 measures and exit: ``` $ ./mhz 3 ``` Run 3 measures after a pre-heating period of 500ms: ``` $ ./mhz 3 500000 ``` Show the integer frequencies of all CPUs on the machine: ``` ncpu=$(nproc) for ((i=0; i
 */

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>

enum {
	SAMPLES        = 5,     /* measurements per loop, the fastest one is kept */
	MIN_SHORT_US   = 20000, /* the short loop must last at least 20 ms */
	FAST_GROWTH_US = 10000, /* below 10 ms the count is doubled, else +25% */
	MAX_RETRIES    = 24,    /* up to 16M longer than initial estimate */
	DEFAULT_COUNT  = 1000   /* initial loop count when none is given */
};

/* The measurement loops take an unsigned int while the count is carried
 * around as a long: never let the count exceed what both types can hold,
 * otherwise the loops would run a truncated count.
 */
#if UINT_MAX > LONG_MAX
#define MAX_COUNT LONG_MAX
#else
#define MAX_COUNT ((long)UINT_MAX)
#endif

static int cpu_only; /* -c: only print the CPU frequency */
static int tsc_only; /* -t: only print the TSC frequency */
static int use_ints; /* -i: print integral frequencies */

/* returns current time in microseconds */
static inline unsigned long long microseconds(void)
{
#ifdef CLOCK_MONOTONIC
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (ts.tv_sec * 1000000000ULL + ts.tv_nsec) / 1000ULL;
#else
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return tv.tv_sec * 1000000ULL + tv.tv_usec;
#endif
}

#if defined(__i386__) || defined(__x86_64__)
#define HAVE_RDTSC 1
#endif

/* returns the current time stamp counter, or 0 when RDTSC is not available */
static inline unsigned long long rdtsc(void)
{
#ifdef HAVE_RDTSC
	unsigned int a, d;

	__asm__ __volatile__("rdtsc" : "=a" (a), "=d" (d));
	return a + ((unsigned long long)d << 32);
#else
	return 0;
#endif
}

/* performs read-after-write operations that the CPU is not supposed to be able
 * to parallelize. The "asm" statements are here to prevent the compiler from
 * reordering this code. __asm__ is used instead of asm so that the file also
 * builds in strict ISO C modes.
 */
#define dont_move(var) do { __asm__ __volatile__("" : "=r"(var) : "0" (var)); } while (0)

#define run1cycle_ae() do { a ^= e; dont_move(a); } while (0)
#define run1cycle_ba() do { b ^= a; dont_move(b); } while (0)
#define run1cycle_cb() do { c ^= b; dont_move(c); } while (0)
#define run1cycle_dc() do { d ^= c; dont_move(d); } while (0)
#define run1cycle_ed() do { e ^= d; dont_move(e); } while (0)
#define run1cycle_eb() do { e ^= b; dont_move(e); } while (0)

#define run5cycles()                                    \
	do {                                            \
		run1cycle_ae();                         \
		run1cycle_ba();                         \
		run1cycle_cb();                         \
		run1cycle_dc();                         \
		run1cycle_ed();                         \
	} while (0)

#define run10cycles()                                   \
	do {                                            \
		run5cycles();                           \
		run5cycles();                           \
	} while (0)

#define run100cycles()                                  \
	do {                                            \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
		run10cycles();                          \
	} while (0)

/* performs 50 operations in a loop, all dependant on each other, so that the
 * CPU cannot parallelize them, hoping to take 50 cycles per loop, plus the
 * loop counter overhead. <n> must not be zero since it is decremented before
 * being tested.
 */
static __attribute__((noinline,aligned(64))) void loop50(unsigned int n)
{
	unsigned int a = 0, b = 0, c = 0, d = 0, e = 0;

	do {
		run10cycles();
		run10cycles();
		run10cycles();
		run10cycles();
		run10cycles();
	} while (__builtin_expect(--n, 1));
}

/* performs 250 operations in a loop, all dependant on each other, so that the
 * CPU cannot parallelize them, hoping to take 250 cycles per loop, plus the
 * loop counter overhead. Do not increase this loop so that it fits in a small
 * 1 kB L1 cache on 32-bit instruction sets. <n> must not be zero since it is
 * decremented before being tested.
 */
static __attribute__((noinline,aligned(64))) void loop250(unsigned int n)
{
	unsigned int a = 0, b = 0, c = 0, d = 0, e = 0;

	do {
		run10cycles();
		run10cycles();
		run10cycles();
		run10cycles();
		run10cycles();
		run100cycles();
		run100cycles();
	} while (__builtin_expect(--n, 1));
}

/* one timing result: elapsed microseconds and elapsed TSC ticks */
struct sample {
	long long us;
	long long tsc;
};

/* runs <loop> with <count> iterations SAMPLES times and returns the fastest
 * sample, in order to hide latency spikes. The TSC delta is taken from the
 * same run as the retained duration so that both values describe the same
 * sample.
 */
static struct sample best_of(void (*loop)(unsigned int), long count)
{
	struct sample best = { LLONG_MAX, 0 };
	int i;

	for (i = 0; i < SAMPLES; i++) {
		unsigned long long us_begin = microseconds();
		unsigned long long tsc_begin = rdtsc();
		long long tsc, us;

		loop((unsigned int)count);
		tsc = (long long)(rdtsc() - tsc_begin);
		us = (long long)(microseconds() - us_begin);

		if (us < best.us) {
			best.us = us;
			best.tsc = tsc;
		}
	}
	return best;
}

/* returns count * mul / div, saturated to MAX_COUNT and never overflowing */
static long grow_count(long count, long mul, long div)
{
	long long next;

	if (count > LLONG_MAX / mul)
		return MAX_COUNT;

	next = (long long)count * mul / div;
	return next > MAX_COUNT ? MAX_COUNT : (long)next;
}

/* formats <value> into <dst>, either rounded to the nearest integer (-i) or
 * with 3 decimals. "%.0f" already rounds to nearest, so nothing is added to
 * the value beforehand.
 */
static void format_mhz(char *dst, size_t size, double value)
{
	if (use_ints)
		snprintf(dst, size, "%.0f", value);
	else
		snprintf(dst, size, "%.3f", value);
}

/* performs one full measurement starting with <count> loop iterations and
 * prints one line of results according to the cpu_only/tsc_only/use_ints
 * settings. The count is raised until the short loop lasts long enough, and
 * the count finally used is returned so that the caller can reuse it for the
 * next measurement.
 */
long run_once(long count)
{
	struct sample s50 = { 0, 0 };
	struct sample s250 = { 0, 0 };
	int retries = MAX_RETRIES;
	long long diff_us;
	char mhz[20];

	/* the loops cannot run zero iterations nor more than MAX_COUNT */
	if (count < 1)
		count = DEFAULT_COUNT;
	else if (count > MAX_COUNT)
		count = MAX_COUNT;

	for (;;) {
		/* now run the 50 cycles loop */
		s50 = best_of(loop50, count);

		if (s50.us < MIN_SHORT_US && retries) {
			/* we want at least 20 milliseconds, so let's
			 * raise the count. We double as long as the
			 * duration is < 10ms and raise by 25% next.
			 */
			count = (s50.us < FAST_GROWTH_US) ?
				grow_count(count, 2, 1) : grow_count(count, 5, 4);
			retries--;
			continue;
		}

		/* now run the 250 cycles loop */
		s250 = best_of(loop250, count);

		/* make sure we have a valid measurement */
		if (s250.us != s50.us)
			break;

		/* otherwise we'll do it again waiting twice as long for a few times */
		if (!retries--)
			break;

		count = grow_count(count, 2, 1);
	}

	/* the long loop performs 200 more operations per iteration than the
	 * short one, so operations per microsecond is count * 200 / diff.
	 */
	diff_us = s250.us - s50.us;
	format_mhz(mhz, sizeof(mhz), count * 200.0 / diff_us);

	if (cpu_only) {
		printf("%s\n", mhz);
		return count;
	}

	if (!tsc_only) {
		printf("count=%ld us50=%lld us250=%lld diff=%lld cpu_MHz=%s",
		       count, s50.us, s250.us, diff_us, mhz);
	}

#ifdef HAVE_RDTSC
	/* computed in double: a float cannot represent typical tick counts */
	format_mhz(mhz, sizeof(mhz), (double)(s250.tsc - s50.tsc) / (double)diff_us);

	if (tsc_only) {
		printf("%s\n", mhz);
		return count;
	}

	printf(" tsc50=%lld tsc250=%lld diff=%lld rdtsc_MHz=%s",
	       s50.tsc, s250.tsc, (s250.tsc - s50.tsc) / count, mhz);
#endif
	putchar('\n');
	return count;
}

/* spend <delay> us waiting for the CPU's frequency to raise. Will also stop
 * on backwards time jumps if any (the unsigned difference then becomes huge).
 * Null or negative delays return immediately instead of being turned into a
 * huge unsigned value.
 */
void pre_heat(long delay)
{
	unsigned long long start;

	if (delay <= 0)
		return;

	start = microseconds();
	while (microseconds() - start < (unsigned long long)delay)
		;
}

/* prints the help message on stdout and exits with status 0 */
void usage(const char *name)
{
	printf("Usage: %s [-h|-c|-i%s]* [lines [heat [count]]]\n"
	       " -h show this help\n"
	       " -c show CPU freq only (in MHz)\n"
	       " -i report integral frequencies only\n"
#ifdef HAVE_RDTSC
	       " -t show TSC freq only (in MHz)\n"
#endif
	       " lines number of measurements (one line per measurement). Def: 1\n"
	       " heat pre-heat time in microseconds. Def: 0\n"
	       " count calibration value, higher is slower but more accurate. Def: auto\n"
	       "\n",
	       name,
#ifdef HAVE_RDTSC
	       "|-t"
#else
	       ""
#endif
	       );
	exit(0);
}

/* parses <arg> as a decimal long; shows the usage message and exits when it
 * is not a valid number or does not fit in a long.
 */
static long parse_number(const char *name, const char *arg)
{
	char *end;
	long value;

	errno = 0;
	value = strtol(arg, &end, 10);
	if (end == arg || *end != '\0' || errno == ERANGE)
		usage(name);
	return value;
}

int main(int argc, char **argv)
{
	const char *name = (argc > 0 && argv[0]) ? argv[0] : "mhz";
	long count = 0;
	long heat = 0;
	long runs = 1;

	/* options come first; only the letter following the dash is checked */
	while (argc > 1 && *argv[1] == '-') {
		if (argv[1][1] == 'c')
			cpu_only = 1;
		else if (argv[1][1] == 'i')
			use_ints = 1;
#ifdef HAVE_RDTSC
		else if (argv[1][1] == 't')
			tsc_only = 1;
#endif
		else
			usage(name);
		argc--;
		argv++;
	}

	/* validate all numbers before spending time in the pre-heat loop */
	if (argc > 1)
		runs = parse_number(name, argv[1]);

	if (argc > 2)
		heat = parse_number(name, argv[2]);

	if (argc > 3)
		count = parse_number(name, argv[3]);

	/* default to initial count value of 1000 */
	if (count <= 0)
		count = DEFAULT_COUNT;

	pre_heat(heat);

	while (runs-- > 0)
		count = run_once(count);

	return 0;
}