rr-5.5.0/.clang-format

# BasedOnStyle: Mozilla
AccessModifierOffset: -2
ConstructorInitializerIndentWidth: 4
AlignEscapedNewlinesLeft: false
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakTemplateDeclarations: false
AlwaysBreakBeforeMultilineStrings: false
BreakBeforeBinaryOperators: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BinPackParameters: true
ColumnLimit: 80
ConstructorInitializerAllOnOneLineOrOnePerLine: true
DerivePointerBinding: false
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: true
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 60
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerBindsToType: true
SpacesBeforeTrailingComments: 1
Cpp11BracedListStyle: false
Standard: Cpp11
IndentWidth: 2
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Attach
IndentFunctionDeclarationAfterType: false
SpacesInParentheses: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpaceAfterControlStatementKeyword: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 4

rr-5.5.0/.gitignore

*~
.cproject
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
CPackConfig.cmake
CPackSourceConfig.cmake
_CPack_Packages/
CTestTestfile.cmake
Debug
dist/
install_manifest.txt
Makefile
Profile
obj/
.project
.vscode/
*.log
*.orig
*.rej
*.pyc
*.record
*.replay
.settings/
Testing/
.idea/
.*.swp

rr-5.5.0/CMakeLists.txt

# *-* Mode: cmake; *-*

cmake_minimum_required(VERSION 3.1.0)
project(rr C CXX ASM)

# "Do not add flags to export symbols from executables without the ENABLE_EXPORTS target property."
# This avoids linking executables with -rdynamic. -rdynamic has been observed
# to cause rr_exec_stub to be linked with the dynamic linker with some
# version(s) of clang (but linked to an incorrect file name, causing
# exec of rr_exec_stub to fail).
if(POLICY CMP0065)
  cmake_policy(SET CMP0065 NEW)
endif()

# On single configuration generators, make Debug the default configuration
if(NOT CMAKE_CONFIGURATION_TYPES)
  if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Debug" CACHE STRING
        "Whether to build in `Debug` or `Release` mode." FORCE)
  endif()
endif()

enable_testing()
set(BUILD_SHARED_LIBS ON)

set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib/rr)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(BUILD_TESTS ON CACHE BOOL "Build tests")
set(WILL_RUN_TESTS ${BUILD_TESTS} CACHE BOOL "Run tests")
option(INSTALL_TESTSUITE "Install the testsuite")

# CAREFUL!  "-" is an invalid character in RPM package names, while
# debian is happy with it. However, "_" is illegal in debs, while RPM
# is cool with it. Sigh.
set(rr_VERSION_MAJOR 5)
set(rr_VERSION_MINOR 5)
set(rr_VERSION_PATCH 0)
add_definitions(-DRR_VERSION="${rr_VERSION_MAJOR}.${rr_VERSION_MINOR}.${rr_VERSION_PATCH}")

execute_process(
  COMMAND git rev-parse HEAD
  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
  OUTPUT_VARIABLE GIT_REVISION
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(
  ${CMAKE_SOURCE_DIR}/src/git_revision.h.in
  ${CMAKE_BINARY_DIR}/git_revision.h
)

set(FLAGS_COMMON "-D__USE_LARGEFILE64 -pthread")
set(supports32bit true)
set(x86ish false)
if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
  set(supports32bit false)
  set(FLAGS_COMMON "${FLAGS_COMMON} -march=armv8.3-a -Wl,-dynamic-linker=/lib/aarch64-linux-gnu/atomics/ld-linux-aarch64.so.1")
else()
  set(x86ish true)
  set(FLAGS_COMMON "${FLAGS_COMMON} -msse2 -D__MMX__ -D__SSE__ -D__SSE2__")
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS_COMMON} -Wstrict-prototypes -std=gnu11")

# Define __STDC_LIMIT_MACROS so |#include <stdint.h>| works as expected.
# Define __STDC_FORMAT_MACROS so |#include <inttypes.h>| works as expected.
include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-std=c++14" SUPPORTS_CXX14)
if (SUPPORTS_CXX14)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_COMMON} -D__STDC_LIMIT_MACROS -D__STDC_FORMAT_MACROS -std=c++14")
else()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_COMMON} -D__STDC_LIMIT_MACROS -D__STDC_FORMAT_MACROS -std=c++11")
endif()

# We support three build types:
# DEBUG: suitable for debugging rr
# RELEASE: suitable for using rr in production (but keeps rr debuginfo)
# OTHER: suitable for using rr in production, but honouring distro/user opt/debug settings
#        (which we assume are suitable for production use)

# Base settings for debug and release/unspecified builds.
# Use -Werror for debug builds because we assume a developer is building, not a user.
set(RR_FLAGS_DEBUG "-Wall -Wextra -DDEBUG -UNDEBUG")
set(RR_FLAGS_RELEASE "-Wall -Wextra -UDEBUG -DNDEBUG")

# The following settings are the defaults for the OTHER build type.
# Flags used to build the preload library. MUST have debuginfo enabled. SHOULD be optimized.
set(PRELOAD_COMPILE_FLAGS "${RR_FLAGS_RELEASE} -fno-stack-protector -g3 -U_FORTIFY_SOURCE")
# Flags used to build Brotli. SHOULD be optimized. MUST NOT error on warnings.
set(BROTLI_COMPILE_FLAGS ${RR_FLAGS_RELEASE})
# Flags used to build tests. MUST have -DDEBUG and debuginfo enabled, MUST NOT be optimized.
set(RR_TEST_FLAGS "${RR_FLAGS_DEBUG} -g3 -O0")
# Flags used to build other files. Entirely build-type-dependent.
set(RR_FLAGS ${RR_FLAGS_RELEASE})
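# Illustration only (not part of the upstream build file): with these
# defaults, a typical out-of-tree configure-and-build for each mode would
# look something like
#   cmake -DCMAKE_BUILD_TYPE=Release /path/to/rr && make
#   cmake -DCMAKE_BUILD_TYPE=Debug -Ddisable32bit=ON /path/to/rr && make
# CMAKE_BUILD_TYPE and disable32bit are the options defined in this file;
# the exact invocation above is an assumption, not taken from rr's docs.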
# Now override for build type.
string(TOLOWER ${CMAKE_BUILD_TYPE} LOWERCASE_CMAKE_BUILD_TYPE)
if(LOWERCASE_CMAKE_BUILD_TYPE STREQUAL "debug")
  set(PRELOAD_COMPILE_FLAGS "${PRELOAD_COMPILE_FLAGS} -O2 -Werror")
  set(BROTLI_COMPILE_FLAGS "${RR_FLAGS_RELEASE} -O2")
  set(RR_TEST_FLAGS "${RR_TEST_FLAGS} -Werror")
  set(RR_FLAGS "${RR_FLAGS_DEBUG} -g3 -Werror")
elseif(LOWERCASE_CMAKE_BUILD_TYPE STREQUAL "release")
  # CMake itself will add optimization flags
  set(RR_FLAGS "${RR_FLAGS_RELEASE} -g3 -flto")
endif()

set(LINKER_FLAGS "")
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
  # Gcc generates bogus R_386_GOTOFF relocations in .debug_info which
  # lld 9 rejects
  set(LINKER_FLAGS "-fuse-ld=bfd")
endif()

if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-command-line-argument")
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-command-line-argument")
endif()
if (CMAKE_ASM_COMPILER_ID STREQUAL "Clang")
  set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -fno-integrated-as")
endif()

option(force32bit "Force a 32-bit rr build, rather than both 64 and 32-bit. rr will only be able to record and replay 32-bit processes.")
option(disable32bit "On a 64-bit platform, avoid requiring a 32-bit cross-compilation toolchain by not building 32-bit components. rr will be able to record 32-bit processes but not replay them.")

if(force32bit)
  set(rr_32BIT true)
  set(rr_64BIT false)
  set(rr_MBITNESS_OPTION -m32)
else()
  if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    if(disable32bit OR NOT supports32bit)
      set(rr_32BIT false)
    else()
      set(rr_32BIT true)
    endif()
    set(rr_64BIT true)
  else()
    set(rr_32BIT true)
    set(rr_64BIT false)
  endif()
  set(rr_MBITNESS_OPTION)
endif()

option(staticlibs "Force usage of static linkage for non-standard libraries like capnproto")

# Check that compiling 32-bit code on a 64-bit target works, if required.
if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" AND rr_32BIT)
  # try_compile won't accept LINK_FLAGS, so do this manually.
  file(WRITE "${CMAKE_BINARY_DIR}/test32.c" "int main() { return 0; }")
  execute_process(COMMAND ${CMAKE_C_COMPILER} -o ${CMAKE_BINARY_DIR}/test32 ${CMAKE_BINARY_DIR}/test32.c -m32
                  RESULT_VARIABLE COMPILER_32BIT_RESULT)
  if(NOT (COMPILER_32BIT_RESULT EQUAL 0))
    message(FATAL_ERROR "Your toolchain doesn't support 32-bit cross-compilation. Install the required packages or pass -Ddisable32bit=ON to cmake.")
  endif()
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${rr_MBITNESS_OPTION}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${rr_MBITNESS_OPTION}")
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${rr_MBITNESS_OPTION}")

find_package(PkgConfig REQUIRED)

# If we're cross-compiling a 32-bit rr build on a 64-bit host we need
# to ensure we're looking for the right libraries.
# This has been tested on Ubuntu and Fedora.
if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" AND NOT rr_64BIT)
  set(LIBDIR32_CANDIDATES
    /usr/lib/i386-linux-gnu/pkgconfig/
    /usr/lib/pkgconfig/
  )
  foreach(libdir ${LIBDIR32_CANDIDATES})
    if(IS_DIRECTORY ${libdir})
      set(ENV{PKG_CONFIG_LIBDIR} ${libdir})
      break()
    endif()
  endforeach(libdir)
  if(NOT DEFINED ENV{PKG_CONFIG_LIBDIR})
    message(FATAL_ERROR "Couldn't find a suitable 32-bit pkgconfig lib dir. You probably need to install a 32-bit pkgconfig package (pkgconfig.i686 for Fedora or pkg-config:i386 for Ubuntu).")
  endif()
endif()

find_program(CAPNP capnp)
if(${CAPNP} STREQUAL "CAPNP-NOTFOUND")
  message(FATAL_ERROR "Can't find 'capnp' command; install Capnproto packages?
https://github.com/rr-debugger/rr/wiki/Building-And-Installing#tldr") endif() set(REQUIRED_LIBS capnp ) foreach(required_lib ${REQUIRED_LIBS}) string(TOUPPER ${required_lib} PKG) pkg_check_modules(${PKG} REQUIRED ${required_lib}) if(staticlibs) string(REPLACE ";" " " ${PKG}_STATIC_CFLAGS "${${PKG}_STATIC_CFLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${${PKG}_STATIC_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${${PKG}_STATIC_CFLAGS}") else() string(REPLACE ";" " " ${PKG}_CFLAGS "${${PKG}_CFLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${${PKG}_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${${PKG}_CFLAGS}") endif() endforeach(required_lib) # ==== brotli ==== set(BROTLI_FILES third-party/brotli/common/dictionary.c third-party/brotli/common/transform.c third-party/brotli/dec/bit_reader.c third-party/brotli/dec/decode.c third-party/brotli/dec/huffman.c third-party/brotli/dec/state.c third-party/brotli/enc/backward_references.c third-party/brotli/enc/backward_references.h third-party/brotli/enc/backward_references_hq.c third-party/brotli/enc/backward_references_hq.h third-party/brotli/enc/backward_references_inc.h third-party/brotli/enc/bit_cost.c third-party/brotli/enc/bit_cost.h third-party/brotli/enc/bit_cost_inc.h third-party/brotli/enc/block_encoder_inc.h third-party/brotli/enc/block_splitter.c third-party/brotli/enc/block_splitter.h third-party/brotli/enc/block_splitter_inc.h third-party/brotli/enc/brotli_bit_stream.c third-party/brotli/enc/brotli_bit_stream.h third-party/brotli/enc/cluster.c third-party/brotli/enc/cluster.h third-party/brotli/enc/cluster_inc.h third-party/brotli/enc/command.h third-party/brotli/enc/compress_fragment.c third-party/brotli/enc/compress_fragment.h third-party/brotli/enc/compress_fragment_two_pass.c third-party/brotli/enc/compress_fragment_two_pass.h third-party/brotli/enc/context.h third-party/brotli/enc/dictionary_hash.c third-party/brotli/enc/dictionary_hash.h third-party/brotli/enc/encode.c third-party/brotli/enc/encoder_dict.c third-party/brotli/enc/entropy_encode.c third-party/brotli/enc/entropy_encode.h third-party/brotli/enc/entropy_encode_static.h third-party/brotli/enc/fast_log.h third-party/brotli/enc/find_match_length.h third-party/brotli/enc/hash_forgetful_chain_inc.h third-party/brotli/enc/hash.h third-party/brotli/enc/hash_longest_match64_inc.h third-party/brotli/enc/hash_longest_match_inc.h third-party/brotli/enc/hash_longest_match_quickly_inc.h third-party/brotli/enc/hash_to_binary_tree_inc.h third-party/brotli/enc/histogram.c third-party/brotli/enc/histogram.h third-party/brotli/enc/histogram_inc.h third-party/brotli/enc/literal_cost.c third-party/brotli/enc/literal_cost.h third-party/brotli/enc/memory.c third-party/brotli/enc/memory.h third-party/brotli/enc/metablock.c third-party/brotli/enc/metablock.h third-party/brotli/enc/metablock_inc.h third-party/brotli/enc/port.h third-party/brotli/enc/prefix.h third-party/brotli/enc/quality.h third-party/brotli/enc/ringbuffer.h third-party/brotli/enc/static_dict.c third-party/brotli/enc/static_dict.h third-party/brotli/enc/static_dict_lut.h third-party/brotli/enc/utf8_util.c third-party/brotli/enc/utf8_util.h third-party/brotli/enc/write_bits.h ) add_library(brotli STATIC ${BROTLI_FILES}) set_source_files_properties(${BROTLI_FILES} PROPERTIES COMPILE_FLAGS ${BROTLI_COMPILE_FLAGS}) # ==== brotli ==== find_path(SECCOMP NAMES "linux/seccomp.h") if(NOT SECCOMP) message(FATAL_ERROR "Couldn't find linux/seccomp.h. 
You may need to upgrade your kernel.") endif() set(Python_ADDITIONAL_VERSIONS 3 3.8 3.7 3.6 3.5 3.4 3.3 3.2 3.1 3.0) find_package(PythonInterp 3 REQUIRED) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" "# nothing" RESULT_VARIABLE python_status) if(python_status) message(FATAL_ERROR "Couldn't run python interpreter ${PYTHON_EXECUTABLE}.") endif() # Check for required Python modules if(WILL_RUN_TESTS) if(NOT BUILD_TESTS) message(FATAL_ERROR "Running tests requires building them") endif() set(REQUIRED_PYTHON_MODULES pexpect ) else() set(REQUIRED_PYTHON_MODULES) endif() foreach(py_module ${REQUIRED_PYTHON_MODULES}) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" "import ${py_module}" RESULT_VARIABLE module_status) if(module_status) message(FATAL_ERROR "Couldn't find required Python module ${py_module}.") endif() endforeach(py_module) if(WILL_RUN_TESTS) # Check for gdb execute_process(COMMAND "gdb" "--version" RESULT_VARIABLE module_status OUTPUT_QUIET) if(module_status) message(FATAL_ERROR "Couldn't find gdb.") endif() endif() include_directories("${PROJECT_SOURCE_DIR}/include") include_directories("${PROJECT_SOURCE_DIR}/third-party/proc-service") include_directories("${PROJECT_SOURCE_DIR}/third-party/brotli/include") # We need to know where our generated files are. include_directories("${CMAKE_CURRENT_BINARY_DIR}") set(RR_PAGE_FILES rr_page.S ) set(RR_PAGE_SOURCE_FILES ${RR_PAGE_FILES} rr_page_instructions.S rr_vdso.S rr_page.ld ) add_library(rrpage) foreach(file ${RR_PAGE_FILES}) target_sources(rrpage PUBLIC "${CMAKE_SOURCE_DIR}/src/preload/${file}") set_source_files_properties("${CMAKE_SOURCE_DIR}/src/preload/${file}" PROPERTIES COMPILE_FLAGS ${PRELOAD_COMPILE_FLAGS}) endforeach(file) # Since librrpage replaces the kernel vDSO for processes exec'd by rr, # we want it to have the same SONAME as the real vDSO to trick things # like AddressSanitizer into recognising it as the vDSO. set_target_properties(rrpage PROPERTIES NO_SONAME ON) set_target_properties(rrpage PROPERTIES LINK_FLAGS "-Wl,-T -Wl,${CMAKE_SOURCE_DIR}/src/preload/rr_page.ld -Wl,--hash-style=both -nostartfiles -nostdlib -Wl,-z,max-page-size=0x1000 -Wl,-soname,linux-vdso.so.1 ${LINKER_FLAGS}") set_target_properties(rrpage PROPERTIES LINK_DEPENDS ${CMAKE_SOURCE_DIR}/src/preload/rr_page.ld) # CMake seems to have trouble generating the link line without this set_target_properties(rrpage PROPERTIES LINKER_LANGUAGE C) add_custom_command(TARGET rrpage POST_BUILD COMMAND ${CMAKE_SOURCE_DIR}/src/preload/tweak_librrpage.py $) # Order matters here! syscall_hook.S must be immediately before syscallbuf.c, # raw_syscall.S must be before overrides.c, which must be last. 
set(PRELOAD_FILES syscall_hook.S syscallbuf.c raw_syscall.S overrides.c ) set(PRELOAD_SOURCE_FILES ${PRELOAD_FILES} preload_interface.h rrcalls.h syscallbuf.h ) if (x86ish) add_library(rrpreload) foreach(file ${PRELOAD_FILES}) target_sources(rrpreload PUBLIC "${CMAKE_SOURCE_DIR}/src/preload/${file}") set_source_files_properties("${CMAKE_SOURCE_DIR}/src/preload/${file}" PROPERTIES COMPILE_FLAGS ${PRELOAD_COMPILE_FLAGS}) endforeach(file) set_target_properties(rrpreload PROPERTIES LINK_FLAGS "-nostartfiles ${LINKER_FLAGS}") set_target_properties(rrpreload PROPERTIES INSTALL_RPATH "\$ORIGIN") endif() set(AUDIT_FILES rtld-audit.c stap-note-iter.c ../preload/raw_syscall.S ) set(AUDIT_SOURCE_FILES ${AUDIT_FILES} rtld-audit.h stap-note-iter.h ../preload/preload_interface.h ../preload/rrcalls.h ) add_library(rraudit) foreach(file ${AUDIT_FILES}) target_sources(rraudit PUBLIC "${CMAKE_SOURCE_DIR}/src/audit/${file}") set_source_files_properties("${CMAKE_SOURCE_DIR}/src/audit/${file}" PROPERTIES COMPILE_FLAGS ${PRELOAD_COMPILE_FLAGS}) endforeach(file) set_target_properties(rraudit PROPERTIES LINK_FLAGS "-nostartfiles -ldl ${LINKER_FLAGS}") # Ensure that CMake knows about our generated files. # # Alphabetical, please. set(GENERATED_FILES AssemblyTemplates.generated CheckSyscallNumbers.generated SyscallEnumsX64.generated SyscallEnumsX86.generated SyscallEnumsGeneric.generated SyscallEnumsForTestsX64.generated SyscallEnumsForTestsX86.generated SyscallEnumsForTestsGeneric.generated SyscallHelperFunctions.generated SyscallnameArch.generated SyscallRecordCase.generated ) foreach(generated_file ${GENERATED_FILES}) set_source_files_properties(${generated_file} PROPERTIES GENERATED true HEADER_FILE_ONLY true) add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${generated_file}" COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/src/generate_syscalls.py" "${CMAKE_CURRENT_BINARY_DIR}/${generated_file}" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/generate_syscalls.py" "${CMAKE_CURRENT_SOURCE_DIR}/src/syscalls.py" "${CMAKE_CURRENT_SOURCE_DIR}/src/assembly_templates.py") endforeach(generated_file) add_custom_target(Generated DEPENDS ${GENERATED_FILES}) add_custom_command(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++" "${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.h" COMMAND capnp compile "--src-prefix=${CMAKE_CURRENT_SOURCE_DIR}/src" "-oc++:${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/src/rr_trace.capnp" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/rr_trace.capnp") set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++" PROPERTIES GENERATED true) set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.h" PROPERTIES GENERATED true HEADER_FILE_ONLY true) if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") set(BLAKE_ARCH_DIR third-party/blake2/neon) else() set(BLAKE_ARCH_DIR third-party/blake2/sse) endif() set(RR_SOURCES src/AddressSpace.cc src/AutoRemoteSyscalls.cc src/BuildidCommand.cc src/Command.cc src/CompressedReader.cc src/CompressedWriter.cc src/CPUFeaturesCommand.cc src/CPUIDBugDetector.cc src/DiversionSession.cc src/DumpCommand.cc src/Dwarf.cc src/ElfReader.cc src/EmuFs.cc src/Event.cc src/ExtraRegisters.cc src/fast_forward.cc src/FdTable.cc src/FileMonitor.cc src/FileNameCommand.cc src/Flags.cc src/ftrace.cc src/GdbCommand.cc src/GdbCommandHandler.cc src/GdbConnection.cc src/GdbExpression.cc src/GdbInitCommand.cc src/GdbServer.cc src/HasTaskSet.cc src/HelpCommand.cc src/kernel_abi.cc src/kernel_metadata.cc src/log.cc src/LsCommand.cc src/MagicSaveDataMonitor.cc 
src/MmappedFileMonitor.cc src/MonitoredSharedMemory.cc src/Monkeypatcher.cc src/PackCommand.cc src/PerfCounters.cc src/ProcFdDirMonitor.cc src/ProcMemMonitor.cc src/ProcStatMonitor.cc src/PsCommand.cc src/RecordCommand.cc src/RecordSession.cc src/record_signal.cc src/record_syscall.cc src/RecordTask.cc src/Registers.cc src/remote_code_ptr.cc src/ReplayCommand.cc src/ReplaySession.cc src/replay_syscall.cc src/ReplayTask.cc src/ReplayTimeline.cc src/RerunCommand.cc src/ReturnAddressList.cc src/Scheduler.cc src/SeccompFilterRewriter.cc src/Session.cc src/SourcesCommand.cc src/StdioMonitor.cc src/SysCpuMonitor.cc src/Task.cc src/ThreadGroup.cc src/ThreadDb.cc src/TraceFrame.cc src/TraceInfoCommand.cc src/TraceStream.cc src/VirtualPerfCounterMonitor.cc src/util.cc src/WaitStatus.cc ${CMAKE_CURRENT_BINARY_DIR}/rr_trace.capnp.c++ ${BLAKE_ARCH_DIR}/blake2b.c ) if (x86ish) set(RR_SOURCES ${RR_SOURCES} src/test/x86/cpuid_loop.S) endif() set_source_files_properties(${RR_SOURCES} PROPERTIES COMPILE_FLAGS ${RR_FLAGS}) function(post_build_executable target) # grsecurity needs these. But if we add them ourselves, they may conflict # with other flags added in other ways, and they all have to match :-(. So # don't do this until a better solution presents itself # add_custom_command(TARGET ${target} # POST_BUILD # COMMAND setfattr ARGS -n user.pax.flags -v m $) endfunction(post_build_executable) if(UNIX) include(GNUInstallDirs) else() set(CMAKE_INSTALL_LIBDIR "lib") set(CMAKE_INSTALL_BINDIR "bin") set(CMAKE_INSTALL_DATADIR "share") set(CMAKE_INSTALL_DOCDIR "${CMAKE_INSTALL_DATADIR}/doc") set(CMAKE_INSTALL_INCLUDEDIR "include") endif() add_executable(rr ${RR_SOURCES} src/main.cc) set_target_properties(rr PROPERTIES ENABLE_EXPORTS true) post_build_executable(rr) set(RR_BIN rr) add_dependencies(rr Generated) option(strip "Strip debug info from rr binary") set(RR_MAIN_LINKER_FLAGS ${LINKER_FLAGS}) if(strip) set(RR_MAIN_LINKER_FLAGS "-s ${RR_MAIN_LINKER_FLAGS}") endif() target_link_libraries(rr ${CMAKE_DL_LIBS} -lrt brotli ) if(staticlibs) # Urgh ... 
# this might not work for everyone, but there doesn't seem to be
# a way to persuade pkg-config/pkg_check_modules to produce the right flags
  target_link_libraries(rr -L/home/roc/lib -l:libcapnp.a -l:libkj.a)
  # Note that this works for both clang++ and g++
  set(RR_MAIN_LINKER_FLAGS "-static-libstdc++ ${RR_MAIN_LINKER_FLAGS}")
else()
  target_link_libraries(rr ${CAPNP_LDFLAGS})
endif()
set_target_properties(rr PROPERTIES LINK_FLAGS "${RR_MAIN_LINKER_FLAGS}")

if (x86ish)
  target_link_libraries(rrpreload
    ${CMAKE_DL_LIBS}
  )
endif()

add_executable(rr_exec_stub src/exec_stub.c)
post_build_executable(rr_exec_stub)
set_target_properties(rr_exec_stub PROPERTIES LINK_FLAGS
                      "-static -nostartfiles -nodefaultlibs ${LINKER_FLAGS}")
set_source_files_properties(src/exec_stub.c
                            COMPILE_FLAGS "-fno-stack-protector")

set(RR_GDB_RESOURCES
  32bit-avx.xml
  32bit-core.xml
  32bit-linux.xml
  32bit-sse.xml
  64bit-avx.xml
  64bit-core.xml
  64bit-linux.xml
  64bit-seg.xml
  64bit-sse.xml
  amd64-avx-linux.xml
  amd64-linux.xml
  i386-avx-linux.xml
  i386-linux.xml
  aarch64-core.xml
  aarch64-fpu.xml
  aarch64-pauth.xml
)
foreach(file ${RR_GDB_RESOURCES})
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/third-party/gdb/${file}"
                 "${CMAKE_CURRENT_BINARY_DIR}/share/rr/${file}" COPYONLY)
  install(FILES third-party/gdb/${file}
          DESTINATION ${CMAKE_INSTALL_DATADIR}/rr)
endforeach(file)

foreach(file ${PRELOAD_SOURCE_FILES})
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/preload/${file}"
                 "${CMAKE_CURRENT_BINARY_DIR}/share/rr/src/preload/${file}" COPYONLY)
  install(FILES src/preload/${file}
          DESTINATION ${CMAKE_INSTALL_DATADIR}/rr/src/preload)
endforeach(file)

foreach(file ${RR_PAGE_SOURCE_FILES})
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/preload/${file}"
                 "${CMAKE_CURRENT_BINARY_DIR}/share/rr/src/preload/${file}" COPYONLY)
  install(FILES src/preload/${file}
          DESTINATION ${CMAKE_INSTALL_DATADIR}/rr/src/preload)
endforeach(file)

configure_file("${CMAKE_CURRENT_SOURCE_DIR}/scripts/rr-collect-symbols.py"
               "${CMAKE_CURRENT_BINARY_DIR}/bin/rr-collect-symbols.py" COPYONLY)

install(PROGRAMS scripts/signal-rr-recording.sh scripts/rr-collect-symbols.py
        DESTINATION ${CMAKE_INSTALL_BINDIR})
install(PROGRAMS scripts/rr_completion
        DESTINATION ${CMAKE_INSTALL_DATADIR}/bash-completion/completions
        RENAME rr)

set(RR_INSTALL_LIBS rrpage rraudit rr_exec_stub)
if (x86ish)
  set(RR_INSTALL_LIBS rrpreload ${RR_INSTALL_LIBS})
endif()
install(TARGETS ${RR_BIN} ${RR_INSTALL_LIBS}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr)

# Build 32-bit librrpreload and librraudit on 64-bit builds.
# We copy the source files into '32' subdirectories in the output
# directory, so we can set different compile options on them.
# This sucks but I can't find a better way to get CMake to build
# the same source file in two different ways.
if(rr_32BIT AND rr_64BIT) add_library(rrpage_32) foreach(file ${RR_PAGE_SOURCE_FILES}) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/preload/${file}" "${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}" COPYONLY) endforeach(file) foreach(file ${RR_PAGE_FILES}) target_sources(rrpage_32 PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}") set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}" PROPERTIES COMPILE_FLAGS "-m32 ${PRELOAD_COMPILE_FLAGS}") endforeach(file) set_target_properties(rrpage_32 PROPERTIES NO_SONAME ON) set_target_properties(rrpage_32 PROPERTIES LINK_FLAGS "-m32 -Wl,-T -Wl,${CMAKE_SOURCE_DIR}/src/preload/rr_page.ld -Wl,--hash-style=both -nostartfiles -nostdlib -Wl,-soname,linux-vdso.so.1 ${LINKER_FLAGS}") set_target_properties(rrpage_32 PROPERTIES LINK_DEPENDS ${CMAKE_SOURCE_DIR}/src/preload/rr_page.ld) set_target_properties(rrpage_32 PROPERTIES LINKER_LANGUAGE C) add_custom_command(TARGET rrpage_32 POST_BUILD COMMAND ${CMAKE_SOURCE_DIR}/src/preload/tweak_librrpage.py $) add_library(rrpreload_32) foreach(file ${PRELOAD_SOURCE_FILES}) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/preload/${file}" "${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}" COPYONLY) endforeach(file) foreach(file ${PRELOAD_FILES}) target_sources(rrpreload_32 PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}") set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/preload/${file}" PROPERTIES COMPILE_FLAGS "-m32 ${PRELOAD_COMPILE_FLAGS}") endforeach(file) set_target_properties(rrpreload_32 PROPERTIES LINK_FLAGS "-m32 -nostartfiles ${LINKER_FLAGS}") set_target_properties(rrpreload_32 PROPERTIES INSTALL_RPATH "\$ORIGIN") target_link_libraries(rrpreload_32 ${CMAKE_DL_LIBS} ) add_library(rraudit_32) foreach(file ${AUDIT_SOURCE_FILES}) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/audit/${file}" "${CMAKE_CURRENT_BINARY_DIR}/32/audit/${file}" COPYONLY) endforeach(file) foreach(file ${AUDIT_FILES}) target_sources(rraudit_32 PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/32/audit/${file}") set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/audit/${file}" PROPERTIES COMPILE_FLAGS "-m32 ${PRELOAD_COMPILE_FLAGS}") endforeach(file) set_target_properties(rraudit_32 PROPERTIES LINK_FLAGS "-m32 -nostartfiles ${LINKER_FLAGS}") target_link_libraries(rraudit_32 ${CMAKE_DL_LIBS} ) foreach(file exec_stub.c) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/${file}" "${CMAKE_CURRENT_BINARY_DIR}/32/${file}" COPYONLY) set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/${file}" PROPERTIES COMPILE_FLAGS "-m32 -fno-stack-protector") endforeach(file) add_executable(rr_exec_stub_32 32/exec_stub.c) post_build_executable(rr_exec_stub_32) set_target_properties(rr_exec_stub_32 PROPERTIES LINK_FLAGS "-static -nostartfiles -nodefaultlibs -m32 ${LINKER_FLAGS}") install(TARGETS rrpreload_32 rrpage_32 rraudit_32 rr_exec_stub_32 RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr) endif() ##-------------------------------------------------- ## Testing # A "basic test" consists of a foo.c source file. All basic tests use the # same basic_test.run driver script. The test name is passed as an additional # parameter to the driver script. This script just does # "compare_test EXIT-SUCCESS", i.e. records and replays the program and verifies # that the output of both runs is identical and contains EXIT-SUCCESS. # # NB: you must update this variable when adding a new test source # file. 
The list is not generated automatically. # # Alphabetical, please. set(BASIC_TESTS 64bit_child _llseek abort accept acct adjtimex aio alarm alarm2 alsa_ioctl arch_prctl async_segv_ignored at_threadexit bad_ip bad_syscall barrier big_buffers block block_open bpf brk brk2 capget chew_cpu x86/chew_cpu_cpuid chmod chown clock clock_adjtime clock_nanosleep clock_time64 clone clone_bad_stack clone_bad_tls clone_cleartid_coredump clone_fail clone_file_range clone_immediate_exit clone_newflags clone_parent clone_untraced clone_vfork_pidfd cloned_sigmask constructor copy_file_range x86/cpuid_same_state creat_address_not_truncated x86/cross_arch cwd_inaccessible daemon desched_blocking_poll detach_state detach_threads deterministic_sigsys dev_zero direct dup doublesegv epoll_create epoll_create1 epoll_pwait_eintr_sigmask eventfd exec_flags exec_no_env exec_self exec_from_main_thread exec_from_other_thread exec_stopsig exit_with_syscallbuf_signal fadvise fanotify fatal_init_signal fatal_sigsegv_thread x86/fault_in_code_page fcntl_dupfd fcntl_misc fcntl_notify fcntl_owner_ex fcntl_rw_hints fcntl_seals fcntl_sig fd_cleanup fd_tracking_across_threads fds_clean flock flock_ofd flock2 fork_brk fork_child_crash fork_many futex_exit_race futex_exit_race_sigsegv futex_pi futex_priorities futex_requeue gcrypt_rdrand getcpu getgroups getpwnam getrandom setitimer getsid gettimeofday grandchild_threads grandchild_threads_main_running grandchild_threads_thread_running grandchild_threads_parent_alive x86/hle inotify int3 intr_futex_wait_restart intr_poll intr_ppoll intr_pselect intr_read_no_restart intr_read_restart intr_sleep intr_sleep_no_restart invalid_exec invalid_fcntl invalid_ioctl io io_uring ioctl ioctl_blk ioctl_fb ioctl_fs ioctl_pty ioctl_sg ioctl_tty ioctl_vt ioprio x86/ioperm x86/iopl join_threads joystick kcmp keyctl kill_newborn kill_ptracee large_hole large_write_deadlock legacy_ugid x86/lsl madvise madvise_free madvise_wipeonfork map_fixed map_shared_syscall membarrier memfd_create memfd_create_shared memfd_create_shared_huge mincore mknod mlock mmap_adjacent_to_rr_usage mmap_private mmap_private_grow_under_map mmap_ro mmap_self_maps_shared mmap_shared mmap_shared_dev_zero mmap_shared_grow mmap_shared_grow_under_map mmap_shared_multiple mmap_shared_subpage mmap_shared_write mmap_shared_write_fork mmap_short_file mmap_write_complex mmap_zero_size_fd x86/modify_ldt mount_ns_exec mount_ns_exec2 mprotect mprotect_heterogenous mprotect_none mprotect_stack mq mremap mremap_after_coalesce mremap_grow mremap_grow_shared mremap_non_page_size mremap_overwrite mremap_private_grow_under_map mremap_shrink msg msg_trunc msync mtio multiple_pending_signals multiple_pending_signals_sequential munmap_segv munmap_discontinuous nanosleep netfilter netlink_mmap_disable no_mask_timeslice nscd numa x86/old_fork orphan_process packet_mmap_disable pause perf_event personality pid_ns_kill_child pid_ns_kill_child_threads pid_ns_kill_child_zombie pid_ns_kill_threads pid_ns_kill_threads_exit_wait pid_ns_reap pid_ns_segv pid_ns_shutdown pidfd x86/pkeys poll_sig_race ppoll prctl prctl_caps prctl_deathsig prctl_name prctl_short_name prctl_speculation_ctrl x86/prctl_tsc privileged_net_ioctl proc_fds proc_mem protect_rr_fds prw pthread_condvar_locking pthread_mutex_timedlock pthread_pi_mutex pthread_rwlocks x86/ptrace ptrace_attach_null_status ptrace_attach_running ptrace_attach_sleeping ptrace_attach_stopped ptrace_attach_thread_running ptrace_breakpoint ptrace_change_patched_syscall x86/ptrace_debug_regs ptrace_exec 
x86/ptrace_exec32 ptrace_kill_grandtracee x86/ptrace_tls ptrace_seize ptrace_sigchld_blocked ptrace_signals ptrace_singlestep ptrace_syscall ptrace_syscall_clone_untraced x86/ptrace_sysemu ptrace_sysemu_syscall ptrace_trace_clone ptrace_trace_exit ptrace_traceme ptracer_death ptracer_death_multithread ptracer_death_multithread_peer # pivot_root ... disabled because it fails when run as root and does nothing otherwise quotactl x86/rdtsc read_nothing readdir read_large read_oversize readlink readlinkat readv record_replay_subject recvfrom redzone_integrity rename rlimit robust_futex rseq rusage samask save_data_fd sched_attr sched_setaffinity sched_setparam sched_yield sched_yield_to_lower_priority scm_rights scratch_read seccomp seccomp_cloning seccomp_clone_fail seccomp_desched seccomp_kill_exit seccomp_null seccomp_sigsys_args seccomp_sigsys_sigtrap seccomp_sigsys_syscallbuf seccomp_tsync seccomp_veto_exec self_shebang self_sigint sem send_block sendfile set_ptracer set_tid_address setgid setgroups setsid setuid shared_exec shared_monitor shared_offset shared_write shm shm_unmap sigaction_old sigaltstack sigchld_interrupt_signal sigcont sighandler_bad_rsp_sigsegv sighandler_fork sighandler_mask sigill signal_deferred signal_during_preload_init signal_frame signal_unstoppable signalfd sigprocmask sigprocmask_ensure_delivery sigprocmask_exec sigprocmask_evil sigprocmask_in_syscallbuf_sighandler sigprocmask_rr_sigs sigprocmask_syscallbuf sigqueueinfo x86/sigreturn sigreturn_reg sigreturnmask sigrt sigstop sigstop2 sigsuspend sigtrap simple_threads_stress sioc small_holes sock_names_opts spinlock_priorities splice stack_growth_after_syscallbuf stack_growth_syscallbuf stack_growth_with_guard stack_invalid stack_overflow stack_overflow_altstack stack_overflow_with_guard statfs statx stdout_child stdout_cloexec stdout_dup stdout_redirect switch_read symlink sync sync_file_range syscall_bp syscall_in_writable_mem syscallbuf_signal_reset syscallbuf_signal_blocking syscallbuf_sigstop syscallbuf_timeslice syscallbuf_timeslice2 sysconf sysctl sysemu_singlestep x86/sysfs sysinfo tgkill thread_yield timer timerfd times truncate_temp tun two_signals_with_mask ulimit_low uname unexpected_exit unexpected_exit_execve unexpected_exit_execve_twice unexpected_exit_pid_ns unjoined_thread unshare userfaultfd utimes vdso_parts vfork_flush vfork_shared video_capture vm_readv_writev vsyscall vsyscall_timeslice x86/x87env wait wait_sigstop write_race writev xattr zero_length_read ) set(BASIC_CPP_TESTS std_random unwind_rr_page ) # A "test with program" consists of a foo.c source file and a foo.run driver # script. See src/test/util.sh to learn how the .run files work. # # NB: you must update this variable when adding a new test source # file. The list is not generated automatically. # # Alphabetical, please. 
set(TESTS_WITH_PROGRAM abort_nonmain alternate_thread_diversion args async_kill_with_threads async_kill_with_threads_main_running async_kill_with_threads_thread_running async_segv async_signal_syscalls async_signal_syscalls2 async_signal_syscalls_siginfo async_usr1 blacklist block_clone_checkpoint block_clone_interrupted block_clone_syscallbuf_overflow block_intr_sigchld blocked_bad_ip blocked_sigill x86/blocked_sigsegv breakpoint breakpoint_conditions breakpoint_overlap call_function call_gettid # Disabled because it's very slow # check_session_leaks checkpoint_dying_threads checkpoint_mixed_mode checksum_sanity check_lost_interrupts clone_interruption # Disabled because it fails # clone_share_vm clone_vfork conditional_breakpoint_calls conditional_breakpoint_offload condvar_stress cont_race x86/cpuid_singlestep crash crash_in_function daemon_read dconf_mock dev_tty diversion_sigtrap diversion_syscall dlopen early_error elapsed_time exclusion_region exec_failed exec_many execve_loop exit_codes exit_group exit_race exit_status x86/explicit_checkpoints fd_limit fork_stress fork_syscalls function_calls x86/fxregs getcwd gdb_bogus_breakpoint goto_event hello hooks # Disabled because issue #1806 makes tests fail on Debian 8.5 at least # history ignored_async_usr1 ignored_sigsegv ignore_nested immediate_restart x86/int3_ok interrupt intr_ptrace_decline invalid_interpreter invalid_jump jit_proc_mem link madvise_dontfork main_thread_exit mmap_fd_reuse_checkpoint mmap_replace_most_mappings mmap_shared_prot mmap_shared_write_exec_race mmap_tmpfs mmap_write mmap_write_private morestack_unwind mprotect_growsdown mprotect_syscallbuf_overflow mutex_pi_stress nested_detach_wait overflow_branch_counter patch_page_end x86/patch_40_80_f6_81 priority ptrace_remote_unmap remove_latest_trace # Not called ps, because that interferes with using real 'ps' in tests rr_ps rr_ps_ns read_big_struct restart_abnormal_exit reverse_continue_breakpoint reverse_continue_multiprocess reverse_continue_process_signal reverse_many_breakpoints reverse_step_long reverse_step_threads reverse_step_threads_break search seccomp_blocks_rr seccomp_signals segfault shared_map shared_persistent_file signal_numbers sigprocmask_race sigprocmask_rr_sigs_nondefault simple x86/singlestep_pushf stack_growth step_thread strict_priorities x86/string_instructions x86/string_instructions_async_signals x86/string_instructions_async_signals_shared x86/string_instructions_multiwatch x86/string_instructions_replay x86/string_instructions_singlestep_fastforward x86/string_instructions_watch syscallbuf_fd_disabling syscallbuf_signal_blocking_read sysconf_onln target_fork target_process tcp_sockets term_nonmain term_rr term_trace_reset term_trace_syscall thread_exit_signal thread_open_race thread_stress threaded_syscall_spam threads tls ttyname unexpected_stack_growth user_ignore_sig vdso_clock_gettime_stack vdso_gettimeofday_stack vdso_time_stack vfork vfork_read_clone_stress vsyscall_reverse_next wait_for_all watchpoint watchpoint_at_sched watchpoint_before_signal watchpoint_no_progress watchpoint_size_change watchpoint_syscall watchpoint_unaligned ) # A "test without program" is a foo.run driver script only, which does # something with one of the test executables above (or has special rules # to build its own executable). # # NB: you must update this variable when adding a new test source # file. The list is not generated automatically. # # Alphabetical, please. 
set(TESTS_WITHOUT_PROGRAM
  async_signal_syscalls_100 async_signal_syscalls_1000 bad_breakpoint break_block break_clock
  break_clone break_exec break_int3 break_mmap_private break_msg
  x86/break_rdtsc break_sigreturn break_sync_signal break_thread break_time_slice
  breakpoint_consistent call_exit check_patched_pthread checkpoint_async_signal_syscalls_1000 checkpoint_mmap_shared
  checkpoint_prctl_name checkpoint_simple checksum_sanity_noclone cont_signal x86/cpuid
  dead_thread_target desched_ticks deliver_async_signal_during_syscalls env_newline exec_deleted
  exec_stop execp explicit_checkpoint_clone file_name_newline final_sigkill
  first_instruction fork_exec_info_thr get_thread_list hardlink_mmapped_files hbreak
  mprotect_step nested_detach nested_detach_kill nested_release parent_no_break_child_bkpt
  parent_no_stop_child_crash post_exec_fpu_regs proc_maps read_bad_mem record_replay
  remove_watchpoint replay_overlarge_event_number replay_serve_files restart_invalid_checkpoint restart_unstable
  restart_diversion reverse_alarm reverse_continue_exec_subprocess reverse_continue_fork_subprocess reverse_continue_int3
  reverse_continue_start reverse_finish reverse_step_breakpoint reverse_step_signal reverse_step_threads2
  reverse_watchpoint reverse_watchpoint_syscall run_end run_in_function sanity
  seekticks shm_checkpoint siginfo x86/sigreturn_checksum signal_stop
  signal_checkpoint simple_script simple_script_debug simple_winch stack_overflow_debug
  step1 x86/step_rdtsc step_signal x86/string_instructions_break x86/string_instructions_replay_quirk
  subprocess_exit_ends_session switch_processes syscallbuf_timeslice_250 trace_version term_trace_cpu
  trace_events tty unmap_vdso unwind_on_signal vfork_exec
  vfork_break_parent vsyscall_singlestep watch_code watchpoint_cond when
)

if(BUILD_TESTS)
  # Part of the installable testsuite (test files).
  if(INSTALL_TESTSUITE)
    install(DIRECTORY ${CMAKE_SOURCE_DIR}/src/test/
            DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/rr/src/test
            USE_SOURCE_PERMISSIONS)
  endif(INSTALL_TESTSUITE)

  # We use symlinks in the tests to access the build and source directories.
  # This is needed because we cannot change the paths used by the tests when
  # the testsuite is installed. We work around this by using symlinks during
  # the normal build, and then installing symlinks with the testsuite that
  # have the same names but new link targets.
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_SOURCE_DIR} source_dir)
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_BINARY_DIR} bin_dir)

  if(INSTALL_TESTSUITE)
    # Create the directory for the symlinks first and then create symlinks.
install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink \${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/rr \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/source_dir) execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink \${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj \$ENV{DESTDIR}\${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin_dir)") endif(INSTALL_TESTSUITE) add_test(check_environment bash source_dir/src/test/check_environment_test.run) set_tests_properties(check_environment PROPERTIES FAIL_REGULAR_EXPRESSION "rr needs /proc/sys/kernel/perf_event_paranoid <= 1") foreach(test ${BASIC_TESTS} ${TESTS_WITH_PROGRAM}) if (NOT x86ish AND ${test} MATCHES "^x86/.*") continue() endif() get_filename_component(testname ${test} NAME) add_executable(${testname} src/test/${test}.c) target_include_directories(${testname} PRIVATE src/preload) post_build_executable(${testname}) set_source_files_properties(src/test/${test}.c PROPERTIES COMPILE_FLAGS ${RR_TEST_FLAGS}) add_dependencies(${testname} Generated) target_link_libraries(${testname} -lrt -ldl) # Part of the installable testsuite (test programs). if(INSTALL_TESTSUITE) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/${testname} DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) endif(INSTALL_TESTSUITE) endforeach(test) # Test disabled because it requires libuvc to be built and installed, and a # working USB camera # add_executable(usb src/test/usb.c) # post_build_executable(usb) # add_dependencies(usb Generated) # target_link_libraries(usb -lrt -L/usr/local/lib -luvc -lusb-1.0) foreach(test ${BASIC_CPP_TESTS}) add_executable(${test} src/test/${test}.cc) post_build_executable(${test}) set_source_files_properties(src/test/${test}.cc PROPERTIES COMPILE_FLAGS ${RR_TEST_FLAGS}) add_dependencies(${test} Generated) target_link_libraries(${test} -lrt) # Part of the installable testsuite (test programs). if(INSTALL_TESTSUITE) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/${test} DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) endif(INSTALL_TESTSUITE) endforeach(test) add_library(test_lib src/test/test_lib.c ) add_dependencies(test_lib Generated) set_source_files_properties(src/test/test_lib.c PROPERTIES COMPILE_FLAGS ${RR_TEST_FLAGS}) target_link_libraries(constructor -lrt test_lib) # cpuid test needs to link with cpuid_loop.S if (x86ish) add_executable(cpuid src/test/x86/cpuid.c src/test/x86/cpuid_loop.S) post_build_executable(cpuid) set_source_files_properties(src/test/x86/cpuid.c PROPERTIES COMPILE_FLAGS ${RR_TEST_FLAGS}) add_dependencies(cpuid Generated) target_link_libraries(cpuid -lrt) endif() # Check if we're running on KNL. If so, we allot more time to tests, due to # reduced single-core performance. 
exec_program(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) string(REGEX MATCH "^.*(Xeon Phi).*$" CPU_MODEL_PHI ${CPUINFO}) if(NOT "${CPU_MODEL_PHI}" STREQUAL "") set(TEST_MONITOR_DEFAULT_TIMEOUT 480) else() set(TEST_MONITOR_DEFAULT_TIMEOUT 120) endif() # The real timeouts are handled by test-monitor set(CTEST_TEST_TIMEOUT 1000) function(configure_test test) set_tests_properties(${test} PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED") endfunction(configure_test) if(INSTALL_TESTSUITE) install(TARGETS test_lib LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/test-monitor DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) if (x86ish) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/cpuid DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) endif(x86ish) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj) endif(INSTALL_TESTSUITE) foreach(test ${BASIC_TESTS} ${BASIC_CPP_TESTS} ${OTHER_TESTS}) if (NOT x86ish AND ${test} MATCHES "^x86/.*") continue() endif() get_filename_component(testname ${test} NAME) add_test(${test} bash source_dir/src/test/basic_test.run ${testname} "" bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}) add_test(${test}-no-syscallbuf bash source_dir/src/test/basic_test.run ${testname} -n bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-no-syscallbuf) endforeach(test) foreach(test ${TESTS_WITH_PROGRAM} ${TESTS_WITHOUT_PROGRAM}) if (NOT x86ish AND ${test} MATCHES "^x86/.*") continue() endif() get_filename_component(testname ${test} NAME) add_test(${test} bash source_dir/src/test/${test}.run ${testname} "" bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}) add_test(${test}-no-syscallbuf bash source_dir/src/test/${test}.run ${testname} -n bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-no-syscallbuf) endforeach(test) # Run 32-bit tests on 64-bit builds. # We copy the test files into '32' subdirectories in the output # directory, so we can set different compile options on them. # This sucks but I can't find a better way to get CMake to build # the same source file in two different ways. 
if(rr_32BIT AND rr_64BIT) foreach(header util.h nsutils.h ptrace_util.h util_internal.h) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/test/${header}" "${CMAKE_CURRENT_BINARY_DIR}/32/${header}" COPYONLY) endforeach(header) foreach(test ${BASIC_TESTS} ${TESTS_WITH_PROGRAM} x86/cpuid test_lib) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/test/${test}.c" "${CMAKE_CURRENT_BINARY_DIR}/32/${test}.c" COPYONLY) set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/${test}.c" PROPERTIES COMPILE_FLAGS "-m32 ${RR_TEST_FLAGS}") endforeach(test) foreach(test ${BASIC_CPP_TESTS}) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/test/${test}.cc" "${CMAKE_CURRENT_BINARY_DIR}/32/${test}.cc" COPYONLY) set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/${test}.cc" PROPERTIES COMPILE_FLAGS "-m32 ${RR_TEST_FLAGS}") endforeach(test) foreach(file x86/cpuid_loop.S x86/util.h) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/test/${file}" "${CMAKE_CURRENT_BINARY_DIR}/32/${file}" COPYONLY) set_source_files_properties("${CMAKE_CURRENT_BINARY_DIR}/32/${file}" PROPERTIES COMPILE_FLAGS "-m32 ${RR_TEST_FLAGS}") endforeach(file) foreach(test ${BASIC_TESTS} ${BASIC_CPP_TESTS} ${TESTS_WITH_PROGRAM}) get_filename_component(testname ${test} NAME) if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/32/${test}.c") add_executable(${testname}_32 "${CMAKE_CURRENT_BINARY_DIR}/32/${test}.c") else() add_executable(${testname}_32 "${CMAKE_CURRENT_BINARY_DIR}/32/${test}.cc") endif() target_include_directories(${testname}_32 PRIVATE src/preload) post_build_executable(${testname}_32) add_dependencies(${testname}_32 Generated) set_target_properties(${testname}_32 PROPERTIES LINK_FLAGS "-m32 ${RR_TEST_FLAGS} ${LINKER_FLAGS}") target_link_libraries(${testname}_32 -lrt -ldl) # Part of the installable testsuite (test programs). 
if (INSTALL_TESTSUITE) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/${testname}_32 DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) endif (INSTALL_TESTSUITE) endforeach(test) add_library(test_lib_32 "${CMAKE_CURRENT_BINARY_DIR}/32/test_lib.c" ) add_dependencies(test_lib_32 Generated) set_target_properties(test_lib_32 PROPERTIES LINK_FLAGS "-m32 ${LINKER_FLAGS}") target_link_libraries(constructor_32 -lrt test_lib_32) # cpuid test needs to link with cpuid_loop.S add_executable(cpuid_32 32/x86/cpuid.c 32/x86/cpuid_loop.S) post_build_executable(cpuid_32) add_dependencies(cpuid_32 Generated) set_target_properties(cpuid_32 PROPERTIES LINK_FLAGS "-m32 ${LINKER_FLAGS}") target_link_libraries(cpuid_32 -lrt) if(INSTALL_TESTSUITE) install(TARGETS test_lib_32 LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/bin/cpuid_32 DESTINATION ${CMAKE_INSTALL_LIBDIR}/rr/testsuite/obj/bin) endif(INSTALL_TESTSUITE) foreach(test ${BASIC_TESTS} ${BASIC_CPP_TESTS} ${OTHER_TESTS}) get_filename_component(testname ${test} NAME) add_test(${test}-32 bash source_dir/src/test/basic_test.run ${testname}_32 "" bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-32) add_test(${test}-32-no-syscallbuf bash source_dir/src/test/basic_test.run ${testname}_32 -n bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-32-no-syscallbuf) endforeach(test) foreach(test ${TESTS_WITH_PROGRAM} ${TESTS_WITHOUT_PROGRAM}) get_filename_component(testname ${test} NAME) add_test(${test}-32 bash source_dir/src/test/${test}.run ${testname}_32 "" bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-32) add_test(${test}-32-no-syscallbuf bash source_dir/src/test/${test}.run ${testname}_32 -n bin_dir ${TEST_MONITOR_DEFAULT_TIMEOUT}) configure_test(${test}-32-no-syscallbuf) endforeach(test) endif() set(CHAOS_TESTS core_count futex_wakeup getaffinity_core_count pipe_wakeup mmap_adjacent mmap_bits starvation_multithreaded starvation_singlethreaded ) foreach(test ${CHAOS_TESTS}) add_executable(${test} src/chaos-test/${test}.c) post_build_executable(${test}) target_link_libraries(${test} -lrt) endforeach(test) add_executable(test-monitor src/test-monitor/test-monitor.cc) endif() add_executable(ftrace_helper src/ftrace/ftrace_helper.c) include(ProcessorCount) ProcessorCount(N) if(NOT N EQUAL 0) set(JFLAG -j${N}) endif() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --verbose ${JFLAG}) # Run only syscallbuf-enabled and native-bitness tests add_custom_target(fastcheck COMMAND ${CMAKE_CTEST_COMMAND} --verbose --exclude-regex '[-]' ${JFLAG}) ##-------------------------------------------------- ## Package configuration include (InstallRequiredSystemLibraries) set(CPACK_PACKAGE_NAME "rr") set(CPACK_PACKAGE_VERSION_MAJOR "${rr_VERSION_MAJOR}") set(CPACK_PACKAGE_VERSION_MINOR "${rr_VERSION_MINOR}") set(CPACK_PACKAGE_VERSION_PATCH "${rr_VERSION_PATCH}") set(CPACK_SYSTEM_NAME "${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_OUTPUT_FILE_PREFIX dist) set(CPACK_GENERATOR "TGZ;RPM;DEB") set(CPACK_SOURCE_GENERATOR "TGZ") set(CPACK_BINARY_DIR "${PROJECT_BINARY_DIR}") # Don't strip binaries. It's important/useful for librrpreload at least to # have debug symbols. For package releases, pass -Dstrip=TRUE to strip symbols # from the rr binary at build time. 
set(CPACK_STRIP_FILES FALSE)
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY
    "Lightweight tool for recording and replaying execution of applications (trees of processes and threads)")
set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md")
set(CPACK_PACKAGE_VENDOR "rr-debugger")

set(CPACK_DEBIAN_PACKAGE_MAINTAINER "rr-debugger")
set(CPACK_DEBIAN_PACKAGE_SECTION "devel")
if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "i.86")
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "i386")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm.*")
  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "arm")
endif()

# XXX Cmake 2.8.7 doesn't know how to avoid specifying /usr,
# /usr/bin, etc, as files to be installed, but distros are finicky
# about their specification. We want to manually filter those paths
# out of our install list but 2.8.7 also isn't capable of that.
set(CPACK_RPM_USER_BINARY_SPECFILE "${CMAKE_SOURCE_DIR}/rr.spec")
set(CPACK_RPM_PACKAGE_RELEASE 1)
set(CPACK_RPM_PACKAGE_GROUP "Development/Debuggers")
set(CPACK_RPM_PACKAGE_LICENSE "MIT and BSD")
# Prevent binaries from being stripped
set(CPACK_RPM_SPEC_INSTALL_POST "/bin/true")

include (CPack)

##--------------------------------------------------
## Misc

add_custom_target(setup-travis COMMAND src/script/setup_travis.sh)

rr-5.5.0/CODE_OF_CONDUCT.md

# Community Participation Guidelines

This repository is governed by Mozilla's code of conduct and etiquette
guidelines. For more details, please read the
[Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/).

## How to Report

For more information on how to report violations of the Community
Participation Guidelines, please read our
'[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page.

rr-5.5.0/CONTRIBUTING.md

## Submission Checklist

Please make sure you go through this list before submitting a patch. The rules
aren't hard and fast, but mostly adhering to them will make for quicker merging.

- [ ] Does your PR add support for a new kernel API? For example, supporting a
  new syscall. If so, your patch should include at least one new test for the
  API. This is usually pretty easy. See `$rr/src/test` for examples.
- [ ] Did you run the rr test suite (including your new tests, if any), and
  pass all the tests? `make -C $objdir check`. Unfortunately, rr doesn't have
  automated infrastructure that can run the tests yet, so developers have to
  run them locally.
- [ ] If you created new files for your PR, did you `git add` them? Habitually
  (or with a script or push hook) checking `git status` is a good habit to
  acquire.
- [ ] If you changed the trace layout or format, did you bump `TRACE_VERSION`?
- [ ] If you added new command-line parameters, did you update `print_usage()`
  to document them?
- [ ] Does your PR apply cleanly on top of upstream/master HEAD? It's
  dangerous to have someone else sort out your merge conflicts, so just don't
  do it. Best of all is to have a PR *rebased* on top of upstream/master HEAD,
  so that the merge is simply a fast-forward.
- [ ] If your PR includes multiple changesets, do they all (i) build cleanly
  in sequence; (ii) pass all tests in sequence? This is important for
  bisecting over commit history.
- [ ] If your PR is a very large-scale change (for example, a rewrite in Rust
  to use the visitor pattern), did you discuss the proposed changes in an
  issue or the mailing list? It's hard to review large patches that just fall
  in one's lap. It's much easier to discuss the important changes at a high
  level and then approach the patch knowing what's important and what's not.
- [ ] If your PR is large or includes many changesets, would it have been
  possible to break the changes into a series of smaller PRs? For example,
  it's hard to review a big patch that, say, fixes whitespace errors in a file
  along with a one-line, important, bug fix. It's much easier to review one PR
  that fixes whitespace (which can just be skimmed), and then review another
  PR that makes the one-line bug fix (which would be scrutinized more). This
  approach is also better for the patch author in that it usually allows the
  work to land faster, and reduces the burden of continually un-bit-rotting
  large, trivial, changes.
- [ ] Did you check your code is formatted correctly? It's easiest to run
  `scripts/reformat.sh` on each commit.

## Coding Guidelines

rr uses assertions heavily, for code documentation, for automated checking
that the code matches the documentation, and to improve the power of automated
tests. Assertions are turned on in release builds. Whenever you depend on an
invariant that is not immediately obvious, consider adding assertions to check
it.

rr ships with debugging enabled and compiler optimizations disabled for the rr
process itself. That's because rr performance almost always depends on
algorithmic issues --- minimizing the number of system calls, and especially,
minimizing the number of context switches between the tracees and the rr
process --- much more than the performance of the code running in the rr
process. For the same reason, rr-process code should be as simple as possible
even if that's less efficient. To some extent, once we're running code in the
rr process, we've already lost performance-wise. OTOH we do enable
optimizations in `preload.c` because that runs in tracees.

## Coding Style

Put braces around all statement blocks, even one-line `if` bodies etc.

All C++ declarations are in the `rr` namespace.

All C++ types are in CamelCase; all C types are underscore_names.
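Purely as an illustration of these conventions (the names below are made up
for this sketch, and it uses the standard `assert` rather than rr's own
assertion macros):

```cpp
#include <cassert>
#include <cstddef>

namespace rr {

// CamelCase for C++ types; braces on every block, even one-liners.
struct TraceCursor {
  size_t pos;
};

inline void advance(TraceCursor& cursor, size_t n) {
  // Check a non-obvious invariant rather than silently relying on it:
  // the position must not wrap around on overflow.
  assert(cursor.pos + n >= cursor.pos);
  if (n > 0) {
    cursor.pos += n;
  }
}

} // namespace rr
```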
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* * Copyright 2002 Niels Provos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ rr-5.5.0/README.md000066400000000000000000000031121412202446200134150ustar00rootroot00000000000000# Overview rr is a lightweight tool for recording, replaying and debugging execution of applications (trees of processes and threads). Debugging extends gdb with very efficient reverse-execution, which in combination with standard gdb/x86 features like hardware data watchpoints, makes debugging much more fun. More information about the project, including instructions on how to install, run, and build rr, is at [https://rr-project.org](https://rr-project.org). The best technical overview is currently the paper [Engineering Record And Replay For Deployability: Extended Technical Report](https://arxiv.org/pdf/1705.05937.pdf). Or go directly to the [installation and building instructions](https://github.com/rr-debugger/rr/wiki/Building-And-Installing). Please contribute! Make sure to review the [pull request checklist](/CONTRIBUTING.md) before submitting a pull request. If you find rr useful, please [add a testimonial](https://github.com/rr-debugger/rr/wiki/Testimonials). rr development is sponsored by [Pernosco](https://pernos.co) and was originated by [Mozilla](https://www.mozilla.org). # System requirements * Linux kernel ≥ 3.11 is required (for `PTRACE_SETSIGMASK`). * rr currently requires either: * An Intel CPU with [Nehalem](https://en.wikipedia.org/wiki/Nehalem_%28microarchitecture%29) (2010) or later microarchitecture. * Certain AMD Zen or later processors (see https://github.com/rr-debugger/rr/wiki/Zen) * Running in a VM guest is supported, as long as the VM supports virtualization of hardware performance counters. (VMware and KVM are known to work; Xen does not.) 
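# Quick example

A minimal record-and-debug session looks like this (an illustrative sketch — `./a.out` stands in for your own program; see the installation and building instructions linked above for full details):

```bash
rr record ./a.out        # record one (failing) run of your program
rr replay                # debug the most recent recording under gdb
```

During replay, gdb commands such as `reverse-continue` and `reverse-stepi` execute backwards, so you can set a hardware watchpoint on corrupted data and reverse-execute straight to the write that corrupted it.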
rr-5.5.0/Vagrantfile000066400000000000000000000023231412202446200143260ustar00rootroot00000000000000# -*- mode: ruby -*- # vi: set ft=ruby : # Author: David Manouchehri Vagrant.configure("2") do |config| config.vm.box = "bento/ubuntu-16.04" config.vm.synced_folder ".", "/vagrant", disabled: true config.vm.provision "shell", inline: <<-SHELL apt-get update # DEBIAN_FRONTEND=noninteractive apt-get -y upgrade DEBIAN_FRONTEND=noninteractive apt-get -y install ccache cmake make g++-multilib gdb pkg-config realpath python-pexpect manpages-dev git ninja-build capnproto libcapnp-dev apt-get clean SHELL config.vm.provision "shell", privileged: false, inline: <<-SHELL git clone https://github.com/rr-debugger/rr.git cd rr mkdir obj cd obj cmake .. -DPYTHON_EXECUTABLE=/usr/bin/python make -j8 make test SHELL config.vm.provision "shell", inline: <<-SHELL cd /home/vagrant/rr/obj/ make install SHELL %w(vmware_fusion vmware_workstation vmware_appcatalyst).each do |provider| config.vm.provider provider do |v| v.vmx["memsize"] = "4096" v.vmx['vpmc.enable'] = 'true' v.vmx['vhv.enable'] = 'true' v.vmx['vvtd.enable'] = 'true' v.vmx['monitor_control.disable_hvsim_clusters'] = 'true' v.vmx['virtualHW.version'] = '14' v.vmx['ethernet0.virtualDev'] = 'vmxnet3' end end end rr-5.5.0/configure000077500000000000000000000001301412202446200140420ustar00rootroot00000000000000#!/usr/bin/env bash # Helper to make |./configure && make| do what you expect. cmake . rr-5.5.0/include/000077500000000000000000000000001412202446200135645ustar00rootroot00000000000000rr-5.5.0/include/rr/000077500000000000000000000000001412202446200142075ustar00rootroot00000000000000rr-5.5.0/include/rr/rr.h000066400000000000000000000031601412202446200150030ustar00rootroot00000000000000/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_H_ #define RR_H_ /** * rr tracees can write data to this special fd that they want * verified across record/replay. When it's written in recording, rr * saves the data. During replay, the data are checked against the * recorded data. * * Tracees using this interface should take care that the buffers * storing the data are either not racy, or are synchronized by the * tracee. * * To simplify things, we make this a valid fd opened to /dev/null during * recording. * * Tracees may close this fd, or dup() something over it, etc. If that happens, * it will lose its magical properties. */ #define RR_MAGIC_SAVE_DATA_FD 999 /** * rr uses this fd to ensure the tracee has access to the original root * directory after a chroot(). Tracee close()es of this fd will be silently * ignored, and tracee dup()s to this fd will fail with EBADF. * This is set up during both recording and replay. */ #define RR_RESERVED_ROOT_DIR_FD 1000 /** * Tracees use this fd to send other fds to rr. * This is only set up during recording. * Only the outermost rr uses this. Inner rr replays will use a different fd. */ #define RR_RESERVED_SOCKET_FD 1001 /** * The preferred fd that rr uses to control tracee desched. Some software * (e.g. the chromium IPC code) wants to have the first few fds all to itself, * so we need to stay above some floor. Tracee close()es of the fd that is * actually assigned will be silently ignored, and tracee dup()s to that fd will * fail with EBADF. 
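 * (Illustrative note on the magic fds declared above: they are plain fd
 * numbers, so a tracee that wants rr to verify a buffer across record/replay
 * can simply do write(RR_MAGIC_SAVE_DATA_FD, buf, len), assuming it has not
 * closed or dup()ed over that fd.)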
*/ #define RR_DESCHED_EVENT_FLOOR_FD 100 #endif /* RR_H_ */ rr-5.5.0/rr.spec000066400000000000000000000033161412202446200134430ustar00rootroot00000000000000Buildroot: @CPACK_BINARY_DIR@/_CPack_Packages/@CPACK_SYSTEM_NAME@/RPM/@CPACK_PACKAGE_FILE_NAME@ Summary: Lightweight tool for recording and replaying execution of applications (trees of processes and threads) Name: @CPACK_PACKAGE_NAME@ Version: @CPACK_PACKAGE_VERSION@ Release: @CPACK_RPM_PACKAGE_RELEASE@ License: @CPACK_RPM_PACKAGE_LICENSE@ Group: Development/Debuggers Vendor: @CPACK_PACKAGE_VENDOR@ Prefix: @CPACK_PACKAGING_INSTALL_PREFIX@ @CPACK_RPM_PACKAGE_REQUIRES@ %define _rpmfilename @CPACK_PACKAGE_FILE_NAME@.rpm %define _unpackaged_files_terminate_build 0 %description rr is a lightweight tool for recording and replaying execution of applications (trees of processes and threads). For more information, please visit http://rr-project.org # This is a shortcutted spec file generated by CMake RPM generator # we skip _install step because CPack does that for us. # We do only save CPack installed tree in _prepr # and then restore it in build. %install mkdir -p %{buildroot}@CPACK_PACKAGING_INSTALL_PREFIX@/lib64 cp -a %{buildroot}@CPACK_PACKAGING_INSTALL_PREFIX@/lib/* %{buildroot}@CPACK_PACKAGING_INSTALL_PREFIX@/lib64 ln -s @CPACK_BINARY_DIR@/_CPack_Packages/@CPACK_SYSTEM_NAME@/RPM/RPMS/@CPACK_PACKAGE_FILE_NAME@.rpm @CPACK_BINARY_DIR@/_CPack_Packages/@CPACK_SYSTEM_NAME@/RPM/@CPACK_PACKAGE_FILE_NAME@.rpm %files %defattr(-,root,root,-) @CPACK_PACKAGING_INSTALL_PREFIX@/lib64/* @CPACK_PACKAGING_INSTALL_PREFIX@/bin/rr @CPACK_PACKAGING_INSTALL_PREFIX@/bin/rr_exec_stub* @CPACK_PACKAGING_INSTALL_PREFIX@/bin/signal-rr-recording.sh @CPACK_PACKAGING_INSTALL_PREFIX@/share/rr/*.xml %changelog * Tue Jun 25 2013 Chris Jones - - Initial build. rr-5.5.0/scripts/000077500000000000000000000000001412202446200136305ustar00rootroot00000000000000rr-5.5.0/scripts/checkpoint-visualizer.html000066400000000000000000000053271412202446200210470ustar00rootroot00000000000000
<tr><th>Checkpoint</th><th>Time</th><th>Time to next</th></tr>
rr-5.5.0/scripts/reformat.sh000077500000000000000000000001211412202446200160010ustar00rootroot00000000000000#!/bin/sh
find src -regex '.*\.\(c\|h\|cc\)$'|xargs clang-format -style=file -i
rr-5.5.0/scripts/rr-collect-symbols.py000077500000000000000000000164701412202446200177470ustar00rootroot00000000000000#!/usr/bin/env python3

import errno
import glob
import os
import re
import shutil
import subprocess
import sys
import tempfile
from urllib.request import urlretrieve
from urllib.error import HTTPError, ContentTooShortError

# Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>]
#
# Given a <url>, downloads the zip/.tar.zst file at <url>, uncompresses it,
# runs "gunzip" on any .gz files, and for any ELF files found whose build-ids
# match the build-id of an ELF file in the trace, moves them into the trace.
#
# Given a <path>, which must contain a .build-id directory with the usual
# structure (e.g. as Ubuntu and Fedora create under /usr/lib/debug), searches
# the directory tree for any ELF files whose build-ids match the build-id of
# an ELF file in the trace and copies them into the trace. <path> defaults to
# "/usr/lib/debug", which will grab any available system debuginfo files
# in Ubuntu and Fedora at least.
#
# This script assumes that the trace-dir has been packed via `rr pack` so all
# relevant files actually appear in the trace-dir.
# It also assumes rr is on the PATH.
#
# The debuginfo files are placed in the trace under a "debug" subdirectory,
# in a ".build-id" subdirectory with the usual structure.
#
# If a debuginfo file contains a .gnu_debugaltlink section then we also
# attempt to find the referenced file and copy it into the trace with the
# same file name as the .debug file, but with a .sup suffix.

if len(sys.argv) < 2:
    print("Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>]", file=sys.stderr)
    sys.exit(1)
trace_dir = sys.argv[1]

if len(sys.argv) < 3:
    source = "/usr/lib/debug"
else:
    source = sys.argv[2]

rr_buildid = subprocess.Popen(["rr", "buildid"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)

def build_id_for(file):
    global rr_buildid
    rr_buildid.stdin.write(("%s\n"%file).encode('utf-8'))
    try:
        rr_buildid.stdin.flush()
    except BrokenPipeError:
        print("Can't write to rr, termination code %s"%rr_buildid.returncode, file=sys.stderr)
        sys.exit(2)
    return rr_buildid.stdout.readline().rstrip().decode('utf-8')

altref_regex = re.compile(rb"^\s+\[\s*0\]\s+(.*)")

def find_altref(file):
    proc = subprocess.Popen(["readelf", "-p", ".gnu_debugaltlink", file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        for line in proc.stdout:
            m = altref_regex.match(line)
            if m:
                return m.group(1).rstrip()
    finally:
        proc.wait()
    return None

def find_altref_for_trace_file(trace_file, altref):
    proc = subprocess.Popen(["rr", "filename", trace_file], stdout=subprocess.PIPE)
    try:
        for line in proc.stdout:
            file = line.rstrip()
            altref_file = os.path.join(os.path.dirname(file), altref)
            if os.path.isfile(altref_file):
                return altref_file
    finally:
        proc.wait()
    return None

def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

# 'dst' must be a complete file name, not a directory.
def copy_file(src, dst):
    try:
        # Remove the destination file in case it's a hard link
        # or owned by someone else.
        os.remove(dst)
    except:
        pass
    shutil.copy(src, dst)

# 'dst' must be a complete file name, not a directory
def create_link(src, dst):
    try:
        # Remove the destination file in case it's wrong.
        os.remove(dst)
    except:
        pass
    os.symlink(src, dst)

def collect_trace_build_ids():
    ret = {}
    for file in glob.iglob("%s/mmap_*"%trace_dir):
        build_id = build_id_for(file)
        if build_id:
            ret[build_id] = True
            altref = find_altref(file)
            if altref:
                altref_file = find_altref_for_trace_file(file, altref)
                if not altref_file:
                    print("WARNING: Can't find alt file %s for %s"%(altref, file))
                    continue
                dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
                mkdir_p(dir)
                copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:]))
    return ret

trace_build_ids = collect_trace_build_ids()

def collect_archive(url):
    is_tar_zst = url.endswith(".tar.zst")
    tmp_dir = tempfile.mkdtemp(dir=trace_dir)
    if is_tar_zst:
        tmp_file_name = "%s/archive.tar.zst"%tmp_dir
    else:
        # Assume it's a ZIP
        tmp_file_name = "%s/archive.zip"%tmp_dir
    try:
        (file, headers) = urlretrieve(url, tmp_file_name)
    except (HTTPError, ContentTooShortError) as exc:
        print("Failed to load archive %s: %s"%(url, exc), file=sys.stderr)
        sys.exit(2)
    if is_tar_zst:
        subprocess.check_call(["tar", "-C", tmp_dir, "-I", "zstd", "-xvf", file])
    else:
        subprocess.check_call(["unzip", "-d", tmp_dir, file])
    os.remove(file)
    for root, dirs, files in os.walk(tmp_dir):
        for name in files:
            file = os.path.join(root, name)
            if file.endswith(".gz"):
                subprocess.check_call(["gunzip", file])
                file = file[:-3]
            build_id = build_id_for(file)
            if build_id and build_id in trace_build_ids:
                dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
                mkdir_p(dir)
                dst = "%s/%s.debug"%(dir, build_id[2:])
                os.rename(file, dst)
            else:
                os.remove(file)
    shutil.rmtree(tmp_dir)

def collect_filesystem(path):
    for root, dirs, files in os.walk(path):
        for name in files:
            file = os.path.join(root, name)
            if not os.path.islink(file):
                build_id = build_id_for(file)
                if build_id and build_id in trace_build_ids:
                    dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
                    mkdir_p(dir)
                    copy_file(file, "%s/%s.debug"%(dir, build_id[2:]))
                    altref = find_altref(file)
                    if altref:
                        altref = altref.decode('utf-8')
                        altref_file = os.path.join(os.path.dirname(file), altref)
                        copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:]))
                        if altref.startswith("../../../.dwz/"):
                            mkdir_p("%s/.dwz"%trace_dir)
                            src = "../debug/.build-id/%s/%s.sup"%(build_id[:2], build_id[2:])
                            create_link(src, "%s/.dwz/%s"%(trace_dir, altref[14:]))
                        elif altref.startswith("../../.dwz/"):
                            mkdir_p("%s/debug/.dwz"%trace_dir)
                            src = "../.build-id/%s/%s.sup"%(build_id[:2], build_id[2:])
                            create_link(src, "%s/debug/.dwz/%s"%(trace_dir, altref[11:]))
                        elif altref.startswith("../.dwz/"):
                            mkdir_p("%s/debug/.build-id/.dwz"%trace_dir)
                            src = "../%s/%s.sup"%(build_id[:2], build_id[2:])
                            create_link(src, "%s/debug/.build-id/.dwz/%s"%(trace_dir, altref[8:]))

if re.search("^[^:/]+:", source):
    collect_archive(source)
else:
    collect_filesystem(source)

rr_buildid.terminate()
rr-5.5.0/scripts/rr_completion000066400000000000000000000015461412202446200164330ustar00rootroot00000000000000# vi:syntax=sh
#
# completion script for rr commands (to be sourced)

_rr_subcmd_completion() {
    local cmd=$1
    local short_opts=$(rr help $cmd | sed -n 's/\s*-\([a-zA-Z]\),.*/-\1/p')
    local long_opts=$(rr help $cmd | sed -n 's/.*--\([^= ]*\).*/--\1/p')
    echo "$short_opts" "$long_opts"
}

_rr_completion() {
    COMPREPLY=()
    local rr_commands="$(rr --list-commands | cut -s -d ' ' -f 3)"

    # completion for rr
    if [ $COMP_CWORD -eq 1 ]; then
        COMPREPLY=( $( compgen -W "$rr_commands" -- "${COMP_WORDS[1]}" ) )
        return
    fi

    # completion for rr <command>'s options
    local cmd="$(echo "${COMP_WORDS[1]}" | tr -d '[:space:]')"
    if [ "$(echo $rr_commands | grep -w "$cmd")"
] ; then
        COMPREPLY=( $( compgen -W "$(_rr_subcmd_completion "$cmd")" -- "${COMP_WORDS[COMP_CWORD]}" ) )
    fi
}

complete -F _rr_completion rr
rr-5.5.0/scripts/setup_travis.sh000077500000000000000000000005401412202446200167160ustar00rootroot00000000000000#!/bin/bash
# Install the prerequisites needed to build and run tests on travis-ci.

echo Configuring travis-ci build slave ...
echo The slave is `uname -a`

packages=(rpm ccache cmake make g++-multilib pkg-config realpath zlib1g-dev)

sudo apt-get update && \
    sudo apt-get install -y "${packages[@]}" && \
    echo ... finished configuring slave
rr-5.5.0/scripts/signal-rr-recording.sh000077500000000000000000000010561412202446200200410ustar00rootroot00000000000000#!/usr/bin/bash

signal=$1

if [[ "$signal" == "" ]]; then
    echo "Usage: $0 <signal>" >&2
    echo "Sends <signal> to all processes being recorded by rr" >&2
    exit 1
fi

function signal_descendants {
    pid=$1
    for child in `ps -o pid= --ppid $pid`; do
        echo Sending $signal to $child
        kill -s $signal $child
        signal_descendants $child
    done
}

for rr_pid in `pidof rr` ; do
    if cat /proc/$rr_pid/cmdline | tr '\0' '\n' | head -n2 | tail -n1 | grep -qz '\(^record$\)\|/' ; then
        signal_descendants $rr_pid
    fi
done
rr-5.5.0/scripts/tag-release.sh000077500000000000000000000017721412202446200163660ustar00rootroot00000000000000#!/bin/bash

function fatal {
    why=$1;
    echo "[FATAL]" $why >&2
    exit 1
}

major=$1
minor=$2
patch=$3
ver="$major.$minor.$patch"

echo "Preparing for release '$ver' ..."

if [[ $major == "" || $minor == "" || $patch == "" ]]; then
    fatal "Usage: ./tag-release.sh MAJOR MINOR PATCH"
fi

verfile=CMakeLists.txt

echo "Patching $verfile with new version string ..."
sed -i "s/rr_VERSION_MAJOR [0-9][0-9]*/rr_VERSION_MAJOR $major/g" $verfile
sed -i "s/rr_VERSION_MINOR [0-9][0-9]*/rr_VERSION_MINOR $minor/g" $verfile
sed -i "s/rr_VERSION_PATCH [0-9][0-9]*/rr_VERSION_PATCH $patch/g" $verfile

echo "Showing diff for $verfile ..."
git diff -p -U8

echo -n "Is this what you expected to see? [Y/n] "
read ok
if [[ $ok != "Y" ]]; then
    fatal "Oops. Aborting version update by user request."
fi

echo "Generating git commit ..."
git commit $verfile -m "Bump version to $ver."

echo "Generating git tag $ver ..."
git tag $ver

echo "Done! Publish the new version using 'git push --all' or 'git push; git push --tags'."
rr-5.5.0/scripts/update-gh-pages.sh000077500000000000000000000013361412202446200171450ustar00rootroot00000000000000#!/bin/bash

function fatal {
    why=$1;
    echo "[FATAL]" $why >&2
    exit 1
}

rev=HEAD
if [[ $1 != "" ]]; then
    rev=$1
fi

ver=`git name-rev --name-only --tags $rev`
if [[ $ver == undefined ]]; then
    fatal "No tag found"
fi

echo "Updating repo ..."
git checkout gh-pages || fatal "Failed to checkout gh-pages branch."

verfile=index.html

echo "Patching $verfile with new version $ver ..."
sed -i "s/[^<]*$ver= 0 and ssb_mode & PR_SPEC_PRCTL:
    mitigated = (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0) == 0)
    if not mitigated:
        print('Failed to enable SSB mitigation')
    else:
        ssb_status = 'mitigated'
else:
    ssb_status = 'immutable'

msrs = [read_msr(cpu) & BIT for cpu in cpus]
if all(msr for msr in msrs):
    if ssb_status in ('mitigated', 'immutable') or args.check:
        print('Zen workaround in place')
    else:
        print('Zen workaround maybe in place.')
elif args.reset or args.check:
    if all(not msr for msr in msrs):
        print('Zen workaround disabled')
    elif args.reset:
        print('Zen workaround somehow not entirely disabled?')
    else:
        print('Zen workaround not entirely enabled?')
else:
    print('Zen workaround does not stick.
Please see https://github.com/rr-debugger/rr/wiki/Zen') rr-5.5.0/scripts/zen_workaround.service000066400000000000000000000021761412202446200202670ustar00rootroot00000000000000# systemd service for AMD Zen `rr` workaround # See https://github.com/rr-debugger/rr/wiki/Zen for more details # To install: # - Save this file as `/etc/systemd/system/zen_workaround.service` # - Download the `zen_workaround.py` script to a secure location, example: # - sudo mkdir -p /usr/share/zen_workaround # - cd /usr/share/zen_workaround # - curl -L https://github.com/rr-debugger/rr/raw/master/scripts/zen_workaround.py | sudo tee -a zen_workaround.py >/dev/null # - chmod +x ./zen_workaround.py # - run `sudo systemctl enable zen_workaround` to enable on startup # - run `sudo systemctl start zen_workaround` to manually start it immediately # - run `systemctl status zen_workaround` to ensure that it completed successfully on your hardware [Unit] Description = Startup script for rr zen workaround [Service] # Step to actually run `zen_workaround.py`. ExecStart =+/usr/share/zen_workaround/zen_workaround.py # Only run this once, report it as "(active)" even after we've exited. Type = oneshot RemainAfterExit = yes [Install] WantedBy = default.target rr-5.5.0/snap/000077500000000000000000000000001412202446200131025ustar00rootroot00000000000000rr-5.5.0/snap/snapcraft.yaml000066400000000000000000000035721412202446200157560ustar00rootroot00000000000000name: rr base: core20 # the base snap is the execution environment for this snap version: git summary: low-overhead record-replay debugging tool description: | rr aspires to be your primary C/C++ debugging tool for Linux, replacing — well, enhancing — gdb. You record a failure once, then debug the recording, deterministically, as many times as you want. The same execution is replayed every time. rr also provides efficient reverse execution under gdb. Set breakpoints and data watchpoints and quickly reverse-execute to where they were hit. * Low overhead compared to other similar tools, especially on mostly-single-threaded workloads * Supports recording and replay of all kinds of applications: Firefox, Chrome, QEMU, LibreOffice, Go programs, ... * Record, replay and debug multiple-process workloads, including entire containers * Works with gdb scripting and [IDE integration](https://github.com/rr-debugger/rr/wiki/Using-rr-in-an-IDE) * [Durable](http://robert.ocallahan.org/2017/06/new-rr-pack-command.html), [compact](http://robert.ocallahan.org/2017/07/selecting-compression-algorithm-for-rr.html) traces that can be [ported](http://robert.ocallahan.org/2017/09/rr-trace-portability.html) between machines * [Chaos mode](http://robert.ocallahan.org/2016/02/introducing-rr-chaos-mode.html) to make intermittent bugs more reproducible grade: stable # must be 'stable' to release into candidate/stable channels confinement: classic apps: rr: command: usr/bin/rr parts: rr: plugin: cmake cmake-parameters: - -DCMAKE_INSTALL_PREFIX=/usr source: . 
    source-type: git
    build-packages:
      - g++
      - g++-multilib
      - gdb
      - pkg-config
      - coreutils
      - python3-pexpect
      - manpages-dev
      - ninja-build
      - capnproto
      - libcapnp-dev
    stage-packages:
      - libcapnp-0.7.0
rr-5.5.0/src/000077500000000000000000000000001412202446200127305ustar00rootroot00000000000000rr-5.5.0/src/AddressSpace.cc000066400000000000000000002370561412202446200156110ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "AddressSpace.h"

#include <limits.h>
#include <linux/kdev_t.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "rr/rr.h"

#include "preload/preload_interface.h"

#include "AutoRemoteSyscalls.h"
#include "MonitoredSharedMemory.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "Session.h"
#include "Task.h"
#include "core.h"
#include "log.h"

using namespace std;

namespace rr {

static const uint8_t x86_breakpoint_insn[] = { 0xcc }; // int $3
static const uint8_t arm64_breakpoint_insn[4] = {0x0, 0x0, 0x20, 0xd4}; // brk #0

static const uint8_t *breakpoint_insn(SupportedArch arch) {
  switch (arch) {
    case x86:
    case x86_64:
      return x86_breakpoint_insn;
    case aarch64:
      return arm64_breakpoint_insn;
    default:
      DEBUG_ASSERT(0 && "Must define breakpoint insn for this architecture");
      return nullptr;
  }
}

/**
 * Advance *str to skip leading blank characters.
 */
static const char* trim_leading_blanks(const char* str) {
  const char* trimmed = str;
  while (isblank(*trimmed)) {
    ++trimmed;
  }
  return trimmed;
}

/**
 * Returns true if a task in t's thread-group other than t is doing an exec.
 */
static bool thread_group_in_exec(Task* t) {
  if (!t->session().is_recording()) {
    return false;
  }
  for (Task* tt : t->thread_group()->task_set()) {
    if (tt == t || tt->already_exited()) {
      continue;
    }
    RecordTask* rt = static_cast<RecordTask*>(tt);
    Event& ev = rt->ev();
    if (ev.is_syscall_event() &&
        is_execve_syscall(ev.Syscall().number, ev.Syscall().arch())) {
      return true;
    }
  }
  return false;
}

KernelMapIterator::KernelMapIterator(Task* t) : tid(t->tid) {
  // See https://lkml.org/lkml/2016/9/21/423
  ASSERT(t, !thread_group_in_exec(t)) << "Task-group in execve, so reading "
                                         "/proc/.../maps may trigger kernel "
                                         "deadlock!";
  init();
}

KernelMapIterator::~KernelMapIterator() {
  if (maps_file) {
    fclose(maps_file);
  }
}

void KernelMapIterator::init() {
  char maps_path[PATH_MAX];
  sprintf(maps_path, "/proc/%d/maps", tid);
  if (!(maps_file = fopen(maps_path, "r"))) {
    FATAL() << "Failed to open " << maps_path;
  }
  ++*this;
}

void KernelMapIterator::operator++() {
  char line[PATH_MAX * 2];
  if (!fgets(line, sizeof(line), maps_file)) {
    fclose(maps_file);
    maps_file = nullptr;
    return;
  }

  uint64_t start, end, offset, inode;
  int dev_major, dev_minor;
  char flags[32];
  int chars_scanned;
  int nparsed = sscanf(line, "%" SCNx64 "-%" SCNx64 " %31s %" SCNx64
                             " %x:%x %" SCNu64 " %n",
                       &start, &end, flags, &offset, &dev_major, &dev_minor,
                       &inode, &chars_scanned);
  DEBUG_ASSERT(8 /*number of info fields*/ == nparsed ||
               7 /*num fields if name is blank*/ == nparsed);

  // trim trailing newline, if any
  int last_char = strlen(line) - 1;
  if (line[last_char] == '\n') {
    line[last_char] = 0;
  }
  raw_line = line;

  const char* name = trim_leading_blanks(line + chars_scanned);
#if defined(__i386__)
  if (start > numeric_limits<uint32_t>::max() ||
      end > numeric_limits<uint32_t>::max() ||
      strcmp(name, "[vsyscall]") == 0) {
    // We manually read the exe link here because
    // this helper is used to set
    // |t->vm()->exe_image()|, so we can't rely on
    // that being correct yet.
    char proc_exe[PATH_MAX];
    char exe[PATH_MAX];
    snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", tid);
    ssize_t size = readlink(proc_exe, exe, sizeof(exe));
    if (size < 0) {
      FATAL() << "readlink failed";
    }
    FATAL() << "Sorry, tracee " << tid << " has x86-64 image " << exe
            << " and that's not supported with a 32-bit rr.";
  }
#endif
  int prot = (strchr(flags, 'r') ? PROT_READ : 0) |
             (strchr(flags, 'w') ? PROT_WRITE : 0) |
             (strchr(flags, 'x') ? PROT_EXEC : 0);
  int f = (strchr(flags, 'p') ? MAP_PRIVATE : 0) |
          (strchr(flags, 's') ? MAP_SHARED : 0);

  string tmp_name;
  if (strchr(name, '\\')) {
    // Unescape any '\012' sequences
    while (*name) {
      if (strncmp(name, "\\012", 4) == 0) {
        tmp_name.push_back('\n');
        name += 4;
      } else {
        tmp_name.push_back(*name);
        ++name;
      }
    }
    name = tmp_name.c_str();
  }
  km = KernelMapping(start, end, name, MKDEV(dev_major, dev_minor), inode,
                     prot, f, offset);
}

static KernelMapping read_kernel_mapping(pid_t tid, remote_ptr<void> addr) {
  MemoryRange range(addr, 1);
  for (KernelMapIterator it(tid); !it.at_end(); ++it) {
    const KernelMapping& km = it.current();
    if (km.contains(range)) {
      return km;
    }
  }
  return KernelMapping();
}

KernelMapping AddressSpace::read_kernel_mapping(Task* t, remote_ptr<void> addr) {
  return rr::read_kernel_mapping(t->tid, addr);
}

KernelMapping AddressSpace::read_local_kernel_mapping(uint8_t* addr) {
  return rr::read_kernel_mapping(getpid(), remote_ptr<void>((uintptr_t)addr));
}

/**
 * Cat the /proc/[t->tid]/maps file to stdout, line by line.
 */
void AddressSpace::print_process_maps(Task* t) {
  for (KernelMapIterator it(t); !it.at_end(); ++it) {
    string line;
    it.current(&line);
    cerr << line << '\n';
  }
}

AddressSpace::Mapping::Mapping(const KernelMapping& map,
                               const KernelMapping& recorded_map,
                               EmuFile::shr_ptr emu_file,
                               std::unique_ptr<struct stat> mapped_file_stat,
                               void* local_addr,
                               shared_ptr<MonitoredSharedMemory>&& monitored)
    : map(map),
      recorded_map(recorded_map),
      emu_file(emu_file),
      mapped_file_stat(move(mapped_file_stat)),
      local_addr(static_cast<uint8_t*>(local_addr)),
      monitored_shared_memory(move(monitored)),
      flags(FLAG_NONE) {}

static unique_ptr<struct stat> clone_stat(
    const unique_ptr<struct stat>& other) {
  return other ? unique_ptr<struct stat>(new struct stat(*other)) : nullptr;
}

AddressSpace::Mapping::Mapping(const Mapping& other)
    : map(other.map),
      recorded_map(other.recorded_map),
      emu_file(other.emu_file),
      mapped_file_stat(clone_stat(other.mapped_file_stat)),
      local_addr(other.local_addr),
      monitored_shared_memory(other.monitored_shared_memory),
      flags(other.flags) {}

AddressSpace::Mapping::~Mapping() {}

AddressSpace::~AddressSpace() {
  for (auto& m : mem) {
    if (m.second.local_addr) {
      int ret = munmap(m.second.local_addr, m.second.map.size());
      if (ret < 0) {
        FATAL() << "Can't munmap";
      }
    }
  }
  session_->on_destroy(this);
}

void AddressSpace::after_clone() { allocate_watchpoints(); }

static uint32_t find_offset_of_syscall_instruction_in(SupportedArch arch,
                                                      uint8_t* vdso_data,
                                                      size_t vdso_len) {
  auto instruction = syscall_instruction(arch);
  for (uint32_t i = 1; i < vdso_len - instruction.size(); ++i) {
    if (memcmp(vdso_data + i, instruction.data(), instruction.size()) == 0) {
      return i;
    }
  }
  return 0;
}

uint32_t AddressSpace::offset_to_syscall_in_vdso[SupportedArch_MAX + 1];

remote_code_ptr AddressSpace::find_syscall_instruction(Task* t) {
  SupportedArch arch = t->arch();
  // This assert passes even if --unmap-vdso is passed because this only ever
  // gets called at the start of process_execve before we unmap the vdso.
  // After the rr page is mapped in, we use the syscall instructions contained
  // therein.
  ASSERT(t, has_vdso()) << "Kernel with vDSO disabled?";
  if (!offset_to_syscall_in_vdso[arch]) {
    auto vdso_data =
        t->read_mem(vdso().start().cast<uint8_t>(), vdso().size());
    offset_to_syscall_in_vdso[arch] = find_offset_of_syscall_instruction_in(
        arch, vdso_data.data(), vdso_data.size());
    ASSERT(t, offset_to_syscall_in_vdso[arch])
        << "No syscall instruction found in VDSO";
  }
  return remote_code_ptr(
      (vdso().start().cast<uint8_t>() + offset_to_syscall_in_vdso[arch])
          .as_int());
}

void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) {
  int prot = PROT_EXEC | PROT_READ;
  int flags = MAP_PRIVATE | MAP_FIXED;

  string file_name;
  Task* t = remote.task();
  SupportedArch arch = t->arch();

  const char *fname;
  switch (t->arch()) {
    case x86_64:
    case aarch64:
      fname = RRPAGE_LIB_FILENAME;
      break;
    case x86:
#if defined(__x86_64__)
      fname = RRPAGE_LIB_FILENAME_32;
#else
      fname = RRPAGE_LIB_FILENAME;
#endif
      break;
  }

  string path = find_helper_library(fname);
  if (path.empty()) {
    FATAL() << "Failed to locate " << fname;
  }
  path += fname;

  size_t offset_pages = t->session().is_recording() ? RRPAGE_RECORD_PAGE_OFFSET
                                                    : RRPAGE_REPLAY_PAGE_OFFSET;
  size_t offset_bytes = offset_pages*rr_page_size();

  {
    ScopedFd page(path.c_str(), O_RDONLY);
    ASSERT(t, page.is_open()) << "Failed to open rrpage library " << path;
    long child_fd = remote.send_fd(page.get());
    ASSERT(t, child_fd >= 0);

    if (t->session().is_recording()) {
      remote.infallible_mmap_syscall(rr_page_start() - offset_bytes,
                                     offset_bytes, prot, flags, child_fd, 0);
    }

    remote.infallible_mmap_syscall(rr_page_start(), rr_page_size(), prot,
                                   flags, child_fd, offset_pages);

    struct stat fstat = t->stat_fd(child_fd);
    file_name = t->file_name_of_fd(child_fd);

    remote.infallible_syscall(syscall_number_for_close(arch), child_fd);

    map(t, rr_page_start(), rr_page_size(), prot, flags,
        offset_pages * page_size(), file_name, fstat.st_dev, fstat.st_ino);
    mapping_flags_of(rr_page_start()) = Mapping::IS_RR_PAGE;

    if (t->session().is_recording()) {
      map(t, rr_page_start() - offset_bytes, offset_bytes, prot, flags, 0,
          file_name, fstat.st_dev, fstat.st_ino);
    }
  }

  if (t->session().is_recording()) {
    // brk() will not have been called yet so the brk area is empty.
    brk_start = brk_end =
        remote.infallible_syscall(syscall_number_for_brk(arch), 0);
    ASSERT(t, !brk_end.is_null());
  }

  traced_syscall_ip_ = rr_page_syscall_entry_point(
      TRACED, UNPRIVILEGED, RECORDING_AND_REPLAY, t->arch());
  privileged_traced_syscall_ip_ = rr_page_syscall_entry_point(
      TRACED, PRIVILEGED, RECORDING_AND_REPLAY, t->arch());
}

void AddressSpace::unmap_all_but_rr_page(AutoRemoteSyscalls& remote) {
  vector<KernelMapping> unmaps;
  for (const auto& m : maps()) {
    // Do not attempt to unmap [vsyscall] --- it doesn't work.
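    // (The kernel maps [vsyscall] at a fixed address in every process and
    // rejects munmap/mremap on it, so we have to leave it in place.)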
    if (m.map.start() != AddressSpace::rr_page_start() &&
        m.map.start() != AddressSpace::preload_thread_locals_start() &&
        !m.map.is_vsyscall()) {
      unmaps.push_back(m.map);
    }
  }
  for (auto& m : unmaps) {
    remote.infallible_syscall(syscall_number_for_munmap(remote.task()->arch()),
                              m.start(), m.size());
    unmap(remote.task(), m.start(), m.size());
  }
}

/**
 * Must match generate_rr_page.py
 */
static const AddressSpace::SyscallType entry_points[] = {
  { AddressSpace::TRACED, AddressSpace::UNPRIVILEGED,
    AddressSpace::RECORDING_AND_REPLAY },
  { AddressSpace::TRACED, AddressSpace::PRIVILEGED,
    AddressSpace::RECORDING_AND_REPLAY },
  { AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
    AddressSpace::RECORDING_AND_REPLAY },
  { AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
    AddressSpace::REPLAY_ONLY },
  { AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
    AddressSpace::RECORDING_ONLY },
  { AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
    AddressSpace::RECORDING_AND_REPLAY },
  { AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
    AddressSpace::REPLAY_ONLY },
  { AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
    AddressSpace::RECORDING_ONLY },
  { AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
    AddressSpace::REPLAY_ASSIST },
};

static int rr_page_syscall_stub_size(SupportedArch arch) {
  int val = 0;
  switch (arch) {
    case x86:
    case x86_64:
      val = 3;
      break;
    case aarch64:
      val = 8;
      break;
    default:
      FATAL() << "Syscall stub size not defined for this architecture";
  }
  if (arch == NativeArch::arch()) {
    DEBUG_ASSERT(val == RR_PAGE_SYSCALL_STUB_SIZE);
  }
  return val;
}

static int rr_page_syscall_instruction_end(SupportedArch arch) {
  int val = 0;
  switch (arch) {
    case x86:
    case x86_64:
      val = 2;
      break;
    case aarch64:
      val = 4;
      break;
    default:
      FATAL() << "Syscall stub size not defined for this architecture";
  }
  if (arch == NativeArch::arch()) {
    DEBUG_ASSERT(val == RR_PAGE_SYSCALL_INSTRUCTION_END);
  }
  return val;
}

static remote_code_ptr entry_ip_from_index(SupportedArch arch, size_t i) {
  return remote_code_ptr(RR_PAGE_ADDR + rr_page_syscall_stub_size(arch) * i);
}

static remote_code_ptr exit_ip_from_index(SupportedArch arch, size_t i) {
  return remote_code_ptr(RR_PAGE_ADDR + rr_page_syscall_stub_size(arch) * i +
                         rr_page_syscall_instruction_end(arch));
}

remote_code_ptr AddressSpace::rr_page_syscall_exit_point(Traced traced,
                                                         Privileged privileged,
                                                         Enabled enabled,
                                                         SupportedArch arch) {
  for (auto& e : entry_points) {
    if (e.traced == traced && e.privileged == privileged &&
        e.enabled == enabled) {
      return exit_ip_from_index(arch, &e - entry_points);
    }
  }
  return nullptr;
}

remote_code_ptr AddressSpace::rr_page_syscall_entry_point(Traced traced,
                                                          Privileged privileged,
                                                          Enabled enabled,
                                                          SupportedArch arch) {
  for (auto& e : entry_points) {
    if (e.traced == traced && e.privileged == privileged &&
        e.enabled == enabled) {
      return entry_ip_from_index(arch, &e - entry_points);
    }
  }
  return nullptr;
}

const AddressSpace::SyscallType* AddressSpace::rr_page_syscall_from_exit_point(
    SupportedArch arch, remote_code_ptr ip) {
  for (size_t i = 0; i < array_length(entry_points); ++i) {
    if (exit_ip_from_index(arch, i) == ip) {
      return &entry_points[i];
    }
  }
  return nullptr;
}

const AddressSpace::SyscallType* AddressSpace::rr_page_syscall_from_entry_point(
    SupportedArch arch, remote_code_ptr ip) {
  for (size_t i = 0; i < array_length(entry_points); ++i) {
    if (entry_ip_from_index(arch, i) == ip) {
      return &entry_points[i];
    }
  }
  return nullptr;
}

vector<AddressSpace::SyscallType> AddressSpace::rr_page_syscalls() {
  vector<SyscallType> result;
  for (auto& e : entry_points) {
    result.push_back(e);
  }
  return result;
}

void
AddressSpace::save_auxv(Task* t) {
  saved_auxv_ = read_auxv(t);
  save_interpreter_base(t, saved_auxv());
}

void AddressSpace::save_interpreter_base(Task* t, std::vector<uint8_t> auxv) {
  saved_interpreter_base_ = read_interpreter_base(auxv);
  save_ld_path(t, saved_interpreter_base());
}

void AddressSpace::save_ld_path(Task* t, remote_ptr<void> interpreter_base) {
  saved_ld_path_ = read_ld_path(t, interpreter_base);
}

void AddressSpace::read_mm_map(Task* t, NativeArch::prctl_mm_map* map) {
  char buf[PATH_MAX+1024];
  {
    string proc_stat = t->proc_stat_path();
    ScopedFd fd(proc_stat.c_str(), O_RDONLY);
    memset(buf, 0, sizeof(buf));
    int err = read_to_end(fd, 0, buf, sizeof(buf)-1);
    if (err < 0) {
      FATAL() << "Failed to read /proc/<pid>/stat";
    }
  }

  // The last close-paren indicates the end of the comm and the
  // start of the fixed-width area
  char* fixed = strrchr(buf, ')');

  // We don't change /proc/pid/exe, since we're unlikely to have CAP_SYS_ADMIN
  map->exe_fd = -1;

  // auxv is restored separately
  map->auxv.val = 0;
  map->auxv_size = 0;

  // All of these fields of /proc/pid/stat, we don't use (currently)
  char state;
  pid_t ppid;
  pid_t pgrp;
  int session;
  int tty_nr;
  int tpgid;
  unsigned int flags;
  unsigned long minflt, cminflt, majflt, cmajflt, utime, stime;
  long cutime, cstime, priority, nice, num_threads, itrealvalue;
  unsigned long long starttime;
  unsigned long vsize;
  long rss;
  unsigned long rsslim, kstkesp, kstkeip, signal;
  unsigned long blocked, sigignore, sigcatch, wchan, nswap, cnswap;
  int exit_signal, processor;
  unsigned int rt_priority, policy;
  unsigned long long delayacct_blkio_ticks;
  unsigned long guest_time;
  long cguest_time;
  int exit_code;

  // See the proc(5) man page for the correct scan codes for these
  size_t n = sscanf(fixed + 1,
    // state ppid pgrp session tty_nr tpgid
    " %c %d %d %d %d %d"
    // flags minflt cminflt majflt cmajflt utime stime cutime cstime
    " %u %lu %lu %lu %lu %lu %lu %ld %ld"
    // priority nice num_threads itrealvalue starttime vsize rss
    " %ld %ld %ld %ld %llu %lu %ld"
    // rsslim startcode endcode startstack kstkesp kstkeip signal
    " %lu %lu %lu %lu %lu %lu %lu"
    // blocked sigignore sigcatch wchan nswap cnswap exit_signal
    " %lu %lu %lu %lu %lu %lu %d"
    // processor rt_priority policy delayacct_blkio_ticks guest_time cguest_time
    " %d %u %u %llu %lu %ld "
    // start_data end_data start_brk arg_start arg_end env_start env_end exit_code
    " %lu %lu %lu %lu %lu %lu %lu %d",
    &state, &ppid, &pgrp, &session, &tty_nr, &tpgid,
    &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime, &cutime, &cstime,
    &priority, &nice, &num_threads, &itrealvalue, &starttime, &vsize, &rss,
    &rsslim, (unsigned long *)&map->start_code, (unsigned long *)&map->end_code,
    (unsigned long *)&map->start_stack, &kstkesp, &kstkeip, &signal,
    &blocked, &sigignore, &sigcatch, &wchan, &nswap, &cnswap, &exit_signal,
    &processor, &rt_priority, &policy, &delayacct_blkio_ticks, &guest_time, &cguest_time,
    (unsigned long *)&map->start_data, (unsigned long *)&map->end_data,
    (unsigned long *)&map->start_brk, (unsigned long *)&map->arg_start,
    (unsigned long *)&map->arg_end, (unsigned long *)&map->env_start,
    (unsigned long *)&map->env_end, &exit_code);
  ASSERT(t, n == 50);

  // Fill in brk end
  ASSERT(t, map->start_brk == this->brk_start.as_int());
  map->brk = this->brk_end.as_int();
}

void AddressSpace::post_exec_syscall(Task* t) {
  // First locate a syscall instruction we can use for remote syscalls.
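  // (Right after exec, the rr page is not mapped yet, so the only syscall
  // instruction at a known location is the one in the kernel-provided vDSO;
  // find_syscall_instruction() below searches the vDSO for it.)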
  traced_syscall_ip_ = find_syscall_instruction(t);
  privileged_traced_syscall_ip_ = nullptr;
  do_breakpoint_fault_addr_ = nullptr;
  stopping_breakpoint_table_ = nullptr;
  stopping_breakpoint_table_entry_size_ = 0;

  // Now remote syscalls work, we can open_mem_fd.
  t->open_mem_fd();

  // Set up AutoRemoteSyscalls again now that the mem-fd is open.
  AutoRemoteSyscalls remote(t);

  // Now we can set up the "rr page" at its fixed address. This gives
  // us traced and untraced syscall instructions at known, fixed addresses.
  map_rr_page(remote);

  // Set up the preload_thread_locals shared area.
  t->session().create_shared_mmap(remote, PRELOAD_THREAD_LOCALS_SIZE,
                                  preload_thread_locals_start(),
                                  "preload_thread_locals");
  mapping_flags_of(preload_thread_locals_start()) |=
      AddressSpace::Mapping::IS_THREAD_LOCALS;
}

void AddressSpace::brk(Task* t, remote_ptr<void> addr, int prot) {
  LOG(debug) << "brk(" << addr << ")";

  remote_ptr<void> old_brk = ceil_page_size(brk_end);
  remote_ptr<void> new_brk = ceil_page_size(addr);
  if (old_brk < new_brk) {
    map(t, old_brk, new_brk - old_brk, prot, MAP_ANONYMOUS | MAP_PRIVATE, 0,
        "[heap]");
  } else {
    unmap(t, new_brk, old_brk - new_brk);
  }
  brk_end = addr;
}

static const char* stringify_flags(int flags) {
  switch (flags) {
    case AddressSpace::Mapping::FLAG_NONE:
      return "";
    case AddressSpace::Mapping::IS_SYSCALLBUF:
      return " [syscallbuf]";
    case AddressSpace::Mapping::IS_THREAD_LOCALS:
      return " [thread_locals]";
    case AddressSpace::Mapping::IS_PATCH_STUBS:
      return " [patch_stubs]";
    default:
      return "[unknown_flags]";
  }
}

void AddressSpace::dump() const {
  fprintf(stderr, " (heap: %p-%p)\n", (void*)brk_start.as_int(),
          (void*)brk_end.as_int());
  for (auto it = mem.begin(); it != mem.end(); ++it) {
    const KernelMapping& m = it->second.map;
    fprintf(stderr, "%s%s\n", m.str().c_str(),
            stringify_flags(it->second.flags));
  }
}

SupportedArch AddressSpace::arch() const {
  return (*task_set().begin())->arch();
}

BreakpointType AddressSpace::get_breakpoint_type_for_retired_insn(
    remote_code_ptr ip) {
  remote_code_ptr addr = ip.undo_executed_bkpt(arch());
  return get_breakpoint_type_at_addr(addr);
}

BreakpointType AddressSpace::get_breakpoint_type_at_addr(remote_code_ptr addr) {
  auto it = breakpoints.find(addr);
  return it == breakpoints.end() ?
      BKPT_NONE : it->second.type();
}

bool AddressSpace::is_breakpoint_in_private_read_only_memory(
    remote_code_ptr addr) {
  for (const auto& m : maps_containing_or_after(addr.to_data_ptr<void>())) {
    if (m.map.start() >=
        addr.increment_by_bkpt_insn_length(arch()).to_data_ptr<void>()) {
      break;
    }
    if ((m.map.prot() & PROT_WRITE) || (m.map.flags() & MAP_SHARED)) {
      return false;
    }
  }
  return true;
}

void AddressSpace::replace_breakpoints_with_original_values(
    uint8_t* dest, size_t length, remote_ptr<uint8_t> addr) {
  for (auto& it : breakpoints) {
    remote_ptr<uint8_t> bkpt_location = it.first.to_data_ptr<uint8_t>();
    remote_ptr<uint8_t> start = max(addr, bkpt_location);
    remote_ptr<uint8_t> end =
        min(addr + length, bkpt_location + bkpt_instruction_length(arch()));
    if (start < end) {
      memcpy(dest + (start - addr),
             it.second.original_data() + (start - bkpt_location),
             end - start);
    }
  }
}

bool AddressSpace::is_breakpoint_instruction(Task* t, remote_code_ptr ip) {
  bool ok = true;
  uint8_t data[MAX_BKPT_INSTRUCTION_LENGTH];
  t->read_bytes_helper(ip.to_data_ptr<uint8_t>(),
                       bkpt_instruction_length(t->arch()), data, &ok);
  return memcmp(data, breakpoint_insn(t->arch()),
                bkpt_instruction_length(t->arch())) == 0 && ok;
}

static void remove_range(set<MemoryRange>& ranges, const MemoryRange& range) {
  if (ranges.empty()) {
    return;
  }
  auto start = ranges.lower_bound(range);
  // An earlier range might extend into range, so check for that.
  if (start != ranges.begin()) {
    --start;
    if (start->end() <= range.start()) {
      ++start;
    }
  }
  auto end = start;
  auto prev_end = start;
  while (end != ranges.end() && end->start() < range.end()) {
    prev_end = end;
    ++end;
  }
  if (start == end) {
    return;
  }
  MemoryRange start_range = *start;
  MemoryRange end_range = *prev_end;
  ranges.erase(start, end);
  if (start_range.start() < range.start()) {
    ranges.insert(MemoryRange(start_range.start(), range.start()));
  }
  if (range.end() < end_range.end()) {
    ranges.insert(MemoryRange(range.end(), end_range.end()));
  }
}

static void add_range(set<MemoryRange>& ranges, const MemoryRange& range) {
  // Remove overlapping ranges
  remove_range(ranges, range);
  ranges.insert(range);
  // We could coalesce adjacent ranges, but there's probably no need.
}

KernelMapping AddressSpace::map(Task* t, remote_ptr<void> addr,
                                size_t num_bytes, int prot, int flags,
                                off64_t offset_bytes, const string& fsname,
                                dev_t device, ino_t inode,
                                unique_ptr<struct stat> mapped_file_stat,
                                const KernelMapping* recorded_map,
                                EmuFile::shr_ptr emu_file, void* local_addr,
                                shared_ptr<MonitoredSharedMemory>&& monitored) {
  LOG(debug) << "mmap(" << addr << ", " << num_bytes << ", " << HEX(prot)
             << ", " << HEX(flags) << ", " << HEX(offset_bytes) << ")";
  num_bytes = ceil_page_size(num_bytes);
  KernelMapping m(addr, addr + num_bytes, fsname, device, inode, prot, flags,
                  offset_bytes);
  if (!num_bytes) {
    return m;
  }

  remove_range(dont_fork, MemoryRange(addr, num_bytes));
  remove_range(wipe_on_fork, MemoryRange(addr, num_bytes));

  // The mmap() man page doesn't specifically describe
  // what should happen if an existing map is
  // "overwritten" by a new map (of the same resource).
  // In testing, the behavior seems to be as if the
  // overlapping region is unmapped and then remapped
  // per the arguments to the second call.
  unmap_internal(t, addr, num_bytes);

  const KernelMapping& actual_recorded_map = recorded_map ? *recorded_map : m;
  map_and_coalesce(t, m, actual_recorded_map, emu_file,
                   move(mapped_file_stat), move(local_addr), move(monitored));

  // During an emulated exec, we will explicitly map in a (copy of) the VDSO
  // at the recorded address.
  if (actual_recorded_map.is_vdso()) {
    vdso_start_addr = addr;
  }
  return m;
}

template <typename Arch>
void AddressSpace::at_preload_init_arch(Task* t) {
  auto params = t->read_mem(
      remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().arg1()));

  if (t->session().is_recording()) {
    ASSERT(t, t->session().as_record()->use_syscall_buffer() ==
                  params.syscallbuf_enabled)
        << "Tracee thinks syscallbuf is "
        << (params.syscallbuf_enabled ? "en" : "dis")
        << "abled, but tracer thinks "
        << (t->session().as_record()->use_syscall_buffer() ? "en" : "dis")
        << "abled";
  } else {
    if (params.breakpoint_table_entry_size == -1) {
      do_breakpoint_fault_addr_ = params.breakpoint_instr_addr.rptr().as_int();
    } else {
      stopping_breakpoint_table_ = params.breakpoint_table.rptr().as_int();
      stopping_breakpoint_table_entry_size_ = params.breakpoint_table_entry_size;
    }
  }

  if (!params.syscallbuf_enabled) {
    return;
  }
  syscallbuf_enabled_ = true;

  if (t->session().is_recording()) {
    monkeypatch_state->patch_at_preload_init(static_cast<RecordTask*>(t));
  }
}

void AddressSpace::at_preload_init(Task* t) {
  RR_ARCH_FUNCTION(at_preload_init_arch, t->arch(), t);
}

const AddressSpace::Mapping& AddressSpace::mapping_of(
    remote_ptr<void> addr) const {
  MemoryRange range(floor_page_size(addr), 1);
  auto it = mem.find(range);
  DEBUG_ASSERT(it != mem.end());
  DEBUG_ASSERT(it->second.map.contains(range));
  return it->second;
}

uint32_t& AddressSpace::mapping_flags_of(remote_ptr<void> addr) {
  return const_cast<AddressSpace::Mapping&>(
             static_cast<const AddressSpace*>(this)->mapping_of(addr))
      .flags;
}

uint8_t* AddressSpace::local_mapping(remote_ptr<void> addr, size_t size) {
  MemoryRange range(floor_page_size(addr), 1);
  auto it = mem.find(range);
  if (it == mem.end()) {
    return nullptr;
  }
  DEBUG_ASSERT(it->second.map.contains(range));
  const Mapping& map = it->second;
  // Fall back to the slow path if we can't get the entire region
  if (size > static_cast<size_t>(map.map.end() - addr)) {
    return nullptr;
  }
  if (map.local_addr != nullptr) {
    size_t offset = addr - map.map.start();
    return static_cast<uint8_t*>(map.local_addr) + offset;
  }
  return nullptr;
}

void* AddressSpace::detach_local_mapping(remote_ptr<void> addr) {
  auto m = const_cast<Mapping&>(mapping_of(addr));
  void* p = m.local_addr;
  m.local_addr = nullptr;
  return p;
}

bool AddressSpace::has_mapping(remote_ptr<void> addr) const {
  if (addr + page_size() < addr) {
    // Assume the last byte in the address space is never mapped; avoid overflow
    return false;
  }
  MemoryRange m(floor_page_size(addr), 1);
  auto it = mem.find(m);
  return it != mem.end() && it->first.contains(m);
}

bool AddressSpace::has_rr_page() const {
  MemoryRange m(RR_PAGE_ADDR, 1);
  auto it = mem.find(m);
  return it != mem.end() && (it->second.flags & Mapping::IS_RR_PAGE);
}

void AddressSpace::protect(Task* t, remote_ptr<void> addr, size_t num_bytes,
                           int prot) {
  LOG(debug) << "mprotect(" << addr << ", " << num_bytes << ", " << HEX(prot)
             << ")";

  MemoryRange last_overlap;
  auto protector = [this, prot, &last_overlap](const Mapping& mm,
                                               const MemoryRange& rem) {
    LOG(debug) << " protecting (" << rem << ") ...";

    Mapping m = move(mm);
    remove_from_map(m.map);

    // PROT_GROWSDOWN means that if this is a grows-down segment
    // (which for us means "stack") then the change should be
    // extended to the start of the segment.
    // We don't try to handle the analogous PROT_GROWSUP, because we
    // don't understand the idea of a grows-up segment.
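    // (Worked example, hypothetical addresses: if [0x1000, 0x9000) is a
    // MAP_GROWSDOWN stack and the tracee calls
    // mprotect(0x5000, 0x1000, PROT_NONE | PROT_GROWSDOWN), the kernel
    // applies PROT_NONE from the segment start, i.e. to [0x1000, 0x6000),
    // which is why new_start below is pulled back to m.map.start().)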
    remote_ptr<void> new_start;
    if ((m.map.start() < rem.start()) && (prot & PROT_GROWSDOWN)) {
      new_start = m.map.start();
      LOG(debug) << " PROT_GROWSDOWN: expanded region down to " << new_start;
    } else {
      new_start = rem.start();
    }
    LOG(debug) << " erased (" << m.map << ")";

    // If the first segment we protect underflows the
    // region, remap the underflow region with previous
    // prot.
    auto monitored = m.monitored_shared_memory;
    if (m.map.start() < new_start) {
      Mapping underflow(
          m.map.subrange(m.map.start(), rem.start()),
          m.recorded_map.subrange(m.recorded_map.start(), rem.start()),
          m.emu_file, clone_stat(m.mapped_file_stat), m.local_addr,
          move(monitored));
      underflow.flags = m.flags;
      add_to_map(underflow);
    }
    // Remap the overlapping region with the new prot.
    remote_ptr<void> new_end = min(rem.end(), m.map.end());

    int new_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
    Mapping overlap(
        m.map.subrange(new_start, new_end).set_prot(new_prot),
        m.recorded_map.subrange(new_start, new_end).set_prot(new_prot),
        m.emu_file, clone_stat(m.mapped_file_stat),
        m.local_addr ? m.local_addr + (new_start - m.map.start()) : 0,
        m.monitored_shared_memory
            ? m.monitored_shared_memory->subrange(new_start - m.map.start(),
                                                  new_end - new_start)
            : nullptr);
    overlap.flags = m.flags;
    add_to_map(overlap);
    last_overlap = overlap.map;

    // If the last segment we protect overflows the
    // region, remap the overflow region with previous
    // prot.
    if (rem.end() < m.map.end()) {
      Mapping overflow(
          m.map.subrange(rem.end(), m.map.end()),
          m.recorded_map.subrange(rem.end(), m.map.end()), m.emu_file,
          clone_stat(m.mapped_file_stat),
          m.local_addr ? m.local_addr + (rem.end() - m.map.start()) : 0,
          m.monitored_shared_memory
              ? m.monitored_shared_memory->subrange(rem.end() - m.map.start(),
                                                    m.map.end() - rem.end())
              : nullptr);
      overflow.flags = m.flags;
      add_to_map(overflow);
    }
  };
  for_each_in_range(addr, num_bytes, protector, ITERATE_CONTIGUOUS);
  if (last_overlap.size()) {
    // All mappings that we altered which might need coalescing
    // are adjacent to |last_overlap|.
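    // (coalesce_around() merges the pieces created above back with any
    // adjacent mappings that share the same attributes, keeping `mem`
    // in a canonical, maximally-coalesced form.)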
    coalesce_around(t, mem.find(last_overlap));
  }
}

void AddressSpace::fixup_mprotect_growsdown_parameters(Task* t) {
  ASSERT(t, !(t->regs().arg3() & PROT_GROWSUP));
  if (t->regs().arg3() & PROT_GROWSDOWN) {
    Registers r = t->regs();
    if (r.arg1() == floor_page_size(r.arg1()) && has_mapping(r.arg1())) {
      auto& km = mapping_of(r.arg1()).map;
      if (km.flags() & MAP_GROWSDOWN) {
        auto new_start = km.start();
        r.set_arg2(remote_ptr<void>(r.arg1()) + size_t(r.arg2()) - new_start);
        r.set_arg1(new_start);
        r.set_arg3(r.arg3() & ~PROT_GROWSDOWN);
        t->set_regs(r);
      }
    }
  }
}

void AddressSpace::remap(Task* t, remote_ptr<void> old_addr,
                         size_t old_num_bytes, remote_ptr<void> new_addr,
                         size_t new_num_bytes) {
  LOG(debug) << "mremap(" << old_addr << ", " << old_num_bytes << ", "
             << new_addr << ", " << new_num_bytes << ")";
  old_num_bytes = ceil_page_size(old_num_bytes);

  Mapping mr = mapping_of(old_addr);
  DEBUG_ASSERT(!mr.monitored_shared_memory);
  KernelMapping km =
      mr.map.subrange(old_addr, min(mr.map.end(), old_addr + old_num_bytes));

  unmap_internal(t, old_addr, old_num_bytes);
  if (0 == new_num_bytes) {
    return;
  }
  new_num_bytes = ceil_page_size(new_num_bytes);

  auto it = dont_fork.lower_bound(MemoryRange(old_addr, old_num_bytes));
  if (it != dont_fork.end() && it->start() < old_addr + old_num_bytes) {
    // mremap fails if some but not all pages are marked DONTFORK
    DEBUG_ASSERT(*it == MemoryRange(old_addr, old_num_bytes));
    remove_range(dont_fork, MemoryRange(old_addr, old_num_bytes));
    add_range(dont_fork, MemoryRange(new_addr, new_num_bytes));
  } else {
    remove_range(dont_fork, MemoryRange(old_addr, old_num_bytes));
    remove_range(dont_fork, MemoryRange(new_addr, new_num_bytes));
  }
  it = wipe_on_fork.lower_bound(MemoryRange(old_addr, old_num_bytes));
  if (it != wipe_on_fork.end() && it->start() < old_addr + old_num_bytes) {
    // hopefully mremap fails if some but not all pages are marked WIPEONFORK
    DEBUG_ASSERT(*it == MemoryRange(old_addr, old_num_bytes));
    remove_range(wipe_on_fork, MemoryRange(old_addr, old_num_bytes));
    add_range(wipe_on_fork, MemoryRange(new_addr, new_num_bytes));
  } else {
    remove_range(wipe_on_fork, MemoryRange(old_addr, old_num_bytes));
    remove_range(wipe_on_fork, MemoryRange(new_addr, new_num_bytes));
  }

  unmap_internal(t, new_addr, new_num_bytes);

  remote_ptr<void> new_end = new_addr + new_num_bytes;
  map_and_coalesce(t, km.set_range(new_addr, new_end),
                   mr.recorded_map.set_range(new_addr, new_end), mr.emu_file,
                   clone_stat(mr.mapped_file_stat), nullptr, nullptr);
}

void AddressSpace::remove_breakpoint(remote_code_ptr addr,
                                     BreakpointType type) {
  auto it = breakpoints.find(addr);
  if (it == breakpoints.end() || it->second.unref(type) > 0) {
    return;
  }
  destroy_breakpoint(it);
}

bool AddressSpace::add_breakpoint(remote_code_ptr addr, BreakpointType type) {
  auto it = breakpoints.find(addr);
  if (it == breakpoints.end()) {
    uint8_t overwritten_data[MAX_BKPT_INSTRUCTION_LENGTH];
    ssize_t bkpt_size = bkpt_instruction_length(arch());
    // Grab a random task from the VM so we can use its
    // read/write_mem() helpers.
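    // (Every task in this AddressSpace shares the same memory map, so any
    // running task works equally well here.)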
    Task* t = first_running_task();
    if (!t || bkpt_size != t->read_bytes_fallible(addr.to_data_ptr<uint8_t>(),
                                                  bkpt_size,
                                                  overwritten_data)) {
      return false;
    }
    t->write_bytes_helper(addr.to_data_ptr<uint8_t>(), bkpt_size,
                          breakpoint_insn(arch()), nullptr,
                          Task::IS_BREAKPOINT_RELATED);

    auto it_and_is_new = breakpoints.insert(make_pair(addr, Breakpoint()));
    DEBUG_ASSERT(it_and_is_new.second);
    memcpy(it_and_is_new.first->second.overwritten_data, overwritten_data,
           sizeof(overwritten_data));
    it = it_and_is_new.first;
  }
  it->second.ref(type);
  return true;
}

void AddressSpace::remove_all_breakpoints() {
  while (!breakpoints.empty()) {
    destroy_breakpoint(breakpoints.begin());
  }
}

void AddressSpace::suspend_breakpoint_at(remote_code_ptr addr) {
  auto it = breakpoints.find(addr);
  if (it != breakpoints.end()) {
    Task* t = first_running_task();
    if (t) {
      t->write_bytes_helper(addr.to_data_ptr<uint8_t>(),
                            bkpt_instruction_length(arch()),
                            it->second.overwritten_data);
    }
  }
}

void AddressSpace::restore_breakpoint_at(remote_code_ptr addr) {
  auto it = breakpoints.find(addr);
  if (it != breakpoints.end()) {
    Task* t = first_running_task();
    if (t) {
      t->write_bytes_helper(addr.to_data_ptr<uint8_t>(),
                            bkpt_instruction_length(arch()),
                            breakpoint_insn(arch()));
    }
  }
}

int AddressSpace::access_bits_of(WatchType type) {
  switch (type) {
    case WATCH_EXEC:
      return EXEC_BIT;
    case WATCH_WRITE:
      return WRITE_BIT;
    case WATCH_READWRITE:
      return READ_BIT | WRITE_BIT;
    default:
      FATAL() << "Unknown watchpoint type " << type;
      return 0; // not reached
  }
}

/**
 * We do not allow a watchpoint to watch the last byte of memory addressable
 * by rr. This avoids constructing a MemoryRange that wraps around.
 * For 64-bit builds this is no problem because addresses at the top of memory
 * are in kernel space. For 32-bit builds it seems impossible to map the last
 * page of memory in Linux so we should be OK there too.
 * Note that zero-length watchpoints are OK. configure_watch_registers just
 * ignores them.
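 * (Worked example on 32-bit: a request to watch 4 bytes at 0xfffffffe is
 * clamped by range_for_watchpoint() below to min(4, UINTPTR_MAX - 0xfffffffe)
 * = 1 byte, so the resulting MemoryRange never wraps past the end of the
 * address space.)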
 */
static MemoryRange range_for_watchpoint(remote_ptr<void> addr,
                                        size_t num_bytes) {
  uintptr_t p = addr.as_int();
  uintptr_t max_len = UINTPTR_MAX - p;
  return MemoryRange(addr, min(num_bytes, max_len));
}

void AddressSpace::remove_watchpoint(remote_ptr<void> addr, size_t num_bytes,
                                     WatchType type) {
  auto it = watchpoints.find(range_for_watchpoint(addr, num_bytes));
  if (it != watchpoints.end() &&
      0 == it->second.unwatch(access_bits_of(type))) {
    watchpoints.erase(it);
  }
  allocate_watchpoints();
}

bool AddressSpace::add_watchpoint(remote_ptr<void> addr, size_t num_bytes,
                                  WatchType type) {
  MemoryRange key = range_for_watchpoint(addr, num_bytes);
  auto it = watchpoints.find(key);
  if (it == watchpoints.end()) {
    auto it_and_is_new =
        watchpoints.insert(make_pair(key, Watchpoint(num_bytes)));
    DEBUG_ASSERT(it_and_is_new.second);
    it = it_and_is_new.first;
    update_watchpoint_value(it->first, it->second);
  }
  it->second.watch(access_bits_of(type));
  return allocate_watchpoints();
}

void AddressSpace::save_watchpoints() {
  saved_watchpoints.push_back(watchpoints);
}

bool AddressSpace::restore_watchpoints() {
  DEBUG_ASSERT(!saved_watchpoints.empty());
  watchpoints = saved_watchpoints[saved_watchpoints.size() - 1];
  saved_watchpoints.pop_back();
  return allocate_watchpoints();
}

bool AddressSpace::update_watchpoint_value(const MemoryRange& range,
                                           Watchpoint& watchpoint) {
  Task* t = first_running_task();
  if (!t) {
    return false;
  }
  bool valid = true;
  vector<uint8_t> value_bytes = watchpoint.value_bytes;
  for (size_t i = 0; i < value_bytes.size(); ++i) {
    value_bytes[i] = 0xFF;
  }
  remote_ptr<void> addr = range.start();
  size_t num_bytes = range.size();
  while (num_bytes > 0) {
    ssize_t bytes_read = t->read_bytes_fallible(
        addr, num_bytes, value_bytes.data() + (addr - range.start()));
    if (bytes_read <= 0) {
      valid = false;
      // advance to next page and try to read more. We want to know
      // when the valid part of a partially invalid watchpoint changes.
      bytes_read =
          min(num_bytes, (floor_page_size(addr) + page_size()) - addr);
    }
    addr += bytes_read;
    num_bytes -= bytes_read;
  }

  bool changed = valid != watchpoint.valid ||
                 memcmp(value_bytes.data(), watchpoint.value_bytes.data(),
                        value_bytes.size()) != 0;
  watchpoint.valid = valid;
  watchpoint.value_bytes = value_bytes;
  return changed;
}

void AddressSpace::update_watchpoint_values(remote_ptr<void> start,
                                            remote_ptr<void> end) {
  MemoryRange r(start, end);
  for (auto& it : watchpoints) {
    if (it.first.intersects(r) &&
        update_watchpoint_value(it.first, it.second)) {
      it.second.changed = true;
      // We do nothing to track kernel reads of read-write watchpoints...
    }
  }
}

static int DR_WATCHPOINT(int n) { return 1 << n; }

static bool watchpoint_triggered(uintptr_t debug_status,
                                 const vector<int8_t>& regs) {
  for (auto reg : regs) {
    if (debug_status & DR_WATCHPOINT(reg)) {
      return true;
    }
  }
  return false;
}

bool AddressSpace::notify_watchpoint_fired(uintptr_t debug_status,
    remote_ptr<void> hit_addr, remote_code_ptr address_of_singlestep_start) {
  bool triggered = false;
  for (auto& it : watchpoints) {
    // On Skylake/4.14.13-300.fc27.x86_64 at least, we have observed a
    // situation where singlestepping through the instruction before a hardware
    // execution watchpoint causes singlestep completion *and* also reports the
    // hardware execution watchpoint being triggered. The latter is incorrect.
    // This could be a HW issue or a kernel issue. Work around it by ignoring
    // triggered watchpoints that aren't on the instruction we just tried to
    // execute.
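    // Note that write watchpoints are confirmed by re-reading the watched
    // bytes and diffing them against the saved value (update_watchpoint_value
    // below), rather than by trusting the hardware status bits alone.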
bool write_triggered = (it.second.watched_bits() & WRITE_BIT) && update_watchpoint_value(it.first, it.second); // Depending on the architecture the hardware may indicate hit watchpoints // either by number, or by the address that triggered the watchpoint hit // - support either. bool read_triggered = false; bool exec_triggered = false; bool watchpoint_in_range = false; if (is_x86ish(arch())) { read_triggered = (it.second.watched_bits() & READ_BIT) && watchpoint_triggered(debug_status, it.second.debug_regs_for_exec_read); exec_triggered = (it.second.watched_bits() & EXEC_BIT) && (address_of_singlestep_start.is_null() || it.first.start() == address_of_singlestep_start.to_data_ptr()) && watchpoint_triggered(debug_status, it.second.debug_regs_for_exec_read); } else { watchpoint_in_range = it.first.contains(hit_addr); } if (write_triggered || read_triggered || exec_triggered || watchpoint_in_range) { it.second.changed = true; triggered = true; } } return triggered; } void AddressSpace::notify_written(remote_ptr addr, size_t num_bytes, uint32_t flags) { if (!(flags & Task::IS_BREAKPOINT_RELATED)) { update_watchpoint_values(addr, addr + num_bytes); } session()->accumulate_bytes_written(num_bytes); } void AddressSpace::remove_all_watchpoints() { watchpoints.clear(); allocate_watchpoints(); } void AddressSpace::unmap(Task* t, remote_ptr addr, ssize_t num_bytes) { LOG(debug) << "munmap(" << addr << ", " << num_bytes << ")"; num_bytes = ceil_page_size(num_bytes); if (!num_bytes) { return; } remove_range(dont_fork, MemoryRange(addr, num_bytes)); remove_range(wipe_on_fork, MemoryRange(addr, num_bytes)); return unmap_internal(t, addr, num_bytes); } void AddressSpace::unmap_internal(Task*, remote_ptr addr, ssize_t num_bytes) { LOG(debug) << "munmap(" << addr << ", " << num_bytes << ")"; auto unmapper = [this](const Mapping& mm, const MemoryRange& rem) { LOG(debug) << " unmapping (" << rem << ") ..."; Mapping m = move(mm); remove_from_map(m.map); LOG(debug) << " erased (" << m.map << ") ..."; // If the first segment we unmap underflows the unmap // region, remap the underflow region. auto monitored = m.monitored_shared_memory; if (m.map.start() < rem.start()) { Mapping underflow(m.map.subrange(m.map.start(), rem.start()), m.recorded_map.subrange(m.map.start(), rem.start()), m.emu_file, clone_stat(m.mapped_file_stat), m.local_addr, move(monitored)); underflow.flags = m.flags; add_to_map(underflow); } // If the last segment we unmap overflows the unmap // region, remap the overflow region. if (rem.end() < m.map.end()) { Mapping overflow( m.map.subrange(rem.end(), m.map.end()), m.recorded_map.subrange(rem.end(), m.map.end()), m.emu_file, clone_stat(m.mapped_file_stat), m.local_addr ? m.local_addr + (rem.end() - m.map.start()) : 0, m.monitored_shared_memory ? 
m.monitored_shared_memory->subrange(rem.end() - m.map.start(), m.map.end() - rem.end()) : nullptr); overflow.flags = m.flags; add_to_map(overflow); } if (m.local_addr) { auto addr = m.local_addr + (rem.start() - m.map.start()); auto size = std::min(rem.size(), m.map.size() - (rem.start() - m.map.start())); int ret = munmap(addr, size); if (ret < 0) { FATAL() << "Can't munmap"; } } }; for_each_in_range(addr, num_bytes, unmapper); update_watchpoint_values(addr, addr + num_bytes); } void AddressSpace::advise(Task*, remote_ptr addr, ssize_t num_bytes, int advice) { LOG(debug) << "madvise(" << addr << ", " << num_bytes << ", " << advice << ")"; num_bytes = ceil_page_size(num_bytes); switch (advice) { case MADV_DONTFORK: add_range(dont_fork, MemoryRange(addr, num_bytes)); break; case MADV_DOFORK: remove_range(dont_fork, MemoryRange(addr, num_bytes)); break; case MADV_WIPEONFORK: add_range(wipe_on_fork, MemoryRange(addr, num_bytes)); break; case MADV_KEEPONFORK: remove_range(wipe_on_fork, MemoryRange(addr, num_bytes)); break; default: break; } } void AddressSpace::did_fork_into(Task* t) { // MADV_WIPEONFORK is inherited across fork and cleared on exec. // We'll copy it here, then do the `dont_fork` unmappings, and then // whatever survives in the new AddressSpace's wipe_on_fork gets wiped. t->vm()->wipe_on_fork = wipe_on_fork; for (auto& range : dont_fork) { // During recording we execute MADV_DONTFORK so the forked child will // have had its dontfork areas unmapped by the kernel already if (!t->session().is_recording()) { AutoRemoteSyscalls remote(t); remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), range.start(), range.size()); } t->vm()->unmap(t, range.start(), range.size()); } // Any ranges that were dropped were unmapped (and thus removed from // wipe_on_fork), so now we can record anything that's left. for (auto& range : t->vm()->wipe_on_fork) { if (t->session().is_recording()) { // Record that these mappings were wiped. RecordTask* rt = static_cast(t); rt->record_remote(range); } } } static string strip_deleted(const string& s) { static const char deleted[] = " (deleted)"; ssize_t find_deleted = s.size() - (sizeof(deleted) - 1); if (s.find(deleted) == size_t(find_deleted)) { return s.substr(0, find_deleted); } return s; } string KernelMapping::fsname_strip_deleted() const { return strip_deleted(fsname_); } enum HandleHeap { TREAT_HEAP_AS_ANONYMOUS, RESPECT_HEAP }; static bool normalized_file_names_equal(const KernelMapping& km1, const KernelMapping& km2, HandleHeap handle_heap) { if (km1.is_stack() || km2.is_stack()) { // The kernel seems to use "[stack:]" for any mapping area containing // thread |tid|'s stack pointer. When the thread exits, the next read of // the maps doesn't treat the area as stack at all. We don't want to track // thread exits, so if one of the mappings is a stack, skip the name // comparison. Device and inode numbers will still be checked. return true; } if (handle_heap == TREAT_HEAP_AS_ANONYMOUS && (km1.is_heap() || km2.is_heap())) { // The kernel's heuristics for treating an anonymous mapping as "[heap]" // are obscure. Just skip the name check. Device and inode numbers will // still be checked. return true; } // We don't track when a file gets deleted, so it's possible for the kernel // to have " (deleted)" when we don't. 
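  // For example (hypothetical path): if a mapped library was unlinked after
  // being mapped, the kernel may report "/lib/libfoo.so (deleted)" while our
  // record says "/lib/libfoo.so"; stripping the suffix makes the two names
  // compare equal below.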
return strip_deleted(km1.fsname()) == strip_deleted(km2.fsname()); } /** * Return true iff |left| and |right| are located adjacently in memory * with the same metadata, and map adjacent locations of the same * underlying (real) device. */ static bool is_adjacent_mapping(const KernelMapping& mleft, const KernelMapping& mright, HandleHeap handle_heap, int32_t flags_to_check = 0xFFFFFFFF) { if (mleft.end() != mright.start()) { return false; } if (((mleft.flags() ^ mright.flags()) & flags_to_check) || mleft.prot() != mright.prot()) { return false; } if (!normalized_file_names_equal(mleft, mright, handle_heap)) { return false; } if (mleft.device() != mright.device() || mleft.inode() != mright.inode()) { return false; } if (mleft.is_real_device() && mleft.file_offset_bytes() + off64_t(mleft.size()) != mright.file_offset_bytes()) { return false; } return true; } /** * If |*left_m| and |right_m| are adjacent (see * |is_adjacent_mapping()|), write a merged segment descriptor to * |*left_m| and return true. Otherwise return false. */ static bool try_merge_adjacent(KernelMapping* left_m, const KernelMapping& right_m) { if (is_adjacent_mapping(*left_m, right_m, TREAT_HEAP_AS_ANONYMOUS, KernelMapping::checkable_flags_mask)) { *left_m = KernelMapping(left_m->start(), right_m.end(), left_m->fsname(), left_m->device(), left_m->inode(), right_m.prot(), right_m.flags(), left_m->file_offset_bytes()); return true; } return false; } static dev_t normalized_device_number(const KernelMapping& m) { if (m.fsname().c_str()[0] != '/') { return m.device(); } // btrfs files can report the wrong device number in /proc//maps, so // restrict ourselves to checking whether the device number is != 0 if (m.device() != KernelMapping::NO_DEVICE) { return (dev_t)-1; } return m.device(); } static void assert_segments_match(Task* t, const KernelMapping& input_m, const KernelMapping& km) { KernelMapping m = input_m; string err; if (m.start() != km.start()) { err = "starts differ"; } else if (m.end() != km.end()) { err = "ends differ"; } else if (m.prot() != km.prot()) { err = "prots differ"; } else if ((m.flags() ^ km.flags()) & KernelMapping::checkable_flags_mask) { err = "flags differ"; } else if (!normalized_file_names_equal(m, km, TREAT_HEAP_AS_ANONYMOUS) && !(km.is_heap() && m.fsname() == "") && !(m.is_heap() && km.fsname() == "") && !km.is_vdso()) { // Due to emulated exec, the kernel may identify any of our anonymous maps // as [heap] (or not). // Kernels before 3.16 have a bug where any mapping at the original VDSO // address is marked [vdso] even if the VDSO was unmapped and replaced by // something else, so if the kernel reports [vdso] it may be spurious and // we skip this check. See kernel commit // a62c34bd2a8a3f159945becd57401e478818d51c. err = "filenames differ"; } else if (normalized_device_number(m) != normalized_device_number(km)) { err = "devices_differ"; } else if (m.inode() != km.inode()) { err = "inodes differ"; } if (err.size()) { LOG(error) << "cached mmap:"; t->vm()->dump(); LOG(error) << "/proc/" << t->tid << "/mmaps:"; AddressSpace::print_process_maps(t); ASSERT(t, false) << "\nCached mapping " << m << " should be " << km << "; " << err; } } void AddressSpace::ensure_replay_matches_single_recorded_mapping(Task* t, MemoryRange range) { // The only case where we eagerly coalesced during recording but not replay should // be where we mapped private memory beyond-end-of-file. // Don't do an actual coalescing check here; we rely on the caller to tell us // the range to coalesce. 
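  // Example scenario (illustrative, assuming a one-page file): during
  // recording, a 3-page MAP_PRIVATE mmap of that file is eagerly coalesced
  // into one mapping even though two pages lie beyond end-of-file. During
  // replay those pieces may exist as separate mappings, so the fixer below
  // rewrites any direct-mapped piece as anonymous memory and re-coalesces
  // the whole range into the single recorded mapping.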
ASSERT(t, range.start() == floor_page_size(range.start())); ASSERT(t, range.end() == ceil_page_size(range.end())); auto fixer = [this, t, range](const Mapping& mm, const MemoryRange&) { if (mm.map == range) { // Existing single mapping covers entire range; nothing to do. return; } Mapping mapping = move(mm); // These should be null during replay ASSERT(t, !mapping.mapped_file_stat); // These should not be in use for a beyond-end-of-file mapping ASSERT(t, !mapping.local_addr); // The mapping should be private ASSERT(t, mapping.map.flags() & MAP_PRIVATE); ASSERT(t, !mapping.emu_file); ASSERT(t, !mapping.monitored_shared_memory); // Flagged mappings shouldn't be coalescable ever ASSERT(t, !mapping.flags); if (!(mapping.map.flags() & MAP_ANONYMOUS)) { // Direct-mapped piece. Turn it into an anonymous mapping. vector buffer; buffer.resize(mapping.map.size()); t->read_bytes_helper(mapping.map.start(), buffer.size(), buffer.data()); { AutoRemoteSyscalls remote(t); remote.infallible_mmap_syscall(mapping.map.start(), buffer.size(), mapping.map.prot(), mapping.map.flags() | MAP_ANONYMOUS | MAP_FIXED, -1, 0); } t->write_bytes_helper(mapping.map.start(), buffer.size(), buffer.data()); // We replace the entire mapping even if part of it falls outside the desired range. // That's OK, this replacement preserves behaviour, it's simpler, even if a bit // less efficient in weird cases. mem.erase(mapping.map); KernelMapping anonymous_km(mapping.map.start(), mapping.map.end(), string(), KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, mapping.map.prot(), mapping.map.flags() | MAP_ANONYMOUS); Mapping new_mapping(anonymous_km, mapping.recorded_map); mem[new_mapping.map] = new_mapping; } }; for_each_in_range(range.start(), range.size(), fixer); coalesce_around(t, mem.find(range)); } KernelMapping AddressSpace::vdso() const { DEBUG_ASSERT(!vdso_start_addr.is_null()); return mapping_of(vdso_start_addr).map; } /** * Iterate over /proc/maps segments for a task and verify that the * task's cached mapping matches the kernel's (given a lenient fuzz * factor). */ void AddressSpace::verify(Task* t) const { ASSERT(t, task_set().end() != task_set().find(t)); if (thread_group_in_exec(t)) { return; } LOG(debug) << "Verifying address space for task " << t->tid; MemoryMap::const_iterator mem_it = mem.begin(); KernelMapIterator kernel_it(t); if (kernel_it.at_end()) { LOG(debug) << "Task " << t->tid << " exited unexpectedly, ignoring"; return; } while (!kernel_it.at_end() && mem_it != mem.end()) { KernelMapping km = kernel_it.current(); ++kernel_it; while (!kernel_it.at_end()) { KernelMapping next_km = kernel_it.current(); if (!try_merge_adjacent(&km, next_km)) { break; } ++kernel_it; } KernelMapping vm = mem_it->second.map; ++mem_it; while (mem_it != mem.end() && try_merge_adjacent(&vm, mem_it->second.map)) { ++mem_it; } assert_segments_match(t, vm, km); } ASSERT(t, kernel_it.at_end() && mem_it == mem.end()); } // Just a place that rr's AutoSyscall functionality can use as a syscall // instruction in rr's address space for use before we have exec'd. extern "C" { // Mark this as hidden, otherwise we might get the address of the GOT entry, // which could cause problems. 
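// Usage sketch (illustrative): because rr_syscall_addr is defined by an asm
// label inside fake_syscall() below, its address can be taken like that of
// any ordinary object:
//
//   remote_code_ptr ip((uintptr_t)&rr_syscall_addr);
//
// which is exactly how traced_syscall_ip_ is seeded before the first exec.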
extern char rr_syscall_addr __attribute__ ((visibility ("hidden"))); } static void __attribute__((noinline, used)) fake_syscall() { #ifdef __i386__ __asm__ __volatile__("rr_syscall_addr: int $0x80\n\t" "nop\n\t" "nop\n\t" "nop\n\t"); #elif defined(__x86_64__) __asm__ __volatile__("rr_syscall_addr: syscall\n\t" "nop\n\t" "nop\n\t" "nop\n\t"); #elif defined(__aarch64__) __asm__ __volatile__("rr_syscall_addr: svc #0\n\t" "nop\n\t" "nop\n\t" "nop\n\t"); #endif } AddressSpace::AddressSpace(Task* t, const string& exe, uint32_t exec_count) : exe(exe), leader_tid_(t->rec_tid), leader_serial(t->tuid().serial()), exec_count(exec_count), session_(&t->session()), monkeypatch_state(t->session().is_recording() ? new Monkeypatcher() : nullptr), syscallbuf_enabled_(false), do_breakpoint_fault_addr_(nullptr), stopping_breakpoint_table_(nullptr), stopping_breakpoint_table_entry_size_(0), first_run_event_(0) { // TODO: this is a workaround of // https://github.com/rr-debugger/rr/issues/1113 . if (session_->done_initial_exec()) { populate_address_space(t); DEBUG_ASSERT(!vdso_start_addr.is_null()); } else { // Setup traced_syscall_ip_ now because we need to do AutoRemoteSyscalls // (for open_mem_fd) before the first exec. We rely on the fact that we // haven't execed yet, so the address space layout is the same. traced_syscall_ip_ = remote_code_ptr((uintptr_t)&rr_syscall_addr); } } AddressSpace::AddressSpace(Session* session, const AddressSpace& o, pid_t leader_tid, uint32_t leader_serial, uint32_t exec_count) : exe(o.exe), leader_tid_(leader_tid), leader_serial(leader_serial), exec_count(exec_count), brk_start(o.brk_start), brk_end(o.brk_end), mem(o.mem), shm_sizes(o.shm_sizes), monitored_mem(o.monitored_mem), session_(session), vdso_start_addr(o.vdso_start_addr), monkeypatch_state(o.monkeypatch_state ? new Monkeypatcher(*o.monkeypatch_state) : nullptr), traced_syscall_ip_(o.traced_syscall_ip_), privileged_traced_syscall_ip_(o.privileged_traced_syscall_ip_), syscallbuf_enabled_(o.syscallbuf_enabled_), do_breakpoint_fault_addr_(o.do_breakpoint_fault_addr_), stopping_breakpoint_table_(o.stopping_breakpoint_table_), stopping_breakpoint_table_entry_size_(o.stopping_breakpoint_table_entry_size_), saved_auxv_(o.saved_auxv_), saved_interpreter_base_(o.saved_interpreter_base_), saved_ld_path_(o.saved_ld_path_), first_run_event_(0) { for (auto& m : mem) { // The original address space continues to have exclusive ownership of // all local mappings. m.second.local_addr = nullptr; } for (auto& it : o.breakpoints) { breakpoints.insert(make_pair(it.first, it.second)); } for (auto& it : o.watchpoints) { watchpoints.insert(make_pair(it.first, it.second)); } if (session != o.session()) { // Cloning into a new session means we're checkpointing. first_run_event_ = o.first_run_event_; } // cloned tasks will automatically get cloned debug registers and // cloned address-space memory, so we don't need to do any more work here. } bool AddressSpace::post_vm_clone(Task* t) { if (has_mapping(preload_thread_locals_start()) && (mapping_flags_of(preload_thread_locals_start()) & AddressSpace::Mapping::IS_THREAD_LOCALS) == 0) { // The tracee already has a mapping at this address that doesn't belong to // us. Don't touch it. return false; } // Otherwise, the preload_thread_locals mapping is non-existent or ours. // Recreate it. 
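  // Restated as a sketch (illustrative, not additional logic): the early
  // return above is equivalent to
  //
  //   if (has_mapping(addr) && !(mapping_flags_of(addr) & Mapping::IS_THREAD_LOCALS))
  //     return false;
  //
  // i.e. we only bail out when a foreign mapping already occupies the slot.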
AutoRemoteSyscalls remote(t); t->session().create_shared_mmap(remote, PRELOAD_THREAD_LOCALS_SIZE, preload_thread_locals_start(), "preload_thread_locals"); mapping_flags_of(preload_thread_locals_start()) |= AddressSpace::Mapping::IS_THREAD_LOCALS; return true; } static bool try_split_unaligned_range(MemoryRange& range, size_t bytes, vector& result) { if ((range.start().as_int() & (bytes - 1)) || range.size() < bytes) { return false; } result.push_back(MemoryRange(range.start(), bytes)); range = MemoryRange(range.start() + bytes, range.end()); return true; } static vector split_range(const MemoryRange& range) { vector result; MemoryRange r = range; while (r.size() > 0) { if ((sizeof(void*) < 8 || !try_split_unaligned_range(r, 8, result)) && !try_split_unaligned_range(r, 4, result) && !try_split_unaligned_range(r, 2, result)) { bool ret = try_split_unaligned_range(r, 1, result); DEBUG_ASSERT(ret); } } return result; } static void configure_watch_registers(vector& regs, const MemoryRange& range, WatchType type, vector* assigned_regs) { // Zero-sized WatchConfigs return no ranges here, so are ignored. auto split_ranges = split_range(range); if (type == WATCH_WRITE && range.size() > 1) { // We can suppress spurious write-watchpoint triggerings by checking // whether memory values have changed. So we can sometimes conserve // debug registers by upgrading an unaligned range to an aligned range // of a larger size. uintptr_t align; if (range.size() <= 2) { align = 2; } else if (range.size() <= 4 || sizeof(void*) <= 4) { align = 4; } else { align = 8; } remote_ptr aligned_start(range.start().as_int() & ~(align - 1)); remote_ptr aligned_end((range.end().as_int() + (align - 1)) & ~(align - 1)); auto split = split_range(MemoryRange(aligned_start, aligned_end)); // If the aligned range doesn't reduce register usage, use the original // split to avoid spurious triggerings if (split.size() < split_ranges.size()) { split_ranges = split; } } for (auto& r : split_ranges) { if (assigned_regs) { assigned_regs->push_back(regs.size()); } regs.push_back(WatchConfig(r.start(), r.size(), type)); } } vector AddressSpace::get_watch_configs( WillSetTaskState will_set_task_state) { vector result; for (auto& kv : watchpoints) { vector* assigned_regs = nullptr; if (will_set_task_state == SETTING_TASK_STATE) { kv.second.debug_regs_for_exec_read.clear(); assigned_regs = &kv.second.debug_regs_for_exec_read; } const MemoryRange& r = kv.first; int watching = kv.second.watched_bits(); if (EXEC_BIT & watching) { configure_watch_registers(result, r, WATCH_EXEC, assigned_regs); } if (READ_BIT & watching) { configure_watch_registers(result, r, WATCH_READWRITE, assigned_regs); } else if (WRITE_BIT & watching) { configure_watch_registers(result, r, WATCH_WRITE, nullptr); } } return result; } vector AddressSpace::get_watchpoints_internal( WatchpointFilter filter) { vector result; for (auto& kv : watchpoints) { if (filter == CHANGED_WATCHPOINTS) { if (!kv.second.changed) { continue; } kv.second.changed = false; } const MemoryRange& r = kv.first; int watching = kv.second.watched_bits(); if (EXEC_BIT & watching) { result.push_back(WatchConfig(r.start(), r.size(), WATCH_EXEC)); } if (READ_BIT & watching) { result.push_back(WatchConfig(r.start(), r.size(), WATCH_READWRITE)); } else if (WRITE_BIT & watching) { result.push_back(WatchConfig(r.start(), r.size(), WATCH_WRITE)); } } return result; } vector AddressSpace::consume_watchpoint_changes() { return get_watchpoints_internal(CHANGED_WATCHPOINTS); } vector AddressSpace::all_watchpoints() 
{ return get_watchpoints_internal(ALL_WATCHPOINTS); } bool AddressSpace::has_any_watchpoint_changes() { for (auto& kv : watchpoints) { if (kv.second.changed) { return true; } } return false; } bool AddressSpace::has_exec_watchpoint_fired(remote_code_ptr addr) { for (auto& kv : watchpoints) { if (kv.second.changed && kv.second.exec_count > 0 && kv.first.start() == addr.to_data_ptr()) { return true; } } return false; } bool AddressSpace::allocate_watchpoints() { Task::DebugRegs regs = get_watch_configs(SETTING_TASK_STATE); if (regs.size() <= 0x7f) { bool ok = true; for (auto t : task_set()) { if (!t->set_debug_regs(regs)) { ok = false; } } if (ok) { return true; } } regs.clear(); for (auto t2 : task_set()) { t2->set_debug_regs(regs); } for (auto kv : watchpoints) { kv.second.debug_regs_for_exec_read.clear(); } return false; } static inline void assert_coalesceable(Task* t, const AddressSpace::Mapping& lower, const AddressSpace::Mapping& higher) { ASSERT(t, lower.emu_file == higher.emu_file); ASSERT(t, lower.flags == higher.flags); ASSERT(t, (lower.local_addr == 0 && higher.local_addr == 0) || lower.local_addr + lower.map.size() == higher.local_addr); ASSERT(t, !lower.monitored_shared_memory && !higher.monitored_shared_memory); } static bool is_coalescable(const AddressSpace::Mapping& mleft, const AddressSpace::Mapping& mright) { if (!is_adjacent_mapping(mleft.map, mright.map, RESPECT_HEAP) || !is_adjacent_mapping(mleft.recorded_map, mright.recorded_map, RESPECT_HEAP)) { return false; } return mleft.flags == mright.flags; } void AddressSpace::coalesce_around(Task* t, MemoryMap::iterator it) { auto first_kv = it; while (mem.begin() != first_kv) { auto next = first_kv; --first_kv; if (!is_coalescable(first_kv->second, next->second)) { first_kv = next; break; } assert_coalesceable(t, first_kv->second, next->second); } auto last_kv = it; while (true) { auto prev = last_kv; ++last_kv; if (mem.end() == last_kv || !is_coalescable(prev->second, last_kv->second)) { last_kv = prev; break; } assert_coalesceable(t, prev->second, last_kv->second); } ASSERT(t, last_kv != mem.end()); if (first_kv == last_kv) { LOG(debug) << " no mappings to coalesce"; return; } Mapping new_m(first_kv->second.map.extend(last_kv->first.end()), first_kv->second.recorded_map.extend(last_kv->first.end()), first_kv->second.emu_file, clone_stat(first_kv->second.mapped_file_stat), first_kv->second.local_addr); new_m.flags = first_kv->second.flags; LOG(debug) << " coalescing " << new_m.map; // monitored-memory currently isn't coalescable so we don't need to // adjust monitored_mem mem.erase(first_kv, ++last_kv); auto ins = mem.insert(MemoryMap::value_type(new_m.map, new_m)); DEBUG_ASSERT(ins.second); // key didn't already exist } void AddressSpace::destroy_breakpoint(BreakpointMap::const_iterator it) { if (task_set().empty()) { return; } Task* t = first_running_task(); if (!t) { return; } auto ptr = it->first.to_data_ptr(); auto data = it->second.overwritten_data; if (bkpt_instruction_length(arch()) == 1) { LOG(debug) << "Writing back " << HEX(data[0]) << " at " << ptr; } else { LOG(debug) << "Writing back " << bkpt_instruction_length(arch()) << " bytes at " << ptr; } t->write_bytes_helper(ptr, bkpt_instruction_length(arch()), data, nullptr, Task::IS_BREAKPOINT_RELATED); breakpoints.erase(it); } void AddressSpace::maybe_update_breakpoints(Task* t, remote_ptr addr, size_t len) { for (auto& it : breakpoints) { remote_ptr bp_addr = it.first.to_data_ptr(); if (addr <= bp_addr && bp_addr < addr + len - 1) { // This breakpoint was 
overwritten. Note the new data and reset the // breakpoint. bool ok = true; t->read_bytes_helper(bp_addr, bkpt_instruction_length(arch()), &it.second.overwritten_data, &ok); ASSERT(t, ok); t->write_bytes_helper(bp_addr, bkpt_instruction_length(arch()), breakpoint_insn(arch())); } } } void AddressSpace::for_each_in_range( remote_ptr addr, ssize_t num_bytes, function f, int how) { remote_ptr region_start = floor_page_size(addr); remote_ptr last_unmapped_end = region_start; remote_ptr region_end = ceil_page_size(addr + num_bytes); while (last_unmapped_end < region_end) { // Invariant: |rem| is always exactly the region of // memory remaining to be examined for pages to be // unmapped. MemoryRange rem(last_unmapped_end, region_end); // The next page to iterate may not be contiguous with // the last one seen. auto it = mem.lower_bound(rem); if (mem.end() == it) { LOG(debug) << " not found, done."; return; } // Don't make a reference here. |f| is allowed to erase Mappings. MemoryRange range = it->first; if (rem.end() <= range.start()) { LOG(debug) << " mapping at " << range.start() << " out of range, done."; return; } if (ITERATE_CONTIGUOUS == how && !(range.start() < region_start || rem.start() == range.start())) { LOG(debug) << " discontiguous mapping at " << range.start() << ", done."; return; } f(it->second, rem); // Maintain the loop invariant. last_unmapped_end = range.end(); } } void AddressSpace::map_and_coalesce( Task* t, const KernelMapping& m, const KernelMapping& recorded_map, EmuFile::shr_ptr emu_file, unique_ptr mapped_file_stat, void* local_addr, shared_ptr&& monitored) { LOG(debug) << " mapping " << m; if (monitored) { monitored_mem.insert(m.start()); } auto ins = mem.insert(MemoryMap::value_type( m, Mapping(m, recorded_map, emu_file, move(mapped_file_stat), local_addr, move(monitored)))); coalesce_around(t, ins.first); update_watchpoint_values(m.start(), m.end()); } static bool could_be_stack(const KernelMapping& km) { // On 4.1.6-200.fc22.x86_64 we observe that during exec of the rr_exec_stub // during replay, when the process switches from 32-bit to 64-bit, the 64-bit // registers seem truncated to 32 bits during the initial PTRACE_GETREGS so // our sp looks wrong and /proc//maps doesn't identify the region as // stack. // On stub execs there should only be one read-writable memory area anyway. return km.prot() == (PROT_READ | PROT_WRITE) && km.fsname() == "" && km.device() == KernelMapping::NO_DEVICE && km.inode() == KernelMapping::NO_INODE; } static dev_t check_device(const KernelMapping& km) { if (km.fsname().c_str()[0] != '/') { return km.device(); } // btrfs files can return the wrong device number in /proc//maps struct stat st; int ret = stat(km.fsname().c_str(), &st); if (ret < 0) { return km.device(); } return st.st_dev; } void AddressSpace::populate_address_space(Task* t) { bool found_proper_stack = false; for (KernelMapIterator it(t); !it.at_end(); ++it) { auto& km = it.current(); if (km.is_stack()) { found_proper_stack = true; } } // If we're being recorded by rr, we'll see the outer rr's rr_page and // preload_thread_locals. In post_exec() we'll remap those with our // own mappings. That's OK because a) the rr_page contents are the same // anyway and immutable and b) the preload_thread_locals page is only // used by the preload library, and the preload library only knows about // the inner rr. I.e. as far as the outer rr is concerned, the tracee is // not doing syscall buffering. 
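  // Illustrative example of the guard-page handling below: given a growable
  // stack reported as [0x7f0000001000, 0x7f0000100000) with nothing mapped at
  // 0x7f0000000000, we record the mapping as starting one page earlier,
  // because a MAP_GROWSDOWN segment implicitly owns that guard page.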
int found_stacks = 0; for (KernelMapIterator it(t); !it.at_end(); ++it) { auto& km = it.current(); int flags = km.flags(); remote_ptr start = km.start(); bool is_stack = found_proper_stack ? km.is_stack() : could_be_stack(km); if (is_stack) { ++found_stacks; flags |= MAP_GROWSDOWN; if (uses_invisible_guard_page()) { // MAP_GROWSDOWN segments really occupy one additional page before // the start address shown by /proc//maps --- unless that page // is already occupied by another mapping. if (!has_mapping(start - page_size())) { start -= page_size(); } } } map(t, start, km.end() - start, km.prot(), flags, km.file_offset_bytes(), km.fsname(), check_device(km), km.inode(), nullptr); } ASSERT(t, found_stacks == 1); } static int random_addr_bits(SupportedArch arch) { switch (arch) { default: DEBUG_ASSERT(0 && "Unknown architecture"); RR_FALLTHROUGH; case x86: return 32; // Current x86-64 systems have only 48 bits of virtual address space, // and only the bottom half is usable by user space case x86_64: return 47; // Aarch64 has 48 bit address space, with user and kernel each getting // their own 48 bits worth of address space at opposite end of the full // 64-bit address space. case aarch64: return 48; } } static MemoryRange adjust_range_for_stack_growth(const KernelMapping& km) { remote_ptr start = km.start(); if (km.flags() & MAP_GROWSDOWN) { start = min(start, km.end() - AddressSpace::chaos_mode_min_stack_size()); } return MemoryRange(start, km.end()); } static bool overlaps_asan_usage(const MemoryRange& r) { MemoryRange asan_shadow(remote_ptr((uintptr_t)0x00007fff7000LL), remote_ptr((uintptr_t)0x10007fff8000LL)); MemoryRange asan_allocator_reserved(remote_ptr((uintptr_t)0x600000000000LL), remote_ptr((uintptr_t)0x640000000000LL)); return r.intersects(asan_shadow) || r.intersects(asan_allocator_reserved); } // Choose a 4TB range to exclude from random mappings. This makes room for // advanced trace analysis tools that require a large address range in tracees // that is never mapped. static MemoryRange choose_global_exclusion_range() { if (sizeof(uintptr_t) < 8) { return MemoryRange(nullptr, 0); } const uint64_t range_size = uint64_t(4)*1024*1024*1024*1024; while (true) { int bits = random_addr_bits(x86_64); uint64_t r = ((uint64_t)(uint32_t)random() << 32) | (uint32_t)random(); uint64_t r_addr = r & ((uint64_t(1) << bits) - 1); r_addr = min(r_addr, (uint64_t(1) << bits) - range_size); remote_ptr addr = floor_page_size(remote_ptr(r_addr)); MemoryRange ret(addr, (uintptr_t)range_size); if (!overlaps_asan_usage(ret)) { return ret; } } } MemoryRange AddressSpace::get_global_exclusion_range() { static MemoryRange global_exclusion_range = choose_global_exclusion_range(); return global_exclusion_range; } remote_ptr AddressSpace::chaos_mode_find_free_memory(RecordTask* t, size_t len, remote_ptr hint) { MemoryRange global_exclusion_range = get_global_exclusion_range(); // NB: Above RR_PAGE_ADDR is probably not free anyways, but if it somehow is // don't hand it out again. static MemoryRange rrpage_so_range = MemoryRange(RR_PAGE_ADDR - page_size(), RR_PAGE_ADDR + page_size()); // Ignore the hint half the time. if (hint && (random() & 1)) { hint = nullptr; } int bits = random_addr_bits(t->arch()); uint64_t addr_space_limit = uint64_t(1) << bits; while (true) { remote_ptr addr; if (hint) { addr = hint; // Don't try using the hint again. hint = nullptr; } else { // Half the time, try to allocate at a completely random address. 
The other // half of the time, we'll try to allocate immediately before or after a // randomly chosen existing mapping. if (random() % 2) { // Some of these addresses will not be mappable. That's fine, the // kernel will fall back to a valid address if the hint is not valid. uint64_t r = ((uint64_t)(uint32_t)random() << 32) | (uint32_t)random(); addr = floor_page_size(remote_ptr(r & (addr_space_limit - 1))); } else { ASSERT(t, !mem.empty()); int map_index = random() % mem.size(); int map_count = 0; for (const auto& m : maps()) { if (map_count == map_index) { addr = m.map.start(); break; } ++map_count; } } } // If there's a collision (which there always will be in the second case // above), either move the mapping forwards or backwards in memory until it // fits. Choose the direction randomly. int direction = (random() % 2) ? 1 : -1; while (true) { Maps m = maps_starting_at(addr); if (m.begin() == m.end()) { break; } MemoryRange range = adjust_range_for_stack_growth(m.begin()->map); if (range.start() >= addr + len) { // No overlap with an existing mapping; we're good! break; } if (direction == -1) { addr = range.start() - len; } else { addr = range.end(); } } if (uint64_t(addr.as_int()) >= addr_space_limit || uint64_t(addr.as_int()) + ceil_page_size(len) >= addr_space_limit) { // We fell off one end of the address space. Try everything again. continue; } MemoryRange r(addr, ceil_page_size(len)); if (r.intersects(rrpage_so_range)) { continue; } if (r.intersects(global_exclusion_range)) { continue; } if (t->session().asan_active() && sizeof(size_t) == 8) { LOG(debug) << "Checking ASAN shadow"; if (overlaps_asan_usage(r)) { continue; } } return addr; } } remote_ptr AddressSpace::find_free_memory(size_t required_space, remote_ptr after) { auto maps = maps_starting_at(after); auto current = maps.begin(); while (current != maps.end()) { auto next = current; ++next; if (next == maps.end()) { if (current->map.end() + required_space >= current->map.end()) { break; } } else { if (current->map.end() + required_space <= next->map.start()) { break; } } current = next; } return current->map.end(); } void AddressSpace::add_stap_semaphore_range(Task* task, MemoryRange range) { ASSERT(task, range.start() != range.end()) << "Unexpected zero-length SystemTap semaphore range: " << range; ASSERT(task, (range.size() & 1) == 0) << "Invalid SystemTap semaphore range at " << range << ": size is not a multiple of the size of a STap semaphore!"; auto ptr = range.start().cast(), end = range.end().cast(); for (; ptr < end; ++ptr) { stap_semaphores.insert(ptr); } } void AddressSpace::remove_stap_semaphore_range(Task* task, MemoryRange range) { ASSERT(task, range.start() != range.end()) << "Unexpected zero-length SystemTap semaphore range: " << range; ASSERT(task, (range.size() & 1) == 0) << "Invalid SystemTap semaphore range at " << range << ": size is not a multiple of the size of a STap semaphore!"; auto ptr = range.start().cast(), end = range.end().cast(); for (; ptr < end; ++ptr) { stap_semaphores.erase(ptr); } } bool AddressSpace::is_stap_semaphore(remote_ptr addr) { return stap_semaphores.find(addr) != stap_semaphores.end(); } void AddressSpace::fd_tables_changed() { if (!session()->is_recording()) { // All modifications are recorded during record return; } if (!syscallbuf_enabled()) { return; } DEBUG_ASSERT(task_set().size() != 0); uint8_t fdt_uniform = true; RecordTask* rt = static_cast(first_running_task()); if (!rt) { return; } auto fdt = rt->fd_table(); for (auto* t : task_set()) { if (t->fd_table() != fdt) { 
fdt_uniform = false; } } auto addr = REMOTE_PTR_FIELD(rt->preload_globals, fdt_uniform); bool ok = true; if (rt->read_mem(addr, &ok) != fdt_uniform) { if (!ok) { return; } rt->write_mem(addr, fdt_uniform); rt->record_local(addr, sizeof(fdt_uniform), &fdt_uniform); } } } // namespace rr rr-5.5.0/src/AddressSpace.h000066400000000000000000001202411412202446200154420ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_ADDRESS_SPACE_H_ #define RR_ADDRESS_SPACE_H_ #include #include #include #include #include #include #include #include #include #include #include "preload/preload_interface.h" #include "EmuFs.h" #include "HasTaskSet.h" #include "MemoryRange.h" #include "Monkeypatcher.h" #include "PropertyTable.h" #include "TaskishUid.h" #include "TraceStream.h" #include "core.h" #include "kernel_abi.h" #include "remote_code_ptr.h" #include "util.h" namespace rr { class AutoRemoteSyscalls; class MonitoredSharedMemory; class RecordTask; class Session; class Task; /** * Records information that the kernel knows about a mapping. This includes * everything returned through /proc//maps but also information that * we know from observing mmap and mprotect calls. */ class KernelMapping : public MemoryRange { public: /** * These are the flags we track internally to distinguish * between adjacent segments. For example, the kernel * considers a NORESERVE anonymous mapping that's adjacent to * a non-NORESERVE mapping distinct, even if all other * metadata are the same. See |is_adjacent_mapping()|. */ static const int map_flags_mask = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE | MAP_SHARED | MAP_STACK | MAP_GROWSDOWN; static const int checkable_flags_mask = MAP_PRIVATE | MAP_SHARED; static const dev_t NO_DEVICE = 0; static const ino_t NO_INODE = 0; KernelMapping() : device_(0), inode_(0), prot_(0), flags_(0), offset(0) {} KernelMapping(remote_ptr start, remote_ptr end, const std::string& fsname, dev_t device, ino_t inode, int prot, int flags, off64_t offset = 0) : MemoryRange(start, end), fsname_(fsname), device_(device), inode_(inode), prot_(prot), flags_(flags & map_flags_mask), offset(offset) { assert_valid(); } KernelMapping(const KernelMapping& o) : MemoryRange(o), fsname_(o.fsname_), device_(o.device_), inode_(o.inode_), prot_(o.prot_), flags_(o.flags_), offset(o.offset) { assert_valid(); } KernelMapping operator=(const KernelMapping& o) { this->~KernelMapping(); new (this) KernelMapping(o); return *this; } void assert_valid() const { DEBUG_ASSERT(end() >= start()); DEBUG_ASSERT(size() % page_size() == 0); DEBUG_ASSERT(!(flags_ & ~map_flags_mask)); DEBUG_ASSERT(offset % page_size() == 0); } KernelMapping extend(remote_ptr end) const { DEBUG_ASSERT(end >= MemoryRange::end()); return KernelMapping(start(), end, fsname_, device_, inode_, prot_, flags_, offset); } KernelMapping set_range(remote_ptr start, remote_ptr end) const { return KernelMapping(start, end, fsname_, device_, inode_, prot_, flags_, offset); } KernelMapping subrange(remote_ptr start, remote_ptr end) const { DEBUG_ASSERT(start >= MemoryRange::start() && end <= MemoryRange::end()); return KernelMapping( start, end, fsname_, device_, inode_, prot_, flags_, offset + (is_real_device() ? start - MemoryRange::start() : 0)); } KernelMapping set_prot(int prot) const { return KernelMapping(start(), end(), fsname_, device_, inode_, prot, flags_, offset); } /** * Dump a representation of |this| to a string in a format * similar to the former part of /proc/[tid]/maps.
*/ std::string str() const { char str[200]; sprintf(str, "%8p-%8p %c%c%c%c %08" PRIx64 " %02x:%02x %-10ld ", (void*)start().as_int(), (void*)end().as_int(), (PROT_READ & prot_) ? 'r' : '-', (PROT_WRITE & prot_) ? 'w' : '-', (PROT_EXEC & prot_) ? 'x' : '-', (MAP_SHARED & flags_) ? 's' : 'p', offset, (int)MAJOR(device()), (int)MINOR(device()), (long)inode()); return str + fsname(); } const std::string& fsname() const { return fsname_; } std::string fsname_strip_deleted() const; dev_t device() const { return device_; } ino_t inode() const { return inode_; } int prot() const { return prot_; } int flags() const { return flags_; } uint64_t file_offset_bytes() const { return offset; } /** * Return true if this file is/was backed by an external * device, as opposed to a transient RAM mapping. */ bool is_real_device() const { return device() > NO_DEVICE; } bool is_vdso() const { return fsname() == "[vdso]"; } bool is_heap() const { return fsname() == "[heap]"; } bool is_stack() const { return fsname().find("[stack") == 0; } bool is_vvar() const { return fsname() == "[vvar]"; } bool is_vsyscall() const { return fsname() == "[vsyscall]"; } struct stat fake_stat() const { struct stat fake_stat; memset(&fake_stat, 0, sizeof(fake_stat)); fake_stat.st_dev = device(); fake_stat.st_ino = inode(); fake_stat.st_size = size(); return fake_stat; } private: // The kernel's name for the mapping, as per /proc//maps. This must // be exactly correct. const std::string fsname_; // Note that btrfs has weird behavior and /proc/.../maps can show a different // device number to the device from stat()ing the file that was mapped. // https://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg57667.html // We store here the device number obtained from fstat()ing the file. // This also seems to be consistent with what we read from populate_address_space // for the initial post-exec mappings. It is NOT consistent with what we get // from reading /proc/.../maps for non-initial mappings. dev_t device_; ino_t inode_; const int prot_; const int flags_; const uint64_t offset; }; inline std::ostream& operator<<(std::ostream& o, const KernelMapping& m) { o << m.str(); return o; } /** * Compare |a| and |b| so that "subset" lookups will succeed. What * does that mean? If |a| and |b| overlap (intersect), then this * comparator considers them equivalent. That means that if |a| * represents one byte within a mapping |b|, then |a| and |b| will be * considered equivalent. * * If |a| and |b| don't overlap, return true if |a|'s start address is * less than |b|'s. */ struct MappingComparator { bool operator()(const MemoryRange& a, const MemoryRange& b) const { return !a.intersects(b) && a.start() < b.start(); } }; enum BreakpointType { BKPT_NONE = 0, // Trap for internal rr purposes, f.e. replaying async // signals. BKPT_INTERNAL, // Trap on behalf of a debugger user. BKPT_USER, }; enum WatchType { // NB: these random-looking enumeration values are chosen to // match the numbers programmed into x86 debug registers. WATCH_EXEC = 0x00, WATCH_WRITE = 0x01, WATCH_READWRITE = 0x03 }; enum ArmWatchType { ARM_WATCH_EXEC = 0x0, ARM_WATCH_READ = 0x1, ARM_WATCH_WRITE = 0x2, ARM_WATCH_READWRITE = ARM_WATCH_READ | ARM_WATCH_WRITE }; enum ArmPrivLevel { ARM_PRIV_EL0 = 0x2 }; enum DebugStatus { DS_WATCHPOINT_ANY = 0xf, DS_SINGLESTEP = 1 << 14, }; /** * A distinct watchpoint, corresponding to the information needed to * program a single x86 debug register.
*/ struct WatchConfig { WatchConfig(remote_ptr addr, size_t num_bytes, WatchType type) : addr(addr), num_bytes(num_bytes), type(type) {} remote_ptr addr; size_t num_bytes; WatchType type; }; /** * Models the address space for a set of tasks. This includes the set * of mapped pages, and the resources those mappings refer to. */ class AddressSpace : public HasTaskSet { friend class Session; friend struct VerifyAddressSpace; public: class Mapping { public: Mapping(const KernelMapping& map, const KernelMapping& recorded_map, EmuFile::shr_ptr emu_file = nullptr, std::unique_ptr mapped_file_stat = nullptr, void* local_addr = nullptr, std::shared_ptr&& monitored = nullptr); ~Mapping(); Mapping(const Mapping&); Mapping() = default; const Mapping& operator=(const Mapping& other) { this->~Mapping(); new (this) Mapping(other); return *this; } const KernelMapping map; // The corresponding KernelMapping in the recording. During recording, // equal to 'map'. const KernelMapping recorded_map; const EmuFile::shr_ptr emu_file; std::unique_ptr mapped_file_stat; // If this mapping has been mapped into the local address space, // this is the address of the first byte of the equivalent local mapping. // This mapping is always mapped as PROT_READ|PROT_WRITE regardless of the // mapping's permissions in the tracee. Also note that it is the caller's // responsibility to keep this alive at least as long as this mapping is // present in the address space. uint8_t* local_addr; const std::shared_ptr monitored_shared_memory; // Flags indicate mappings that require special handling. Adjacent mappings // may only be merged if their `flags` values agree. enum : uint32_t { FLAG_NONE = 0x0, // This mapping represents a syscallbuf. It needs to be handled specially // during checksumming since its contents are not fully restored by the // replay. IS_SYSCALLBUF = 0x1, // This mapping is used as our thread-local variable area for this // address space IS_THREAD_LOCALS = 0x2, // This mapping is used for syscallbuf patch stubs IS_PATCH_STUBS = 0x4, // This mapping is the rr page IS_RR_PAGE = 0x8 }; uint32_t flags; }; typedef std::map MemoryMap; typedef std::shared_ptr shr_ptr; ~AddressSpace(); /** * Call this after a new task has been cloned within this * address space. */ void after_clone(); /** * Call this after a successful execve syscall has completed. At this point * it is safe to perform remote syscalls. */ void post_exec_syscall(Task* t); /** * Change the program data break of this address space to * |addr|. Only called during recording! */ void brk(Task* t, remote_ptr addr, int prot); /** * This can only be called during recording. */ remote_ptr current_brk() const { DEBUG_ASSERT(!brk_end.is_null()); return brk_end; } /** * Dump a representation of |this| to stderr in a format * similar to /proc/[tid]/maps. * * XXX/ostream-ify me. */ void dump() const; /** * Return tid of the first task for this address space. */ pid_t leader_tid() const { return leader_tid_; } /** * Return AddressSpaceUid for this address space. */ AddressSpaceUid uid() const { return AddressSpaceUid(leader_tid_, leader_serial, exec_count); } Session* session() const { return session_; } SupportedArch arch() const; /** * Return the path this address space was exec()'d with. */ const std::string& exe_image() const { return exe; } /** * Assuming the last retired instruction has raised a SIGTRAP * and might be a breakpoint trap instruction, return the type * of breakpoint set at |ip() - sizeof(breakpoint_insn)|, if * one exists. Otherwise return BKPT_NONE.
*/ BreakpointType get_breakpoint_type_for_retired_insn(remote_code_ptr ip); /** * Return the type of breakpoint that's been registered for * |addr|. */ BreakpointType get_breakpoint_type_at_addr(remote_code_ptr addr); /** * Returns true when the breakpoint at |addr| is in private * non-writeable memory. When this returns true, the breakpoint can't be * overwritten by the tracee without an intervening mprotect or mmap * syscall. */ bool is_breakpoint_in_private_read_only_memory(remote_code_ptr addr); /** * Return true if there's a breakpoint instruction at |ip|. This might * be an explicit instruction, even if there's no breakpoint set via our API. */ bool is_breakpoint_instruction(Task* t, remote_code_ptr ip); /** * The buffer |dest| of length |length| represents the contents of tracee * memory at |addr|. Replace the bytes in |dest| that have been overwritten * by breakpoints with the original data that was replaced by the breakpoints. */ void replace_breakpoints_with_original_values(uint8_t* dest, size_t length, remote_ptr addr); /** * Map |num_bytes| into this address space at |addr|, with * |prot| protection and |flags|. The pages are (possibly * initially) backed starting at |offset| of |res|. |fsname|, |device| and * |inode| are values that will appear in the /proc//maps entry. * |mapped_file_stat| is a complete copy of the 'stat' data for the mapped * file, or null if this isn't a file mapping or isn't during recording. * |*recorded_map| is the mapping during recording, or null if the mapping * during recording is known to be the same as the new map (e.g. because * we are recording!). * |local_addr| is the local address of the memory shared with the tracee, * or null if it's not shared with the tracee. AddressSpace takes ownership * of the shared memory and is responsible for unmapping it. */ KernelMapping map( Task* t, remote_ptr addr, size_t num_bytes, int prot, int flags, off64_t offset_bytes, const std::string& fsname, dev_t device = KernelMapping::NO_DEVICE, ino_t inode = KernelMapping::NO_INODE, std::unique_ptr mapped_file_stat = nullptr, const KernelMapping* recorded_map = nullptr, EmuFile::shr_ptr emu_file = nullptr, void* local_addr = nullptr, std::shared_ptr&& monitored = nullptr); /** * Return the mapping and mapped resource for the byte at address 'addr'. * There must be such a mapping. */ const Mapping& mapping_of(remote_ptr addr) const; /** * Detach local mapping and return it. */ void* detach_local_mapping(remote_ptr addr); /** * Return a reference to the flags of the mapping at this address, allowing * manipulation. There must exist a mapping at `addr`. */ uint32_t& mapping_flags_of(remote_ptr addr); /** * Return true if there is some mapping for the byte at 'addr'. */ bool has_mapping(remote_ptr addr) const; /** * If the given memory region is mapped into the local address space, obtain * the local address from which the `size` bytes at `addr` can be accessed. */ uint8_t* local_mapping(remote_ptr addr, size_t size); /** * Return true if the rr page is mapped at its expected address. */ bool has_rr_page() const; /** * Object that generates robust iterators through the memory map. The * memory map can be updated without invalidating iterators, as long as * Mappings are not added or removed. 
*/ class Maps { public: Maps(const AddressSpace& outer, remote_ptr start) : outer(outer), start(start) {} class iterator { public: iterator(const iterator& it) = default; const iterator& operator++() { ptr = to_it()->second.map.end(); return *this; } bool operator==(const iterator& other) const { return to_it() == other.to_it(); } bool operator!=(const iterator& other) const { return !(*this == other); } const Mapping* operator->() const { return &to_it()->second; } const Mapping& operator*() const { return to_it()->second; } iterator& operator=(const iterator& other) { this->~iterator(); new (this) iterator(other); return *this; } private: friend class Maps; iterator(const MemoryMap& outer, remote_ptr ptr) : outer(outer), ptr(ptr), at_end(false) {} iterator(const MemoryMap& outer) : outer(outer), at_end(true) {} MemoryMap::const_iterator to_it() const { return at_end ? outer.end() : outer.lower_bound(MemoryRange(ptr, ptr)); } const MemoryMap& outer; remote_ptr ptr; bool at_end; }; iterator begin() const { return iterator(outer.mem, start); } iterator end() const { return iterator(outer.mem); } private: const AddressSpace& outer; remote_ptr start; }; friend class Maps; Maps maps() const { return Maps(*this, remote_ptr()); } Maps maps_starting_at(remote_ptr start) { return Maps(*this, start); } Maps maps_containing_or_after(remote_ptr start) { if (has_mapping(start)) { return Maps(*this, mapping_of(start).map.start()); } else { return Maps(*this, start); } } const std::set>& monitored_addrs() const { return monitored_mem; } /** * Change the protection bits of [addr, addr + num_bytes) to * |prot|. */ void protect(Task* t, remote_ptr addr, size_t num_bytes, int prot); /** * Fix up mprotect register parameters to take account of PROT_GROWSDOWN. */ void fixup_mprotect_growsdown_parameters(Task* t); /** * Move the mapping [old_addr, old_addr + old_num_bytes) to * [new_addr, new_addr + new_num_bytes), preserving metadata. */ void remap(Task* t, remote_ptr old_addr, size_t old_num_bytes, remote_ptr new_addr, size_t new_num_bytes); /** * Notify that data was written to this address space by rr or * by the kernel. * |flags| can contain values from Task::WriteFlags. */ void notify_written(remote_ptr addr, size_t num_bytes, uint32_t flags); /** Ensure a breakpoint of |type| is set at |addr|. */ bool add_breakpoint(remote_code_ptr addr, BreakpointType type); /** * Remove a |type| reference to the breakpoint at |addr|. If * the removed reference was the last, the breakpoint is * destroyed. */ void remove_breakpoint(remote_code_ptr addr, BreakpointType type); /** * Destroy all breakpoints in this VM, regardless of their * reference counts. */ void remove_all_breakpoints(); /** * Temporarily remove the breakpoint at |addr|. */ void suspend_breakpoint_at(remote_code_ptr addr); /** * Restore any temporarily removed breakpoint at |addr|. */ void restore_breakpoint_at(remote_code_ptr addr); /** * Manage watchpoints. Analogous to breakpoint-managing * methods above, except that watchpoints can be set for an * address range. */ bool add_watchpoint(remote_ptr addr, size_t num_bytes, WatchType type); void remove_watchpoint(remote_ptr addr, size_t num_bytes, WatchType type); void remove_all_watchpoints(); std::vector all_watchpoints(); /** * Save all watchpoint state onto a stack. */ void save_watchpoints(); /** * Pop all watchpoint state from the saved-state stack. */ bool restore_watchpoints(); /** * Notify that at least one watchpoint was hit --- recheck them all.
* Returns true if any watchpoint actually triggered. Note that * debug_status can indicate a hit watchpoint that doesn't actually * trigger, because the value of a write-watchpoint did not change. * Likewise, debug_status can indicate a watchpoint wasn't hit that * actually was (because in some configurations, e.g. VMWare * hypervisor with 32-bit x86 guest, debug_status watchpoint bits * are known to not be set on singlestep). */ bool notify_watchpoint_fired(uintptr_t debug_status, remote_ptr hit_addr, remote_code_ptr address_of_singlestep_start); /** * Return true if any watchpoint has fired. Will keep returning true until * consume_watchpoint_changes() is called. */ bool has_any_watchpoint_changes(); /** * Return true if an EXEC watchpoint has fired at addr since the last * consume_watchpoint_changes. */ bool has_exec_watchpoint_fired(remote_code_ptr addr); /** * Return all changed watchpoints and clear their changed flags. */ std::vector consume_watchpoint_changes(); void set_shm_size(remote_ptr addr, size_t bytes) { shm_sizes[addr] = bytes; } /** * Dies if no shm size is registered for the address. */ size_t get_shm_size(remote_ptr addr) { return shm_sizes[addr]; } void remove_shm_size(remote_ptr addr) { shm_sizes.erase(addr); } /** * Make [addr, addr + num_bytes) inaccessible within this * address space. */ void unmap(Task* t, remote_ptr addr, ssize_t num_bytes); /** * Notification of madvise call. */ void advise(Task* t, remote_ptr addr, ssize_t num_bytes, int advice); /** Return the vdso mapping of this. */ KernelMapping vdso() const; bool has_vdso() const { return has_mapping(vdso_start_addr); } /** * Verify that this cached address space matches what the * kernel thinks it should be. */ void verify(Task* t) const; bool has_breakpoints() { return !breakpoints.empty(); } bool has_watchpoints() { return !watchpoints.empty(); } ScopedFd& mem_fd() { return child_mem_fd; } void set_mem_fd(ScopedFd&& fd) { child_mem_fd = std::move(fd); } Monkeypatcher& monkeypatcher() { DEBUG_ASSERT(monkeypatch_state); return *monkeypatch_state; } void at_preload_init(Task* t); /* The address of the syscall instruction from which traced syscalls made by * the syscallbuf will originate. */ remote_code_ptr traced_syscall_ip() const { return traced_syscall_ip_; } /* The address of the syscall instruction from which privileged traced * syscalls made by the syscallbuf will originate. */ remote_code_ptr privileged_traced_syscall_ip() const { return privileged_traced_syscall_ip_; } bool syscallbuf_enabled() const { return syscallbuf_enabled_; } /** * We'll map a page of memory here into every exec'ed process for our own * use. */ static remote_ptr rr_page_start() { return RR_PAGE_ADDR; } /** * This might not be the length of an actual system page, but we allocate * at least this much space. */ static uint32_t rr_page_size() { return 4096; } static remote_ptr rr_page_end() { return rr_page_start() + rr_page_size(); } static remote_ptr preload_thread_locals_start() { return rr_page_start() + rr_page_size(); } static uint32_t preload_thread_locals_size() { return PRELOAD_THREAD_LOCALS_SIZE; } enum Traced { TRACED, UNTRACED }; enum Privileged { PRIVILEGED, UNPRIVILEGED }; /** * Depending on which entry point this is and whether or not we're recording * or replaying, the instruction in the rr page may be something other than * a syscall.
This enum encodes the combination of instructions for each entry * point: * * Enabled | Record | Replay * ---------------------|---------|------- * RECORDING_ONLY | syscall | nop * REPLAY_ONLY | nop | syscall * RECORDING_AND_REPLAY | syscall | syscall * REPLAY_ASSIST | syscall | int3 * * The REPLAY_ASSIST is used for a syscall that is untraced during record (so * we can save the context switch penalty), but requires us to apply side * effects during replay. The int3 lets the replayer stop and apply these * at the appropriate point. */ enum Enabled { RECORDING_ONLY, REPLAY_ONLY, RECORDING_AND_REPLAY, REPLAY_ASSIST }; static remote_code_ptr rr_page_syscall_exit_point(Traced traced, Privileged privileged, Enabled enabled, SupportedArch arch); static remote_code_ptr rr_page_syscall_entry_point(Traced traced, Privileged privileged, Enabled enabled, SupportedArch arch); struct SyscallType { Traced traced; Privileged privileged; Enabled enabled; }; static std::vector rr_page_syscalls(); static const SyscallType* rr_page_syscall_from_exit_point( SupportedArch arch, remote_code_ptr ip); static const SyscallType* rr_page_syscall_from_entry_point( SupportedArch arch, remote_code_ptr ip); /** * Return a pointer to 8 bytes of 0xFF. * (Currently only set during record / not part of the ABI) */ static remote_ptr rr_page_record_ff_bytes() { return RR_PAGE_FF_BYTES; } /** * Locate a syscall instruction in t's VDSO. * This gives us a way to execute remote syscalls without having to write * a syscall instruction into executable tracee memory (which might not be * possible with some kernels, e.g. PaX). */ remote_code_ptr find_syscall_instruction(Task* t); /** * Task |t| just forked from this address space. Apply dont_fork and * wipe_on_fork settings. */ void did_fork_into(Task* t); void set_first_run_event(FrameTime event) { first_run_event_ = event; } FrameTime first_run_event() { return first_run_event_; } const std::vector& saved_auxv() { return saved_auxv_; } void save_auxv(Task* t); remote_ptr saved_interpreter_base() { return saved_interpreter_base_; } void save_interpreter_base(Task* t, std::vector auxv); std::string saved_ld_path() { return saved_ld_path_;} void save_ld_path(Task* t, remote_ptr); void read_mm_map(Task* t, NativeArch::prctl_mm_map* map); /** * Reads the /proc//maps entry for a specific address. Does no caching. * If performed on a file in a btrfs file system, this may return the * wrong device number! If you stick to anonymous or special file * mappings, this should be OK. */ KernelMapping read_kernel_mapping(Task* t, remote_ptr addr); /** * Same as read_kernel_mapping, but reads rr's own memory map. */ static KernelMapping read_local_kernel_mapping(uint8_t* addr); static uint32_t chaos_mode_min_stack_size() { return 8 * 1024 * 1024; } remote_ptr chaos_mode_find_free_memory(RecordTask* t, size_t len, remote_ptr hint); remote_ptr find_free_memory( size_t len, remote_ptr after = remote_ptr()); PropertyTable& properties() { return properties_; } /** * The return value indicates whether we (re)created the preload_thread_locals * area. */ bool post_vm_clone(Task* t); /** * TaskUid for the task whose locals are stored in the preload_thread_locals * area. */ const TaskUid& thread_locals_tuid() { return thread_locals_tuid_; } void set_thread_locals_tuid(const TaskUid& tuid) { thread_locals_tuid_ = tuid; } /** * Call this when the memory at [addr,addr+len) was externally overwritten. 
* This will attempt to update any breakpoints that may be set within the * range (resetting them and storing the new value). */ void maybe_update_breakpoints(Task* t, remote_ptr addr, size_t len); /** * Call this to ensure that the mappings in `range` during replay are * collapsed into a single mapping with the same length as the * corresponding recorded mapping. The caller guarantees that all the * mappings in the range can be coalesced (because they corresponded to a single * mapping during recording). * The end of the range might be in the middle of a mapping. * The start of the range might also be in the middle of a mapping. */ void ensure_replay_matches_single_recorded_mapping(Task* t, MemoryRange range); /** * Print process maps. */ static void print_process_maps(Task* t); void add_stap_semaphore_range(Task* t, MemoryRange range); void remove_stap_semaphore_range(Task* t, MemoryRange range); bool is_stap_semaphore(remote_ptr addr); bool legacy_breakpoint_mode() { return stopping_breakpoint_table_ != nullptr; } remote_code_ptr do_breakpoint_fault_addr() { return do_breakpoint_fault_addr_; } remote_code_ptr stopping_breakpoint_table() { return stopping_breakpoint_table_; } int stopping_breakpoint_table_entry_size() { return stopping_breakpoint_table_entry_size_; } // Also sets brk_ptr. enum { RRVDSO_PAGE_OFFSET = 2, RRPAGE_RECORD_PAGE_OFFSET = 3, RRPAGE_REPLAY_PAGE_OFFSET = 4 }; void map_rr_page(AutoRemoteSyscalls& remote); void unmap_all_but_rr_page(AutoRemoteSyscalls& remote); void erase_task(Task* t) { this->HasTaskSet::erase_task(t); if (task_set().size() != 0) { fd_tables_changed(); } } /** * Called when the set of different fd tables associated with tasks * in this address space may have changed (e.g. a task changed its fd table, * or a task got added or removed, etc). */ void fd_tables_changed(); static MemoryRange get_global_exclusion_range(); private: struct Breakpoint; typedef std::map BreakpointMap; class Watchpoint; /** * Called after a successful execve to set up the new AddressSpace. * Also called once for the initial spawn. */ AddressSpace(Task* t, const std::string& exe, uint32_t exec_count); /** * Called when an AddressSpace is cloned due to a fork() or a Session * clone. After this, once the task is properly set up, post_vm_clone will * be called. */ AddressSpace(Session* session, const AddressSpace& o, pid_t leader_tid, uint32_t leader_serial, uint32_t exec_count); /** * After an exec, populate the new address space of |t| with * the existing mappings we find in /proc/maps. */ void populate_address_space(Task* t); void unmap_internal(Task* t, remote_ptr addr, ssize_t num_bytes); bool update_watchpoint_value(const MemoryRange& range, Watchpoint& watchpoint); void update_watchpoint_values(remote_ptr start, remote_ptr end); enum WatchpointFilter { ALL_WATCHPOINTS, CHANGED_WATCHPOINTS }; std::vector get_watchpoints_internal(WatchpointFilter filter); enum WillSetTaskState { SETTING_TASK_STATE, NOT_SETTING_TASK_STATE }; std::vector get_watch_configs( WillSetTaskState will_set_task_state); /** * Construct a minimal set of watchpoints to be enabled based * on |set_watchpoint()| calls, and program them for each task * in this address space. */ bool allocate_watchpoints(); /** * Merge the mappings adjacent to |it| in memory that are * semantically "adjacent mappings" of the same resource as * well, for example have adjacent file offsets and the same * prot and flags. */ void coalesce_around(Task* t, MemoryMap::iterator it); /** * Erase |it| from |breakpoints| and restore any memory in * this address space that it may have overwritten.
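 *
 * A hedged sketch of what the restoration amounts to (helper names here
 * are illustrative rather than the exact calls; real code must go through
 * the tracee-memory helpers so the /proc/<tid>/mem-or-ptrace fallback
 * applies):
 *
 *   const Breakpoint& bp = it->second;
 *   // un-poke the trap: put the saved instruction byte(s) back
 *   t->write_mem(it->first.to_data_ptr<uint8_t>(), bp.overwritten_data,
 *                bkpt_instruction_length(t->arch()));  // length is arch-specific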
*/ void destroy_breakpoint(BreakpointMap::const_iterator it); /** * For each mapped segment overlapping [addr, addr + * num_bytes), call |f|. Pass |f| the overlapping mapping, * the mapped resource, and the range of addresses remaining * to be iterated over. * * Pass |ITERATE_CONTIGUOUS| to stop iterating when the last * contiguous mapping after |addr| within the region is seen. * Default is to iterate all mappings in the region. */ enum { ITERATE_DEFAULT, ITERATE_CONTIGUOUS }; void for_each_in_range( remote_ptr addr, ssize_t num_bytes, std::function f, int how = ITERATE_DEFAULT); /** * Map |m| of |r| into this address space, and coalesce any * mappings of |r| that are adjacent to |m|. */ void map_and_coalesce(Task* t, const KernelMapping& m, const KernelMapping& recorded_map, EmuFile::shr_ptr emu_file, std::unique_ptr mapped_file_stat, void* local_addr, std::shared_ptr&& monitored); void remove_from_map(const MemoryRange& range) { mem.erase(range); monitored_mem.erase(range.start()); } void add_to_map(const Mapping& m) { mem[m.map] = m; if (m.monitored_shared_memory) { monitored_mem.insert(m.map.start()); } } /** * Call this only during recording. */ template void at_preload_init_arch(Task* t); enum { EXEC_BIT = 1 << 0, READ_BIT = 1 << 1, WRITE_BIT = 1 << 2 }; /** Return the access bits above needed to watch |type|. */ static int access_bits_of(WatchType type); /** * Represents a refcount set on a particular address. Because there * can be multiple refcounts of multiple types set on a single * address, Breakpoint stores explicit USER and INTERNAL breakpoint * refcounts. Clients adding/removing breakpoints at this addr must * call ref()/unref() as appropriate. */ struct Breakpoint { Breakpoint() : internal_count(0), user_count(0) {} Breakpoint(const Breakpoint& o) = default; // AddressSpace::destroy_all_breakpoints() can cause this // destructor to be invoked while we have nonzero total // refcount, so the most we can DEBUG_ASSERT is that the refcounts // are valid. ~Breakpoint() { DEBUG_ASSERT(internal_count >= 0 && user_count >= 0); } void ref(BreakpointType which) { DEBUG_ASSERT(internal_count >= 0 && user_count >= 0); ++*counter(which); } int unref(BreakpointType which) { DEBUG_ASSERT(internal_count > 0 || user_count > 0); --*counter(which); DEBUG_ASSERT(internal_count >= 0 && user_count >= 0); return internal_count + user_count; } BreakpointType type() const { // NB: USER breakpoints need to be processed before // INTERNAL ones. We want to give the debugger a // chance to dispatch commands before we attend to the // internal rr business. So if there's a USER "ref" // on the breakpoint, treat it as a USER breakpoint. return user_count > 0 ? BKPT_USER : BKPT_INTERNAL; } uint8_t* original_data() { return overwritten_data; } // "Refcounts" of breakpoints set at |addr|. The breakpoint // object must be unique since we have to save the overwritten // data, and we can't enforce the order in which breakpoints // are set/removed. int internal_count, user_count; uint8_t overwritten_data[MAX_BKPT_INSTRUCTION_LENGTH]; int* counter(BreakpointType which) { DEBUG_ASSERT(BKPT_INTERNAL == which || BKPT_USER == which); int* p = BKPT_USER == which ? &user_count : &internal_count; DEBUG_ASSERT(*p >= 0); return p; } }; // XXX one is tempted to merge Breakpoint and Watchpoint into a single // entity, but the semantics are just different enough that separate // objects are easier for now. /** * Track the watched accesses of a contiguous range of memory * addresses. 
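 * Unlike Breakpoint, write watchpoints are triggered by value comparison:
 * update_watchpoint_value() snapshots the watched bytes into value_bytes,
 * and `changed` is set when a later snapshot differs. Read/exec watches
 * cannot be detected that way, so they consume hardware debug registers
 * (tracked in debug_regs_for_exec_read below).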
*/ class Watchpoint { public: Watchpoint(size_t num_bytes) : exec_count(0), read_count(0), write_count(0), value_bytes(num_bytes), valid(false), changed(false) {} Watchpoint(const Watchpoint&) = default; ~Watchpoint() { assert_valid(); } void watch(int which) { assert_valid(); exec_count += (EXEC_BIT & which) != 0; read_count += (READ_BIT & which) != 0; write_count += (WRITE_BIT & which) != 0; } int unwatch(int which) { assert_valid(); if (EXEC_BIT & which) { DEBUG_ASSERT(exec_count > 0); --exec_count; } if (READ_BIT & which) { DEBUG_ASSERT(read_count > 0); --read_count; } if (WRITE_BIT & which) { DEBUG_ASSERT(write_count > 0); --write_count; } return exec_count + read_count + write_count; } int watched_bits() const { return (exec_count > 0 ? EXEC_BIT : 0) | (read_count > 0 ? READ_BIT : 0) | (write_count > 0 ? WRITE_BIT : 0); } void assert_valid() const { DEBUG_ASSERT(exec_count >= 0 && read_count >= 0 && write_count >= 0); } // Watchpoints stay alive until all watched access types have // been cleared. We track refcounts of each watchable access // separately. int exec_count, read_count, write_count; // Debug registers allocated for read/exec access checking. // Write watchpoints are always triggered by checking for actual memory // value changes. Read/exec watchpoints can't be triggered that way, so // we look for these registers being triggered instead. std::vector debug_regs_for_exec_read; std::vector value_bytes; bool valid; bool changed; }; PropertyTable properties_; // All breakpoints set in this VM. BreakpointMap breakpoints; /* Path of the real executable image this address space was * exec()'d with. */ std::string exe; /* Pid of first task for this address space */ pid_t leader_tid_; /* Serial number of first task for this address space */ uint32_t leader_serial; uint32_t exec_count; // Only valid during recording remote_ptr brk_start; /* Current brk. Not necessarily page-aligned. */ remote_ptr brk_end; /* All segments mapped into this address space. */ MemoryMap mem; /* Sizes of SYSV shm segments, by address. We use this to determine the size * of memory regions unmapped via shmdt(). */ std::map, size_t> shm_sizes; std::set> monitored_mem; /* madvise DONTFORK regions */ std::set dont_fork; /* madvise WIPEONFORK regions */ std::set wipe_on_fork; // The session that created this. We save a ref to it so that // we can notify it when we die. Session* session_; // tid of the task whose thread-locals are in preload_thread_locals TaskUid thread_locals_tuid_; /* First mapped byte of the vdso. */ remote_ptr vdso_start_addr; // The monkeypatcher that's handling this address space. std::unique_ptr monkeypatch_state; // The watchpoints set for tasks in this VM. Watchpoints are // programmed per Task, but we track them per address space on // behalf of debuggers that assume that model. std::map watchpoints; std::vector> saved_watchpoints; // Tracee memory is read and written through this fd, which is // opened for the tracee's magic /proc/[tid]/mem device. The // advantage of this over ptrace is that we can access it even // when the tracee isn't at a ptrace-stop. It's also // theoretically faster for large data transfers, which rr can // do often. // // Users of child_mem_fd should fall back to ptrace-based memory // access when child_mem_fd is not open.
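  //
  // A hedged sketch of the fast path this enables (error handling elided;
  // the real entry points are read_bytes_helper()/write_bytes_helper()):
  //
  //   ssize_t n = pread64(child_mem_fd, buf, len, addr.as_int());
  //   if (n <= 0) { /* fall back to PTRACE_PEEKDATA-style access */ }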
ScopedFd child_mem_fd; remote_code_ptr traced_syscall_ip_; remote_code_ptr privileged_traced_syscall_ip_; bool syscallbuf_enabled_; remote_code_ptr do_breakpoint_fault_addr_; // These fields are deprecated and have been replaced by the // breakpoint_value mechanism. They are retained for replayability // of old traces. remote_code_ptr stopping_breakpoint_table_; int stopping_breakpoint_table_entry_size_; std::vector saved_auxv_; remote_ptr saved_interpreter_base_; std::string saved_ld_path_; /** * The time of the first event that ran code for a task in this address space. * 0 if no such event has occurred. */ FrameTime first_run_event_; std::set> stap_semaphores; /** * For each architecture, the offset of a syscall instruction within that * architecture's VDSO, or 0 if not known. */ static uint32_t offset_to_syscall_in_vdso[SupportedArch_MAX + 1]; /** * Ensure that the cached mapping of |t| matches /proc/maps, * using adjacent-map-merging heuristics that are as lenient * as possible given the data available from /proc/maps. */ static void check_segment_iterator(void* vasp, Task* t, const struct map_iterator_data* data); AddressSpace operator=(const AddressSpace&) = delete; }; /** * The following helper is used to iterate over a tracee's memory * map. */ class KernelMapIterator { public: KernelMapIterator(Task* t); KernelMapIterator(pid_t tid) : tid(tid) { init(); } ~KernelMapIterator(); // It's very important to keep in mind that btrfs files can have the wrong // device number! const KernelMapping& current(std::string* raw_line = nullptr) { if (raw_line) { *raw_line = this->raw_line; } return km; } bool at_end() { return !maps_file; } void operator++(); private: void init(); pid_t tid; FILE* maps_file; std::string raw_line; KernelMapping km; }; } // namespace rr #endif /* RR_ADDRESS_SPACE_H_ */ rr-5.5.0/src/AutoRemoteSyscalls.cc000066400000000000000000000561441412202446200170510ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "AutoRemoteSyscalls.h" #include #include #include #include #include "rr/rr.h" #include "RecordSession.h" #include "RecordTask.h" #include "ReplaySession.h" #include "Session.h" #include "Task.h" #include "core.h" #include "kernel_metadata.h" #include "log.h" #include "util.h" using namespace std; namespace rr { /** * The ABI of the socketcall syscall is a nightmare; the first arg to * the kernel is the sub-operation, and the second argument is a * pointer to the args. The args depend on the sub-op. */ template struct socketcall_args { typename Arch::signed_long args[3]; } __attribute__((packed)); void AutoRestoreMem::init(const void* mem, ssize_t num_bytes) { ASSERT(remote.task(), remote.enable_mem_params() == AutoRemoteSyscalls::ENABLE_MEMORY_PARAMS) << "Memory parameters were disabled"; len = num_bytes; saved_sp = remote.regs().sp(); remote.regs().set_sp(remote.regs().sp() - len); remote.task()->set_regs(remote.regs()); addr = remote.regs().sp(); data.resize(len); bool ok = true; remote.task()->read_bytes_helper(addr, len, data.data(), &ok); if (mem) { remote.task()->write_bytes_helper(addr, len, mem, &ok); } if (!ok) { addr = nullptr; } } AutoRestoreMem::~AutoRestoreMem() { DEBUG_ASSERT(saved_sp == remote.regs().sp() + len); if (addr) { // XXX what should we do if this task was sigkilled but the address // space is used by other live tasks?
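    // Restore the saved bytes before popping the reservation, so the
    // scratch range below the original sp holds its old contents again
    // by the time the tracee's registers are restored below.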
remote.task()->write_bytes_helper(addr, len, data.data()); } remote.regs().set_sp(remote.regs().sp() + len); remote.task()->set_regs(remote.regs()); } static bool is_SIGTRAP_default_and_unblocked(Task* t) { if (!t->session().is_recording()) { return true; } RecordTask* rt = static_cast(t); return rt->sig_disposition(SIGTRAP) == SIGNAL_DEFAULT && !rt->is_sig_blocked(SIGTRAP); } AutoRemoteSyscalls::AutoRemoteSyscalls(Task* t, MemParamsEnabled enable_mem_params) : t(t), initial_regs(t->regs()), initial_ip(t->ip()), initial_sp(t->regs().sp()), initial_at_seccomp(t->ptrace_event() == PTRACE_EVENT_SECCOMP), restore_wait_status(t->status()), new_tid_(-1), scratch_mem_was_mapped(false), use_singlestep_path(false), enable_mem_params_(enable_mem_params) { if (initial_at_seccomp) { // This should only ever happen during recording - we don't use the // seccomp traps during replay. ASSERT(t, t->session().is_recording()); } // We support two paths for syscalls: // -- a fast path using a privileged untraced syscall and PTRACE_SINGLESTEP. // This only requires a single task-wait. // -- a slower path using a privileged traced syscall and PTRACE_SYSCALL/ // PTRACE_CONT via Task::enter_syscall(). This requires 2 or 3 task-waits // depending on whether the seccomp event fires before the syscall-entry // event. // Use the slow path when running under rr, because the rr recording us // needs to see and trace these tracee syscalls, and if they're untraced by // us they're also untraced by the outer rr. // Use the slow path if SIGTRAP is blocked or ignored because otherwise // the PTRACE_SINGLESTEP will cause the kernel to unblock it. setup_path(t->vm()->has_rr_page() && !running_under_rr() && is_SIGTRAP_default_and_unblocked(t)); if (enable_mem_params == ENABLE_MEMORY_PARAMS) { maybe_fix_stack_pointer(); } } void AutoRemoteSyscalls::setup_path(bool enable_singlestep_path) { #if defined(__aarch64__) // XXXkhuey this fast path doesn't work on AArch64 yet, go slow instead enable_singlestep_path = false; #endif if (!replaced_bytes.empty()) { // XXX what to do here to clean up if the task died unexpectedly? t->write_mem(remote_ptr(initial_regs.ip().to_data_ptr()), replaced_bytes.data(), replaced_bytes.size()); } remote_code_ptr syscall_ip; use_singlestep_path = enable_singlestep_path; if (use_singlestep_path) { syscall_ip = AddressSpace::rr_page_syscall_entry_point( AddressSpace::UNTRACED, AddressSpace::PRIVILEGED, AddressSpace::RECORDING_AND_REPLAY, t->arch()); } else { syscall_ip = t->vm()->traced_syscall_ip(); } initial_regs.set_ip(syscall_ip); // We need to make sure to clear any breakpoints or other alterations of // the syscall instruction we're using. Note that the tracee may have set its // own breakpoints or otherwise modified the instruction, so suspending our // own breakpoint is insufficient. 
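  // For reference: rr::syscall_instruction() returns the arch-specific
  // byte sequence, e.g. 0f 05 (syscall) on x86-64 or 01 00 00 d4
  // (svc #0, little-endian) on AArch64.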
std::vector syscall = rr::syscall_instruction(t->arch()); bool ok = true; replaced_bytes = t->read_mem(initial_regs.ip().to_data_ptr(), syscall.size(), &ok); if (!ok) { // The task died return; } if (replaced_bytes == syscall) { replaced_bytes.clear(); } else { t->write_mem(initial_regs.ip().to_data_ptr(), syscall.data(), syscall.size(), &ok); } } static bool is_usable_area(const KernelMapping& km) { return (km.prot() & (PROT_READ | PROT_WRITE)) == (PROT_READ | PROT_WRITE) && (km.flags() & MAP_PRIVATE); } void AutoRemoteSyscalls::maybe_fix_stack_pointer() { if (!t->session().done_initial_exec()) { return; } remote_ptr last_stack_byte = t->regs().sp() - 1; if (t->vm()->has_mapping(last_stack_byte)) { auto m = t->vm()->mapping_of(last_stack_byte); if (is_usable_area(m.map) && m.map.start() + 2048 <= t->regs().sp()) { // 'sp' is in a stack region and there's plenty of space there. No need // to fix anything. return; } } MemoryRange found_stack; for (const auto& m : t->vm()->maps()) { if (is_usable_area(m.map)) { found_stack = m.map; break; } }; if (found_stack.start().is_null()) { AutoRemoteSyscalls remote(t, DISABLE_MEMORY_PARAMS); found_stack = MemoryRange(remote.infallible_mmap_syscall( remote_ptr(), 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), 4096); scratch_mem_was_mapped = true; } fixed_sp = found_stack.end(); DEBUG_ASSERT(!fixed_sp.is_null()); initial_regs.set_sp(fixed_sp); } AutoRemoteSyscalls::~AutoRemoteSyscalls() { restore_state_to(t); } void AutoRemoteSyscalls::restore_state_to(Task* t) { // Unmap our scratch region if required if (scratch_mem_was_mapped) { AutoRemoteSyscalls remote(t, DISABLE_MEMORY_PARAMS); remote.infallible_syscall(syscall_number_for_munmap(arch()), fixed_sp - 4096, 4096); } if (!replaced_bytes.empty()) { // XXX how to clean up if the task died and the address space is shared with live task? t->write_mem(remote_ptr(initial_regs.ip().to_data_ptr()), replaced_bytes.data(), replaced_bytes.size()); } auto regs = initial_regs; regs.set_ip(initial_ip); regs.set_sp(initial_sp); // Restore stomped registers. t->set_regs(regs); // If we were sitting at a seccomp trap, try to get back there by resuming // here. Since the original register contents caused a seccomp trap, // re-running the syscall with the same registers should put us right back // to this same seccomp trap.
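  // Any unrelated signals that land while we re-run the syscall are
  // stash_sig()ed below rather than dropped; record-time signal
  // handling delivers them later.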
if (initial_at_seccomp && t->ptrace_event() != PTRACE_EVENT_SECCOMP) { RecordTask* rt = static_cast(t); while (true) { rt->resume_execution(RESUME_CONT, RESUME_WAIT, RESUME_NO_TICKS); if (rt->ptrace_event()) break; rt->stash_sig(); } ASSERT(rt, rt->ptrace_event() == PTRACE_EVENT_SECCOMP); } t->set_status(restore_wait_status); } static bool ignore_signal(Task* t) { int sig = t->stop_sig(); if (!sig) { return false; } if (t->session().is_replaying()) { if (ReplaySession::is_ignored_signal(sig)) { return true; } } else if (t->session().is_recording()) { auto rt = static_cast(t); if (sig != rt->session().syscallbuf_desched_sig()) { rt->stash_sig(); } return true; } ASSERT(t, false) << "Unexpected signal " << signal_name(sig); return false; } long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) { LOG(debug) << "syscall " << syscall_name(syscallno, t->arch()) << " " << callregs; if (t->is_dying()) { LOG(debug) << "Task is dying, don't try anything."; return -ESRCH; } if ((int)callregs.arg1() == SIGTRAP && use_singlestep_path && (is_sigaction_syscall(syscallno, t->arch()) || is_rt_sigaction_syscall(syscallno, t->arch()) || is_signal_syscall(syscallno, t->arch()))) { // Don't use the fast path if we're about to set up a signal handler // for SIGTRAP! LOG(debug) << "Disabling singlestep path due to SIGTRAP sigaction"; setup_path(false); callregs.set_ip(initial_regs.ip()); } callregs.set_original_syscallno(syscallno); callregs.set_syscallno(syscallno); t->set_regs(callregs); if (use_singlestep_path) { while (true) { t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS); LOG(debug) << "Used singlestep path; status=" << t->status(); // When a PTRACE_EVENT_EXIT is returned we don't update registers if (t->ip() != callregs.ip()) { // We entered the syscall, so stop now break; } if (t->ptrace_event() == PTRACE_EVENT_EXIT) { // We died, just let it be break; } if (t->stop_sig() == SIGTRAP && t->get_siginfo().si_code == TRAP_TRACE) { // On aarch64, if we were previously in a syscall-exit stop, continuing // with PTRACE_SINGLESTEP will result in incurring a trap upon execution // of the first instruction in userspace. Ignore such a trap. continue; } if (ignore_signal(t)) { // We were interrupted by a signal before we even entered the syscall continue; } ASSERT(t, false) << "Unexpected status " << t->status(); } } else { if (initial_at_seccomp && t->ptrace_event() == PTRACE_EVENT_SECCOMP) { LOG(debug) << "Skipping enter_syscall - already at seccomp stop"; } else { t->enter_syscall(); } LOG(debug) << "Used enter_syscall; status=" << t->status(); // proceed to syscall exit t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); LOG(debug) << "syscall exit status=" << t->status(); } while (true) { // If the syscall caused the task to exit, just stop now with that status. if (t->ptrace_event() == PTRACE_EVENT_EXIT) { restore_wait_status = t->status(); break; } if (t->status().is_syscall() || (t->stop_sig() == SIGTRAP && is_kernel_trap(t->get_siginfo().si_code))) { // If we got a SIGTRAP then we assume that's our singlestep and we're // done. 
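      // (is_kernel_trap() checks si_code so that a SIGTRAP sent by another
      // process isn't mistaken for our own singlestep or breakpoint trap.)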
break; } if (is_clone_syscall(syscallno, t->arch()) && t->clone_syscall_is_complete(&new_tid_, t->arch())) { t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); LOG(debug) << "got clone event; new status=" << t->status(); continue; } if (ignore_signal(t)) { if (t->regs().syscall_may_restart()) { t->enter_syscall(); LOG(debug) << "signal ignored; restarting syscall, status=" << t->status(); t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); LOG(debug) << "syscall exit status=" << t->status(); continue; } LOG(debug) << "signal ignored"; // We have been notified of a signal after a non-interruptible syscall // completed. Don't continue, we're done here. break; } ASSERT(t, false) << "Unexpected status " << t->status(); break; } if (t->is_dying()) { LOG(debug) << "Task is dying, no status result"; return -ESRCH; } else { LOG(debug) << "done, result=" << t->regs().syscall_result(); return t->regs().syscall_result(); } } SupportedArch AutoRemoteSyscalls::arch() const { return t->arch(); } template static void write_socketcall_args(Task* t, remote_ptr remote_mem, typename Arch::signed_long arg1, typename Arch::signed_long arg2, typename Arch::signed_long arg3, bool* ok) { socketcall_args sc_args = { { arg1, arg2, arg3 } }; t->write_mem(remote_mem.cast>(), sc_args, ok); } template struct fd_message { // Unfortunately we need to send at least one byte of data in our // message for it to work char data; typename Arch::iovec msgdata; char cmsgbuf[Arch::cmsg_space(sizeof(int))]; typename Arch::msghdr msg; // XXX: Could make this conditional on Arch socketcall_args socketcall; void init(remote_ptr> base) { data = 0; msgdata.iov_base = REMOTE_PTR_FIELD(base, data); msgdata.iov_len = 1; memset(&msg, 0, sizeof(msg)); msg.msg_control = REMOTE_PTR_FIELD(base, cmsgbuf); msg.msg_controllen = sizeof(cmsgbuf); msg.msg_iov = REMOTE_PTR_FIELD(base, msgdata); msg.msg_iovlen = 1; } fd_message(remote_ptr> base) { init(base); } fd_message() { init((uintptr_t)this); } remote_ptr> remote_this() { return msgdata.iov_base.rptr().as_int(); } remote_ptr remote_msg() { return REMOTE_PTR_FIELD(remote_this(), msg); } remote_ptr> remote_sc_args() { return REMOTE_PTR_FIELD(remote_this(), socketcall); } remote_ptr remote_cmsgdata() { return REMOTE_PTR_FIELD(remote_this(), cmsgbuf).as_int() + (uintptr_t)Arch::cmsg_data(NULL); } }; template static long child_sendmsg(AutoRemoteSyscalls& remote, int child_sock, int fd) { AutoRestoreMem remote_buf(remote, nullptr, sizeof(fd_message)); fd_message msg(remote_buf.get().cast>()); // Pull the puppet strings to have the child send its fd // to us. Similarly to above, we DONT_WAIT on the // call to finish, since it's likely not defined whether the // sendmsg() may block on our recvmsg()ing what the tracee // sent us (in which case we would deadlock with the tracee). // We call sendmsg on child socket, but first we have to prepare a lot of // data. 
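  // SCM_RIGHTS semantics: the kernel installs a dup of the sent fd in the
  // receiver, so the fd number we observe on our side need not match `fd`
  // in the tracee; only the underlying file description is shared.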
auto cmsg = reinterpret_cast(msg.cmsgbuf); cmsg->cmsg_len = Arch::cmsg_len(sizeof(fd)); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; *static_cast(Arch::cmsg_data(cmsg)) = fd; if (has_socketcall_syscall(Arch::arch())) { socketcall_args sc_args = { { child_sock, (typename Arch::signed_long)msg.remote_msg().as_int(), 0 } }; msg.socketcall = sc_args; } bool ok = true; remote.task()->write_bytes_helper(remote_buf.get().cast(), sizeof(msg), &msg, &ok); if (!ok) { return -ESRCH; } if (!has_socketcall_syscall(Arch::arch())) { return remote.syscall(Arch::sendmsg, child_sock, msg.remote_msg(), 0); } return remote.syscall(Arch::socketcall, SYS_SENDMSG, msg.remote_sc_args()); } template static long child_recvmsg(AutoRemoteSyscalls& remote, int child_sock) { AutoRestoreMem remote_buf(remote, nullptr, sizeof(fd_message)); fd_message msg(remote_buf.get().cast>()); bool ok = true; if (has_socketcall_syscall(Arch::arch())) { socketcall_args sc_args = { { child_sock, (typename Arch::signed_long)msg.remote_msg().as_int(), 0 } }; msg.socketcall = sc_args; } remote.task()->write_bytes_helper(remote_buf.get().cast(), sizeof(msg), &msg, &ok); if (!ok) { return -ESRCH; } int ret = 0; if (has_socketcall_syscall(Arch::arch())) { ret = remote.syscall(Arch::socketcall, SYS_RECVMSG, msg.remote_sc_args()); } else { ret = remote.syscall(Arch::recvmsg, child_sock, msg.remote_msg(), 0); } if (ret < 0) { return ret; } int their_fd = remote.task()->read_mem(msg.remote_cmsgdata(), &ok); if (!ok) { return -ESRCH; } return their_fd; } static int recvmsg_socket(ScopedFd& sock) { fd_message msg; struct msghdr *msgp = (struct msghdr*)&msg.msg; if (0 > recvmsg(sock, msgp, MSG_CMSG_CLOEXEC)) { return -1; } struct cmsghdr* cmsg = CMSG_FIRSTHDR(msgp); DEBUG_ASSERT(cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS); int our_fd = *(int*)CMSG_DATA(cmsg); DEBUG_ASSERT(our_fd >= 0); return our_fd; } static void sendmsg_socket(ScopedFd& sock, int fd_to_send) { fd_message msg; struct msghdr *msgp = (struct msghdr*)&msg.msg; struct cmsghdr* cmsg = CMSG_FIRSTHDR(msgp); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; cmsg->cmsg_len = CMSG_LEN(sizeof(fd_to_send)); *(int*)CMSG_DATA(cmsg) = fd_to_send; if (0 > sendmsg(sock, msgp, 0)) { FATAL() << "Failed to send fd"; } } template ScopedFd AutoRemoteSyscalls::retrieve_fd_arch(int fd) { long child_syscall_result = child_sendmsg(*this, task()->session().tracee_fd_number(), fd); if (child_syscall_result == -ESRCH) { return ScopedFd(); } ASSERT(t, child_syscall_result > 0) << "Failed to sendmsg() in tracee; err=" << errno_name(-child_syscall_result); int our_fd = recvmsg_socket(task()->session().tracee_socket_fd()); ASSERT(t, our_fd >= 0) << "Failed to receive fd"; return ScopedFd(our_fd); } ScopedFd AutoRemoteSyscalls::retrieve_fd(int fd) { RR_ARCH_FUNCTION(retrieve_fd_arch, arch(), fd); } template int AutoRemoteSyscalls::send_fd_arch(const ScopedFd &our_fd) { if (!our_fd.is_open()) { return -EBADF; } LOG(debug) << "Sending fd " << our_fd.get() << " via socket fd " << task()->session().tracee_socket_fd().get(); sendmsg_socket(task()->session().tracee_socket_fd(), our_fd.get()); long child_syscall_result = child_recvmsg(*this, task()->session().tracee_fd_number()); if (child_syscall_result == -ESRCH) { /* The child did not receive the message. Read it out of the socket buffer so it doesn't get read by another child later! 
*/ int fd = recvmsg_socket(task()->session().tracee_socket_receiver_fd()); if (fd >= 0) { close(fd); } return -ESRCH; } ASSERT(t, child_syscall_result >= 0) << "Failed to recvmsg() in tracee; err=" << errno_name(-child_syscall_result); return child_syscall_result; } int AutoRemoteSyscalls::send_fd(const ScopedFd &our_fd) { RR_ARCH_FUNCTION(send_fd_arch, arch(), our_fd); } void AutoRemoteSyscalls::infallible_send_fd_dup(const ScopedFd& our_fd, int dup_to) { int remote_fd = send_fd(our_fd); ASSERT(task(), remote_fd >= 0); if (remote_fd != dup_to) { long ret = infallible_syscall(syscall_number_for_dup3(arch()), remote_fd, dup_to, O_CLOEXEC); ASSERT(task(), ret == dup_to); infallible_syscall(syscall_number_for_close(arch()), remote_fd); } } remote_ptr AutoRemoteSyscalls::infallible_mmap_syscall( remote_ptr addr, size_t length, int prot, int flags, int child_fd, uint64_t offset_pages) { // The first syscall argument is called "arg 1", so // our syscall-arg-index template parameter starts // with "1". remote_ptr ret = has_mmap2_syscall(arch()) ? infallible_syscall_ptr(syscall_number_for_mmap2(arch()), addr, length, prot, flags, child_fd, (off_t)offset_pages) : infallible_syscall_ptr(syscall_number_for_mmap(arch()), addr, length, prot, flags, child_fd, offset_pages * page_size()); if (flags & MAP_FIXED) { ASSERT(t, addr == ret) << "MAP_FIXED at " << addr << " but got " << ret; } return ret; } int64_t AutoRemoteSyscalls::infallible_lseek_syscall(int fd, int64_t offset, int whence) { switch (arch()) { case x86: { AutoRestoreMem mem(*this, &offset, sizeof(int64_t)); infallible_syscall(syscall_number_for__llseek(arch()), fd, offset >> 32, offset, mem.get(), whence); return t->read_mem(mem.get().cast()); } case x86_64: return infallible_syscall(syscall_number_for_lseek(arch()), fd, offset, whence); default: ASSERT(t, false) << "Unknown arch"; return -1; } } void AutoRemoteSyscalls::check_syscall_result(long ret, int syscallno, bool allow_death) { if (word_size(t->arch()) == 4) { // Sign-extend ret because it can be a 32-bit negative errno ret = (int)ret; } if (allow_death && ret == -ESRCH) { return; } if (-4096 < ret && ret < 0) { string extra_msg; if (is_open_syscall(syscallno, arch())) { extra_msg = " opening " + t->read_c_str(t->regs().arg1()); } else if (is_openat_syscall(syscallno, arch())) { extra_msg = " opening " + t->read_c_str(t->regs().arg2()); } ASSERT(t, false) << "Syscall " << syscall_name(syscallno, arch()) << " failed with errno " << errno_name(-ret) << extra_msg; } } void AutoRemoteSyscalls::finish_direct_mmap( remote_ptr rec_addr, size_t length, int prot, int flags, const string& backing_file_name, int backing_file_open_flags, off64_t backing_offset_pages, struct stat& real_file, string& real_file_name) { int fd; LOG(debug) << "directly mmap'ing " << length << " bytes of " << backing_file_name << " at page offset " << HEX(backing_offset_pages); ASSERT(task(), !(flags & MAP_GROWSDOWN)); /* Open in the tracee the file that was mapped during * recording. */ { AutoRestoreMem child_str(*this, backing_file_name.c_str()); fd = infallible_syscall(syscall_number_for_openat(arch()), -1, child_str.get().as_int(), backing_file_open_flags); } /* And mmap that file. */ infallible_mmap_syscall(rec_addr, length, /* (We let SHARED|WRITEABLE * mappings go through while * they're not handled properly, * but we shouldn't do that.) 
*/ prot, (flags & ~MAP_SYNC) | MAP_FIXED, fd, /* MAP_SYNC is used to request direct mapping * (DAX) from the filesystem for persistent * memory devices (requires * MAP_SHARED_VALIDATE). Drop it for the * backing file. */ backing_offset_pages); // While it's open, grab the link reference. real_file = task()->stat_fd(fd); real_file_name = task()->file_name_of_fd(fd); /* Don't leak the tmp fd. The mmap doesn't need the fd to * stay open. */ infallible_syscall(syscall_number_for_close(arch()), fd); } } // namespace rr rr-5.5.0/src/AutoRemoteSyscalls.h000066400000000000000000000222201412202446200167010ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_AUTO_REMOTE_SYSCALLS_H_ #define RR_AUTO_REMOTE_SYSCALLS_H_ #include #include #include "Registers.h" #include "ScopedFd.h" #include "Task.h" namespace rr { class AutoRemoteSyscalls; class Task; /** * Helpers to make remote syscalls on behalf of a Task. Usage looks * like * * AutoRemoteSyscalls remote(t); // prepare remote syscalls * remote.syscall(syscall_number_for_open(remote.arch()), ...); // make *syscalls * ... * // when |remote| goes out of scope, remote syscalls are finished */ /** * Cookie used to restore stomped memory, usually prepared as the * argument to a remote syscall. */ class AutoRestoreMem { public: /** * Write |mem| into address space of the Task prepared for * remote syscalls in |remote|, in such a way that the write * will be undone. The address of the reserved mem space is * available via |get|. * If |mem| is null, data is not written, only the space is reserved. */ AutoRestoreMem(AutoRemoteSyscalls& remote, const void* mem, ssize_t num_bytes) : remote(remote) { init(mem, num_bytes); } /** * Convenience constructor for pushing a C string |str|, including * the trailing '\0' byte. */ AutoRestoreMem(AutoRemoteSyscalls& remote, const char* str) : remote(remote) { init((const uint8_t*)str, strlen(str) + 1 /*null byte*/); } ~AutoRestoreMem(); /** * Get a pointer to the reserved memory. * Returns null if we failed. */ remote_ptr get() const { return addr; } /** * Return size of reserved memory buffer. */ size_t size() const { return data.size(); } private: void init(const void* mem, ssize_t num_bytes); AutoRemoteSyscalls& remote; /* Address of tmp mem. */ remote_ptr addr; /* Saved data. */ std::vector data; /* (We keep this around for error checking.) */ remote_ptr saved_sp; /* Length of tmp mem. */ size_t len; AutoRestoreMem& operator=(const AutoRestoreMem&) = delete; AutoRestoreMem(const AutoRestoreMem&) = delete; void* operator new(size_t) = delete; void operator delete(void*) = delete; }; /** * RAII helper to prepare a Task for remote syscalls and undo any * preparation upon going out of scope. Note that this restores register * values when going out of scope, so *all* changes to Task's register * state are lost. */ class AutoRemoteSyscalls { public: enum MemParamsEnabled { ENABLE_MEMORY_PARAMS, DISABLE_MEMORY_PARAMS }; /** * Prepare |t| for a series of remote syscalls. * * NBBB! Before preparing for a series of remote syscalls, * the caller *must* ensure the callee will not receive any * signals. This code does not attempt to deal with signals. */ AutoRemoteSyscalls(Task* t, MemParamsEnabled enable_mem_params = ENABLE_MEMORY_PARAMS); /** * Undo in |t| any preparations that were made for a series of * remote syscalls. */ ~AutoRemoteSyscalls(); /** * If t's stack pointer doesn't look valid, temporarily adjust it to * the top of *some* stack area. 
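   *
   * This matters because remote syscalls and AutoRestoreMem write to the
   * memory just below sp; if sp points at unmapped or read-only memory
   * (which can happen, e.g., early in thread setup), those writes would
   * fail.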
*/ void maybe_fix_stack_pointer(); /** * "Initial" registers saved from the target task. * * NB: a non-const reference is returned because some power * users want to update the registers that are restored after * finishing remote syscalls. Perhaps these users should be * fixed, or you should just be careful. */ Registers& regs() { return initial_regs; } /** * Undo any preparations to make remote syscalls in the context of |t|. * * This is usually called automatically by the destructor; * don't call it directly unless you really know what you're * doing. *ESPECIALLY* don't call this on a |t| other than * the one passed to the constructor, unless you really know * what you're doing. */ void restore_state_to(Task* t); /** * Make |syscallno| with variadic |args| (limited to 6 on * x86). Return the raw kernel return value. * Returns -ESRCH if the process dies or has died. */ template long syscall(int syscallno, Rest... args) { Registers callregs = regs(); // The first syscall argument is called "arg 1", so // our syscall-arg-index template parameter starts // with "1". return syscall_helper<1>(syscallno, callregs, args...); } template long infallible_syscall(int syscallno, Rest... args) { Registers callregs = regs(); // The first syscall argument is called "arg 1", so // our syscall-arg-index template parameter starts // with "1". long ret = syscall_helper<1>(syscallno, callregs, args...); check_syscall_result(ret, syscallno); return ret; } template long infallible_syscall_if_alive(int syscallno, Rest... args) { Registers callregs = regs(); // The first syscall argument is called "arg 1", so // our syscall-arg-index template parameter starts // with "1". long ret = syscall_helper<1>(syscallno, callregs, args...); check_syscall_result(ret, syscallno, true); return ret; } template remote_ptr infallible_syscall_ptr(int syscallno, Rest... args) { Registers callregs = regs(); long ret = syscall_helper<1>(syscallno, callregs, args...); check_syscall_result(ret, syscallno); return ret; } /** * Remote mmap syscalls are common and non-trivial due to the need to * select either mmap2 or mmap. */ remote_ptr infallible_mmap_syscall(remote_ptr addr, size_t length, int prot, int flags, int child_fd, uint64_t offset_pages); int64_t infallible_lseek_syscall(int fd, int64_t offset, int whence); /** The Task in the context of which we're making syscalls. */ Task* task() const { return t; } /** * A small helper to get at the Task's arch. * Out-of-line to avoid including Task.h here. */ SupportedArch arch() const; /** * Arranges for 'fd' to be transmitted to this process and returns * our opened version of it. * Returns a closed fd if the process dies or has died. */ ScopedFd retrieve_fd(int fd); /** * Arranges for 'fd' to be transmitted to the tracee and returns * a file descriptor in the tracee that corresponds to the same file * description. * Returns a negative value if the process dies or has died. */ int send_fd(const ScopedFd &fd); /** * `send_fd` the given file descriptor, making sure that it ends up as fd * `dup_to` (dup'ing it there and closing the original if necessary). */ void infallible_send_fd_dup(const ScopedFd& our_fd, int dup_to); /** * Remotely invoke in |t| the specified syscall with the given * arguments. The arguments must of course be valid in |t|, * and no checking of that is done by this function. * * The syscall is finished in |t| and the result is returned.
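   *
   * A hedged usage sketch (this is what the variadic syscall() wrappers
   * above boil down to; <1> is the index of the first syscall argument):
   *
   *   Registers callregs = remote.regs();
   *   callregs.set_arg<1>(fd);
   *   long ret = remote.syscall_base(syscall_number_for_close(remote.arch()),
   *                                  callregs);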
*/ long syscall_base(int syscallno, Registers& callregs); MemParamsEnabled enable_mem_params() { return enable_mem_params_; } /** * When the syscall is 'clone', this will be recovered from the * PTRACE_EVENT_FORK/VFORK/CLONE. */ pid_t new_tid() { return new_tid_; } /* Do the open/mmap/close dance for a particular file */ void finish_direct_mmap(remote_ptr rec_addr, size_t length, int prot, int flags, const std::string& backing_file_name, int backing_file_open_flags, off64_t backing_offset_pages, struct stat& real_file, std::string& real_file_name); private: void setup_path(bool enable_singlestep_path); void check_syscall_result(long ret, int syscallno, bool allow_death=false); /** * "Recursively" build the set of syscall registers in * |callregs|. |Index| is the syscall arg that will be set to * |arg|, and |args| are the remaining arguments. */ template long syscall_helper(int syscallno, Registers& callregs, T arg, Rest... args) { callregs.set_arg(arg); return syscall_helper(syscallno, callregs, args...); } /** * "Recursion" "base case": no more arguments to build, so * just make the syscall and return the kernel return value. */ template long syscall_helper(int syscallno, Registers& callregs) { return syscall_base(syscallno, callregs); } template ScopedFd retrieve_fd_arch(int fd); template int send_fd_arch(const ScopedFd &fd); Task* t; Registers initial_regs; remote_code_ptr initial_ip; remote_ptr initial_sp; bool initial_at_seccomp; remote_ptr fixed_sp; std::vector replaced_bytes; WaitStatus restore_wait_status; pid_t new_tid_; /* Whether we had to mmap a scratch region because none was found */ bool scratch_mem_was_mapped; bool use_singlestep_path; MemParamsEnabled enable_mem_params_; AutoRemoteSyscalls& operator=(const AutoRemoteSyscalls&) = delete; AutoRemoteSyscalls(const AutoRemoteSyscalls&) = delete; }; } // namespace rr #endif // RR_AUTO_REMOTE_SYSCALLS_H_ rr-5.5.0/src/BreakpointCondition.h000066400000000000000000000005461412202446200170530ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_BREAKPOINT_CONDITION_H_ #define RR_BREAKPOINT_CONDITION_H_ namespace rr { class Task; class BreakpointCondition { public: virtual ~BreakpointCondition() {} virtual bool evaluate(Task* t) const = 0; }; } // namespace rr #endif // RR_BREAKPOINT_CONDITION_H_ rr-5.5.0/src/BuildidCommand.cc000066400000000000000000000022761412202446200161210ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include #include "log.h" #include "Command.h" #include "ElfReader.h" #include "ScopedFd.h" using namespace std; namespace rr { class BuildidCommand : public Command { public: virtual int run(vector& args) override; protected: BuildidCommand(const char* name, const char* help) : Command(name, help) {} static BuildidCommand singleton; }; BuildidCommand BuildidCommand::singleton( "buildid", " rr buildid\n" " Accepts paths on stdin, prints buildids on stdout. 
Will terminate when\n" " either an empty line or an invalid path is provided.\n"); int BuildidCommand::run(vector& args) { if (!args.empty()) { fprintf(stderr, "Unexpected arguments!"); return 1; } string input; while (getline(cin, input)) { if (input.empty()) { break; } ScopedFd fd = ScopedFd(input.c_str(), O_RDONLY, 0); if (!fd.is_open()) { LOG(error) << "Failed to open `" << input << "`"; return 1; } ElfFileReader reader(fd); auto buildid = reader.read_buildid(); fprintf(stdout, "%s\n", buildid.c_str()); } return 0; } } // namespace rr rr-5.5.0/src/CPUFeaturesCommand.cc000066400000000000000000000027131412202446200166670ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Command.h" #include "GdbServer.h" #include "main.h" #include "util.h" using namespace std; namespace rr { class CPUFeaturesCommand : public Command { public: virtual int run(vector& args) override; protected: CPUFeaturesCommand(const char* name, const char* help) : Command(name, help) {} static CPUFeaturesCommand singleton; }; CPUFeaturesCommand CPUFeaturesCommand::singleton( "cpufeatures", " rr cpufeatures\n" " Print `rr record` command line options that will limit the tracee\n" " to CPU features this machine supports.\n" " Useful for trace portability: run `rr cpufeatures` on the machine\n" " you plan to replay on, then add those command-line parameters to\n" " `rr record` on the recording machine.\n"); int CPUFeaturesCommand::run(vector& args) { while (parse_global_option(args)) { } CPUIDData features = cpuid(CPUID_GETFEATURES, 0); CPUIDData extended_features = cpuid(CPUID_GETEXTENDEDFEATURES, 0); CPUIDData features_xsave = cpuid(CPUID_GETXSAVE, 1); fprintf(stdout, "--disable-cpuid-features 0x%x,0x%x " "--disable-cpuid-features-ext 0x%x,0x%x,0x%x " "--disable-cpuid-features-xsave 0x%x\n", ~features.ecx, ~features.edx, ~extended_features.ebx, ~extended_features.ecx, ~extended_features.edx, ~features_xsave.eax); return 0; } } // namespace rr rr-5.5.0/src/CPUIDBugDetector.cc000066400000000000000000000050061412202446200162340ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "CPUIDBugDetector.h" #include "Event.h" #include "Flags.h" #include "ReplaySession.h" #include "ReplayTask.h" #include "kernel_abi.h" using namespace std; namespace rr { #if defined(__i386__) || defined(__x86_64__) extern "C" int cpuid_loop(int iterations); void CPUIDBugDetector::run_detection_code() { // Call cpuid_loop to generate trace data we can use to detect // the cpuid rcb undercount bug. This generates 4 geteuid // calls which should have 2 rcbs between each of the // 3 consecutive pairs. cpuid_loop(4); } #else // Other platforms don't have cpuid, but keep the calling code clean, by // just making this a no-op there. void CPUIDBugDetector::run_detection_code() {} #endif static bool rcb_counts_ok(ReplayTask* t, uint64_t prev, uint64_t current) { uint32_t expected_count = 2 + PerfCounters::ticks_for_direct_call(t); if (current - prev == expected_count) { return true; } if (!Flags::get().suppress_environment_warnings) { fprintf( stderr, "\n" "rr: Warning: You appear to be running in a VMWare guest with a bug\n" " where a conditional branch instruction between two CPUID " "instructions\n" " sometimes fails to be counted by the conditional branch " "performance\n" " counter. 
Work around this problem by adding\n" " monitor_control.disable_hvsim_clusters = true\n" " to your .vmx file.\n" "\n"); } return false; } void CPUIDBugDetector::notify_reached_syscall_during_replay(ReplayTask* t) { // We only care about events that happen before the first exec, // when our detection code runs. if (!is_x86ish(t->arch())) { return; } if (t->session().done_initial_exec()) { return; } const Event& ev = t->current_trace_frame().event(); if (!is_geteuid32_syscall(ev.Syscall().number, t->arch()) && !is_geteuid_syscall(ev.Syscall().number, t->arch())) { return; } uint64_t trace_rcb_count = t->current_trace_frame().ticks(); uint64_t actual_rcb_count = t->tick_count(); if (trace_rcb_count_at_last_geteuid32 > 0 && !detected_cpuid_bug) { if (!rcb_counts_ok(t, trace_rcb_count_at_last_geteuid32, trace_rcb_count) || !rcb_counts_ok(t, actual_rcb_count_at_last_geteuid32, actual_rcb_count)) { detected_cpuid_bug = true; } } trace_rcb_count_at_last_geteuid32 = trace_rcb_count; actual_rcb_count_at_last_geteuid32 = actual_rcb_count; } } // namespace rr rr-5.5.0/src/CPUIDBugDetector.h000066400000000000000000000025231412202446200160770ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_CPUID_BUG_DETECTOR_H_ #define RR_CPUID_BUG_DETECTOR_H_ #include namespace rr { class ReplayTask; /** * Helper to detect when the "CPUID can cause rcbs to be lost" bug is present. * See http://robert.ocallahan.org/2014/09/vmware-cpuid-conditional-branch.html * * This bug is caused by VMM optimizations described in * https://www.usenix.org/system/files/conference/atc12/atc12-final158.pdf * that cause instruction sequences related to CPUID to be optimized, * eliminating the user-space execution of a conditional branch between two * CPUID instructions (in some circumstances). */ class CPUIDBugDetector { public: CPUIDBugDetector() : trace_rcb_count_at_last_geteuid32(0), actual_rcb_count_at_last_geteuid32(0), detected_cpuid_bug(false) {} /** * Call this in the context of the first spawned process to run the * code that triggers the bug. */ static void run_detection_code(); /** * Call this when task t enters a traced syscall during replay. 
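   *
   * Detection (see CPUIDBugDetector.cc): between consecutive geteuid
   * syscalls issued by run_detection_code(), both the recorded and the
   * replayed tick counts should equal 2 + ticks_for_direct_call(); any
   * deviation latches detected_cpuid_bug to true.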
*/ void notify_reached_syscall_during_replay(ReplayTask* t); private: uint64_t trace_rcb_count_at_last_geteuid32; uint64_t actual_rcb_count_at_last_geteuid32; bool detected_cpuid_bug; }; } // namespace rr #endif /* RR_CPUID_BUG_DETECTOR_H_ */ rr-5.5.0/src/Command.cc000066400000000000000000000115211412202446200146150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #define _BSD_SOURCE #include "Command.h" #include #include #include #include "TraceStream.h" #include "core.h" #include "main.h" using namespace std; namespace rr { bool ParsedOption::verify_valid_int(int64_t min, int64_t max) const { if (int_value < min || int_value > max) { fprintf( stderr, "Value %s for parameter %s was not valid (allowed range %lld-%lld)\n", value.c_str(), arg.c_str(), (long long)min, (long long)max); return false; } return true; } static vector* command_list; Command::Command(const char* name, const char* help) : name(name), help(help) { if (!command_list) { command_list = new vector(); } command_list->push_back(this); } Command* Command::command_for_name(const std::string& name) { for (auto& it : *command_list) { if (strcmp(it->name, name.c_str()) == 0) { return it; } } return nullptr; } bool Command::less_than_by_name(Command* c1, Command* c2) { return strcmp(c1->name, c2->name) < 0; } void Command::print_help_all(FILE* out) { vector cmds; for (auto& it : *command_list) { if (!it->help) { continue; } cmds.push_back(it); } sort(cmds.begin(), cmds.end(), less_than_by_name); for (auto& it : cmds) { const char* c = strchr(it->help, '\n'); if (c) { fprintf(out, "%.*s\n", (int)(c - it->help), it->help); } else { fputs(it->help, out); } } } void Command::print_help(FILE* out) { if (help) { fputs(help, out); print_global_options(out); } else { print_usage(out); } } static bool consume_args(std::vector& args, size_t count) { args.erase(args.begin(), args.begin() + count); return true; } static void assign_param(ParsedOption* opt, const char* s) { opt->value = s; opt->int_value = INT64_MIN; if (!opt->value.empty()) { char* end; int64_t v = strtoll(s, &end, 0); if (*end == 0) { opt->int_value = v; } } } bool Command::parse_option(std::vector& args, const OptionSpec* option_specs, size_t count, ParsedOption* out) { if (args.size() == 0 || args[0][0] != '-') { return false; } out->arg = args[0]; for (size_t i = 0; i < count; ++i) { if (args[0][1] == option_specs[i].short_name && args[0][1] >= 32) { out->short_name = option_specs[i].short_name; switch (option_specs[i].param) { case NO_PARAMETER: if (args[0][2] == 0) { return consume_args(args, 1); } return false; case HAS_PARAMETER: if (args[0][2] == '=') { assign_param(out, args[0].c_str() + 3); return consume_args(args, 1); } if (args[0][2] != 0) { assign_param(out, args[0].c_str() + 2); return consume_args(args, 1); } if (args.size() >= 2) { assign_param(out, args[1].c_str()); return consume_args(args, 2); } return false; default: DEBUG_ASSERT(0 && "Unknown parameter type"); } } else if (args[0][1] == '-') { size_t equals = args[0].find('='); if (strncmp(args[0].c_str() + 2, option_specs[i].long_name, (equals == string::npos ? 
9999 : equals) - 2) == 0) { out->short_name = option_specs[i].short_name; switch (option_specs[i].param) { case NO_PARAMETER: return consume_args(args, 1); case HAS_PARAMETER: if (equals == string::npos) { if (args.size() >= 2) { assign_param(out, args[1].c_str()); return consume_args(args, 2); } return false; } assign_param(out, args[0].c_str() + equals + 1); return consume_args(args, 1); default: DEBUG_ASSERT(0 && "Unknown parameter type"); } } } } return false; } bool Command::verify_not_option(std::vector& args) { if (args.size() > 0 && args[0][0] == '-') { if (args[0].length() == 2 && args[0][1] == '-') { args.erase(args.begin()); return true; } fprintf(stderr, "Invalid option %s\n", args[0].c_str()); return false; } return true; } bool Command::parse_optional_trace_dir(vector& args, string* out) { if (!verify_not_option(args)) { return false; } if (args.size() > 0) { *out = args[0]; args.erase(args.begin()); } else { *out = string(); } return true; } bool Command::parse_literal(std::vector& args, const char* lit) { if (args.size() > 0 && args[0] == lit) { args.erase(args.begin()); return true; } else { return false; } } } // namespace rr rr-5.5.0/src/Command.h000066400000000000000000000037401412202446200144630ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_COMMAND_H_ #define RR_COMMAND_H_ #ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE 1 #endif #include #include #include #include #include namespace rr { class TraceReader; enum OptionParameters { NO_PARAMETER, HAS_PARAMETER }; struct OptionSpec { char short_name; const char* long_name; OptionParameters param; }; struct ParsedOption { char short_name; std::string arg; std::string value; int64_t int_value; bool verify_valid_int(int64_t min = INT64_MIN + 1, int64_t max = INT64_MAX) const; }; /** * rr command-line commands. Objects of this class must be static, since * they are expected to be immortal. */ class Command { public: static Command* command_for_name(const std::string& name); static void print_help_all(FILE* out); /* Runs the command with the given parameters. Returns an exit code. 
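
     A hedged sketch of a new subcommand (the pattern mirrors
     BuildidCommand; the Command constructor self-registers the object in
     the global command list, so a static singleton suffices. "FooCommand"
     is hypothetical):

       class FooCommand : public Command {
       public:
         int run(std::vector<std::string>& args) override;
       protected:
         FooCommand(const char* name, const char* help)
             : Command(name, help) {}
         static FooCommand singleton;
       };

     plus a namespace-scope definition such as
       FooCommand FooCommand::singleton("foo", " rr foo\n");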
*/ virtual int run(std::vector& args) = 0; void print_help(FILE* out); static bool verify_not_option(std::vector& args); static bool parse_optional_trace_dir(std::vector& args, std::string* out); static bool parse_option(std::vector& args, const OptionSpec* option_specs, size_t count, ParsedOption* out); template static bool parse_option(std::vector& args, const OptionSpec (&option_specs)[N], ParsedOption* out) { return parse_option(args, option_specs, N, out); } static bool parse_literal(std::vector& args, const char* lit); protected: Command(const char* name, const char* help); virtual ~Command() {} static bool less_than_by_name(Command* c1, Command* c2); const char* name; const char* help; }; } // namespace rr #endif // RR_COMMAND_H_ rr-5.5.0/src/CompressedReader.cc000066400000000000000000000117331412202446200164730ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #define _LARGEFILE64_SOURCE #include "CompressedReader.h" #include #include #include #include #include #include #include #include "CompressedWriter.h" #include "core.h" #include "util.h" using namespace std; namespace rr { CompressedReader::CompressedReader(const string& filename) : fd(new ScopedFd(filename.c_str(), O_CLOEXEC | O_RDONLY | O_LARGEFILE)) { fd_offset = 0; error = !fd->is_open(); if (error) { eof = false; } else { char ch; eof = pread(*fd, &ch, 1, fd_offset) == 0; } buffer_read_pos = 0; have_saved_state = false; } CompressedReader::CompressedReader(const CompressedReader& other) { fd = other.fd; fd_offset = other.fd_offset; error = other.error; eof = other.eof; buffer_read_pos = other.buffer_read_pos; buffer = other.buffer; have_saved_state = false; DEBUG_ASSERT(!other.have_saved_state); } CompressedReader::~CompressedReader() { close(); } static bool read_all(const ScopedFd& fd, size_t size, void* data, uint64_t* offset) { ssize_t ret = read_to_end(fd, *offset, data, size); if (ret == (ssize_t)size) { *offset += size; return true; } return false; } static bool do_decompress(std::vector& compressed, std::vector& uncompressed) { size_t out_size = uncompressed.size(); return BrotliDecoderDecompress(compressed.size(), compressed.data(), &out_size, uncompressed.data()) == BROTLI_DECODER_RESULT_SUCCESS && out_size == uncompressed.size(); } bool CompressedReader::get_buffer(const uint8_t** data, size_t* size) { if (error) { return false; } if (buffer_read_pos >= buffer.size() && !eof) { if (!refill_buffer()) { return false; } DEBUG_ASSERT(buffer_read_pos < buffer.size()); } *data = &buffer[buffer_read_pos]; *size = buffer.size() - buffer_read_pos; return true; } bool CompressedReader::skip(size_t size) { while (size > 0) { if (error) { return false; } if (buffer_read_pos < buffer.size()) { size_t amount = std::min(size, buffer.size() - buffer_read_pos); size -= amount; buffer_read_pos += amount; continue; } if (!refill_buffer()) { return false; } } return true; } bool CompressedReader::read(void* data, size_t size) { while (size > 0) { if (error) { return false; } if (buffer_read_pos < buffer.size()) { size_t amount = std::min(size, buffer.size() - buffer_read_pos); memcpy(data, &buffer[buffer_read_pos], amount); size -= amount; data = static_cast(data) + amount; buffer_read_pos += amount; continue; } if (!refill_buffer()) { return false; } } return true; } bool CompressedReader::refill_buffer() { if (have_saved_state && !have_saved_buffer) { std::swap(buffer, saved_buffer); have_saved_buffer = true; } CompressedWriter::BlockHeader header; if (!read_all(*fd, 
sizeof(header), &header, &fd_offset)) { error = true; return false; } std::vector compressed_buf; compressed_buf.resize(header.compressed_length); if (!read_all(*fd, compressed_buf.size(), &compressed_buf[0], &fd_offset)) { error = true; return false; } char ch; if (pread(*fd, &ch, 1, fd_offset) == 0) { eof = true; } buffer.resize(header.uncompressed_length); buffer_read_pos = 0; if (!do_decompress(compressed_buf, buffer)) { error = true; return false; } return true; } void CompressedReader::rewind() { DEBUG_ASSERT(!have_saved_state); fd_offset = 0; buffer_read_pos = 0; buffer.clear(); eof = false; } void CompressedReader::close() { fd = nullptr; } void CompressedReader::save_state() { DEBUG_ASSERT(!have_saved_state); have_saved_state = true; have_saved_buffer = false; saved_fd_offset = fd_offset; saved_buffer_read_pos = buffer_read_pos; } void CompressedReader::restore_state() { DEBUG_ASSERT(have_saved_state); have_saved_state = false; if (saved_fd_offset < fd_offset) { eof = false; } fd_offset = saved_fd_offset; if (have_saved_buffer) { std::swap(buffer, saved_buffer); saved_buffer.clear(); } buffer_read_pos = saved_buffer_read_pos; } void CompressedReader::discard_state() { DEBUG_ASSERT(have_saved_state); have_saved_state = false; if (have_saved_buffer) { saved_buffer.clear(); } } uint64_t CompressedReader::uncompressed_bytes() const { uint64_t offset = 0; uint64_t uncompressed_bytes = 0; CompressedWriter::BlockHeader header; while (read_all(*fd, sizeof(header), &header, &offset)) { uncompressed_bytes += header.uncompressed_length; offset += header.compressed_length; } return uncompressed_bytes; } uint64_t CompressedReader::compressed_bytes() const { return lseek(*fd, 0, SEEK_END); } } // namespace rr rr-5.5.0/src/CompressedReader.h000066400000000000000000000052151412202446200163330ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_COMPRESSED_READER_H_ #define RR_COMPRESSED_READER_H_ #include #include #include #include #include #include "ScopedFd.h" namespace rr { /** * CompressedReader opens an input file written by CompressedWriter * and reads data from it. Currently data is decompressed by the thread that * calls read(). */ class CompressedReader { public: CompressedReader(const std::string& filename); CompressedReader(const CompressedReader& aOther); ~CompressedReader(); bool good() const { return !error; } bool at_end() const { return eof && buffer_read_pos == buffer.size(); } // Returns true if successful. Otherwise there's an error and good() // will be false. bool read(void* data, size_t size); // Returns pointer/size of some buffered data. Does not change the state. // Returns zero size if at EOF. bool get_buffer(const uint8_t** data, size_t* size); // Advances the read position by the given size. bool skip(size_t size); void rewind(); void close(); /** * Save the current position. Nested saves are not allowed. */ void save_state(); /** * Restore previously saved position. */ void restore_state(); /** * Discard saved position */ void discard_state(); /** * Gathers stats on the file stream. These are independent of what's * actually been read. 
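 *
 * e.g. the stream's overall compression ratio is
 *   (double)compressed_bytes() / uncompressed_bytes();
 * both inspect the on-disk block headers (or seek to the end), so they
 * are independent of the current read position.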
*/ uint64_t uncompressed_bytes() const; uint64_t compressed_bytes() const; template CompressedReader& operator>>(T& value) { read(&value, sizeof(value)); return *this; } CompressedReader& operator>>(std::string& value) { value.empty(); while (true) { char ch; read(&ch, 1); if (ch == 0) { break; } value.append(1, ch); } return *this; } template CompressedReader& operator>>(std::vector& value) { size_t len; *this >> len; value.resize(0); for (size_t i = 0; i < len; ++i) { T v; *this >> v; value.push_back(v); } return *this; } protected: bool refill_buffer(); /* Our fd might be the dup of another fd, so we can't rely on its current file position. Instead track the current position in fd_offset and use pread. */ uint64_t fd_offset; std::shared_ptr fd; bool error; bool eof; std::vector buffer; size_t buffer_read_pos; bool have_saved_state; bool have_saved_buffer; uint64_t saved_fd_offset; std::vector saved_buffer; size_t saved_buffer_read_pos; }; } // namespace rr #endif /* RR_COMPRESSED_READER_H_ */ rr-5.5.0/src/CompressedWriter.cc000066400000000000000000000177051412202446200165520ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #define _LARGEFILE64_SOURCE #include "CompressedWriter.h" #include #include #include #include #include #include #include #include "core.h" #include "util.h" using namespace std; namespace rr { /* See * http://robert.ocallahan.org/2017/07/selecting-compression-algorithm-for-rr.html */ static const int BROTLI_LEVEL = 5; void* CompressedWriter::compression_thread_callback(void* p) { static_cast(p)->compression_thread(); return nullptr; } CompressedWriter::CompressedWriter(const string& filename, size_t block_size, uint32_t num_threads) : fd(filename.c_str(), O_CLOEXEC | O_WRONLY | O_CREAT | O_EXCL | O_LARGEFILE, 0400) { this->block_size = block_size; threads.resize(num_threads); thread_pos.resize(num_threads); buffer.resize(block_size * (num_threads + 2)); pthread_mutex_init(&mutex, nullptr); pthread_cond_init(&cond, nullptr); for (uint32_t i = 0; i < num_threads; ++i) { thread_pos[i] = UINT64_MAX; } next_thread_pos = 0; next_thread_end_pos = 0; closing = false; write_error = false; producer_reserved_pos = 0; producer_reserved_write_pos = 0; producer_reserved_upto_pos = 0; error = false; if (fd < 0) { error = true; return; } // Hold the lock so threads don't inspect the 'threads' array // until we've finished initializing it. pthread_mutex_lock(&mutex); for (uint32_t i = 0; i < num_threads; ++i) { while (true) { int err = pthread_create(&threads[i], nullptr, compression_thread_callback, this); if (err == EAGAIN) { sched_yield(); // Give other processes a chance to exit. continue; } else if (err != 0) { SAFE_FATAL(err, "Failed to create compression threads!"); } break; } size_t last_slash = filename.rfind('/'); string thread_name = string("compress ") + (last_slash == string::npos ? 
filename : filename.substr(last_slash + 1)); pthread_setname_np(threads[i], thread_name.substr(0, 15).c_str()); } pthread_mutex_unlock(&mutex); } CompressedWriter::~CompressedWriter() { close(); pthread_mutex_destroy(&mutex); pthread_cond_destroy(&cond); } void CompressedWriter::write(const void* data, size_t size) { while (!error && size > 0) { uint64_t reservation_size = producer_reserved_upto_pos - producer_reserved_write_pos; if (reservation_size == 0) { update_reservation(WAIT); continue; } size_t buf_offset = (size_t)(producer_reserved_write_pos % buffer.size()); size_t amount = min(buffer.size() - buf_offset, (size_t)min(reservation_size, size)); memcpy(&buffer[buf_offset], data, amount); producer_reserved_write_pos += amount; data = static_cast(data) + amount; size -= amount; } if (!error && producer_reserved_write_pos - producer_reserved_pos >= buffer.size() / 2) { update_reservation(NOWAIT); } } void CompressedWriter::update_reservation(WaitFlag wait_flag) { pthread_mutex_lock(&mutex); next_thread_end_pos = producer_reserved_write_pos; producer_reserved_pos = producer_reserved_write_pos; // Wake up threads that might be waiting to consume data. pthread_cond_broadcast(&cond); while (!error) { if (write_error) { error = true; break; } uint64_t completed_pos = next_thread_pos; for (uint32_t i = 0; i < thread_pos.size(); ++i) { completed_pos = min(completed_pos, thread_pos[i]); } producer_reserved_upto_pos = completed_pos + buffer.size(); if (producer_reserved_pos < producer_reserved_upto_pos || wait_flag == NOWAIT) { break; } pthread_cond_wait(&cond, &mutex); } pthread_mutex_unlock(&mutex); } void CompressedWriter::compression_thread() { pthread_mutex_lock(&mutex); int thread_index; pthread_t self = pthread_self(); for (thread_index = 0; threads[thread_index] != self; ++thread_index) { } // Add slop for incompressible data vector outputbuf; outputbuf.resize((size_t)(block_size * 1.1) + sizeof(BlockHeader)); BlockHeader* header = reinterpret_cast(&outputbuf[0]); while (true) { if (!write_error && next_thread_pos < next_thread_end_pos && (closing || next_thread_pos + block_size <= next_thread_end_pos)) { thread_pos[thread_index] = next_thread_pos; next_thread_pos = min(next_thread_end_pos, next_thread_pos + block_size); // header->uncompressed_length must be <= block_size, // therefore fits in a size_t. header->uncompressed_length = (size_t)(next_thread_pos - thread_pos[thread_index]); pthread_mutex_unlock(&mutex); header->compressed_length = do_compress(thread_pos[thread_index], header->uncompressed_length, &outputbuf[sizeof(BlockHeader)], outputbuf.size() - sizeof(BlockHeader)); pthread_mutex_lock(&mutex); if (header->compressed_length == 0) { write_error = true; } // wait until we're the next thread that needs to write while (!write_error) { bool other_thread_write_first = false; for (uint32_t i = 0; i < thread_pos.size(); ++i) { if (thread_pos[i] < thread_pos[thread_index]) { other_thread_write_first = true; } } if (!other_thread_write_first) { break; } pthread_cond_wait(&cond, &mutex); } if (!write_error) { pthread_mutex_unlock(&mutex); write_all(fd, &outputbuf[0], sizeof(BlockHeader) + header->compressed_length); pthread_mutex_lock(&mutex); } thread_pos[thread_index] = UINT64_MAX; // do a broadcast because we might need to unblock // the producer thread or a compressor thread waiting // for us to write. 
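      // (Both kinds of waiters sleep on this one condition variable: the
      // producer blocks in update_reservation() until buffer space frees
      // up, and peer compressor threads block above until it is their turn
      // to write. A plain pthread_cond_signal() could wake the wrong
      // waiter, so a broadcast is required.)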
pthread_cond_broadcast(&cond); continue; } if (closing && (write_error || next_thread_pos == next_thread_end_pos)) { break; } pthread_cond_wait(&cond, &mutex); } pthread_mutex_unlock(&mutex); } void CompressedWriter::close(Sync sync) { if (!fd.is_open()) { return; } update_reservation(NOWAIT); pthread_mutex_lock(&mutex); closing = true; pthread_cond_broadcast(&cond); pthread_mutex_unlock(&mutex); for (auto i = threads.begin(); i != threads.end(); ++i) { pthread_join(*i, nullptr); } if (sync == SYNC) { if (fsync(fd) < 0) { error = true; } } if (write_error) { error = true; } fd.close(); } size_t CompressedWriter::do_compress(uint64_t offset, size_t length, uint8_t* outputbuf, size_t outputbuf_len) { BrotliEncoderState* state = BrotliEncoderCreateInstance(NULL, NULL, NULL); if (!state) { DEBUG_ASSERT(0 && "BrotliEncoderCreateInstance failed"); } if (!BrotliEncoderSetParameter(state, BROTLI_PARAM_QUALITY, BROTLI_LEVEL)) { DEBUG_ASSERT(0 && "Brotli initialization failed"); } size_t ret = 0; while (length > 0) { size_t buf_offset = (size_t)(offset % buffer.size()); size_t amount = min(length, buffer.size() - buf_offset); const uint8_t* in = &buffer[buf_offset]; if (!BrotliEncoderCompressStream(state, BROTLI_OPERATION_PROCESS, &amount, &in, &outputbuf_len, &outputbuf, &ret)) { DEBUG_ASSERT(0 && "Brotli compression failed"); } size_t consumed = in - &buffer[buf_offset]; offset += consumed; length -= consumed; } size_t zero = 0; if (!BrotliEncoderCompressStream(state, BROTLI_OPERATION_FINISH, &zero, NULL, &outputbuf_len, &outputbuf, &ret)) { DEBUG_ASSERT(0 && "Brotli compression failed"); } BrotliEncoderDestroyInstance(state); return ret; } } // namespace rr rr-5.5.0/src/CompressedWriter.h000066400000000000000000000062431412202446200164070ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_COMPRESSED_WRITER_H_ #define RR_COMPRESSED_WRITER_H_ #include #include #include #include #include #include "ScopedFd.h" namespace rr { /** * CompressedWriter opens an output file and writes compressed blocks to it. * Blocks of a fixed but unspecified size (currently 1MB) are compressed. * Each block of compressed data is written to the file preceded by two * 32-bit words: the size of the compressed data (excluding block header) * and the size of the uncompressed data, in that order. See BlockHeader below. * * We use multiple threads to perform compression. The threads are * responsible for the actual data writes. The thread that creates the * CompressedWriter is the "producer" thread and must also be the caller of * 'write'. The producer thread may block in 'write' if 'buffer_size' bytes are * being compressed. * * Each data block is compressed independently using zlib. */ class CompressedWriter { public: CompressedWriter(const std::string& filename, size_t buffer_size, uint32_t num_threads); ~CompressedWriter(); // Call only on producer thread bool good() const { return !error; } // Call only on producer thread. 
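  // A minimal usage sketch (the file name, block size and thread count
  // below are illustrative only; note the output file must not already
  // exist, since the fd is opened with O_EXCL):
  //
  //   CompressedWriter writer("out.bin", 1024 * 1024, 4);
  //   writer.write(data, size);              // may block while compressing
  //   writer.close(CompressedWriter::SYNC);  // joins the worker threads
  //   if (!writer.good()) { /* a write or compression error occurred */ }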
void write(const void* data, size_t size); enum Sync { DONT_SYNC, SYNC }; // Call only on producer thread void close(Sync sync = DONT_SYNC); struct BlockHeader { uint32_t compressed_length; uint32_t uncompressed_length; }; template CompressedWriter& operator<<(const T& value) { write(&value, sizeof(value)); return *this; } CompressedWriter& operator<<(const std::string& value) { write(value.c_str(), value.size() + 1); return *this; } template CompressedWriter& operator<<(const std::vector& value) { *this << value.size(); for (auto& i : value) { *this << i; } return *this; } protected: enum WaitFlag { WAIT, NOWAIT }; void update_reservation(WaitFlag wait_flag); static void* compression_thread_callback(void* p); void compression_thread(); size_t do_compress(uint64_t offset, size_t length, uint8_t* outputbuf, size_t outputbuf_len); // Immutable while threads are running ScopedFd fd; int block_size; pthread_mutex_t mutex; pthread_cond_t cond; std::vector threads; // Carefully shared... std::vector buffer; // BEGIN protected by 'mutex' /* position in output stream that this thread is currently working on, * or UINT64_MAX if it's idle */ std::vector thread_pos; /* position in output stream of data to dispatch to next thread */ uint64_t next_thread_pos; /* position in output stream of end of data ready to dispatch */ uint64_t next_thread_end_pos; bool closing; bool write_error; // END protected by 'mutex' /* producer thread only */ /* Areas in the buffer that have been reserved for write() */ uint64_t producer_reserved_pos; uint64_t producer_reserved_write_pos; uint64_t producer_reserved_upto_pos; bool error; }; } // namespace rr #endif /* RR_COMPRESSED_WRITER_H_ */ rr-5.5.0/src/DiversionSession.cc000066400000000000000000000153761412202446200165610ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "DiversionSession.h" #include "AutoRemoteSyscalls.h" #include "ReplaySession.h" #include "core.h" #include "kernel_metadata.h" #include "log.h" using namespace std; namespace rr { DiversionSession::DiversionSession() : emu_fs(EmuFs::create()) {} DiversionSession::~DiversionSession() { // We won't permanently leak any OS resources by not ensuring // we've cleaned up here, but sessions can be created and // destroyed many times, and we don't want to temporarily hog // resources. kill_all_tasks(); DEBUG_ASSERT(tasks().size() == 0 && vms().size() == 0); DEBUG_ASSERT(emu_fs->size() == 0); } static void finish_emulated_syscall_with_ret(Task* t, long ret) { t->finish_emulated_syscall(); Registers r = t->regs(); r.set_syscall_result(ret); t->set_regs(r); } /** * Execute the syscall contained in |t|'s current register set. The * return value of the syscall is set for |t|'s registers, to be * returned to the tracee task. */ static void execute_syscall(Task* t) { t->finish_emulated_syscall(); AutoRemoteSyscalls remote(t); remote.syscall(remote.regs().original_syscallno(), remote.regs().arg1(), remote.regs().arg2(), remote.regs().arg3(), remote.regs().arg4(), remote.regs().arg5(), remote.regs().arg6()); remote.regs().set_syscall_result(t->regs().syscall_result()); } template static void process_syscall_arch(Task* t, int syscallno) { LOG(debug) << "Processing " << syscall_name(syscallno, Arch::arch()); if (syscallno == Arch::ioctl && t->is_desched_event_syscall()) { // The arm/disarm-desched ioctls are emulated as no-ops. 
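    // (A diversion records nothing, so desched notifications would be
    // useless here; all that matters is that the tracee sees the ioctl
    // succeed.)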
// However, because the rr preload library expects these // syscalls to succeed and aborts if they don't, we fudge a // "0" return value. finish_emulated_syscall_with_ret(t, 0); return; } switch (syscallno) { // We blacklist these syscalls because the params include // namespaced identifiers that are different in replay than // recording, and during replay they may refer to different, // live resources. For example, if a recorded tracees kills // one of its threads, then during replay that killed pid // might refer to a live process outside the tracee tree. We // don't want diversion tracees randomly shooting down other // processes! // // We optimistically assume that filesystem operations were // intended by the user. // // There's a potential problem with "fd confusion": in the // diversion tasks, fds returned from open() during replay are // emulated. But those fds may accidentally refer to live fds // in the task fd table. So write()s etc may not be writing // to the file the tracee expects. However, the only real fds // that leak into tracees are the stdio fds, and there's not // much harm that can be caused by accidental writes to them. case Arch::ipc: case Arch::kill: case Arch::rt_sigqueueinfo: case Arch::rt_tgsigqueueinfo: case Arch::tgkill: case Arch::tkill: { LOG(debug) << "Suppressing syscall " << syscall_name(syscallno, t->arch()); Registers r = t->regs(); r.set_syscall_result(-ENOSYS); t->set_regs(r); return; } case Arch::gettid: { auto tid = t->own_namespace_tid(); LOG(debug) << "Emulating gettid with " << tid; Registers r = t->regs(); r.set_syscall_result(tid); t->set_regs(r); return; } case Arch::getpid: { auto pid = t->thread_group()->tgid_own_namespace; LOG(debug) << "Emulating getpid with " << pid; Registers r = t->regs(); r.set_syscall_result(pid); t->set_regs(r); return; } } LOG(debug) << "Executing syscall " << syscall_name(syscallno, t->arch()); return execute_syscall(t); } static void process_syscall(Task* t, int syscallno){ RR_ARCH_FUNCTION(process_syscall_arch, t->arch(), t, syscallno) } static void handle_ptrace_exit_event(Task *t) { t->did_kill(); t->detach(); delete t; } /** * Advance execution until either a signal is received (including a SIGTRAP * generated by a single-step) or a syscall is made. */ DiversionSession::DiversionResult DiversionSession::diversion_step( Task* t, RunCommand command, int signal_to_deliver) { DEBUG_ASSERT(command != RUN_SINGLESTEP_FAST_FORWARD); assert_fully_initialized(); DiversionResult result; // An exit might have occurred while processing a previous syscall. if (t->ptrace_event() == PTRACE_EVENT_EXIT) { // We're about to destroy the task, so capture the context while // we can. TaskContext context(t); handle_ptrace_exit_event(t); // This is now a dangling pointer, so clear it. 
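    // (handle_ptrace_exit_event() above deleted the Task, so
    // 'context.task' now points at freed memory; the other fields
    // captured in 'context' are still meaningful to the caller.)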
context.task = nullptr; result.status = DIVERSION_EXITED; result.break_status.task_context = context; result.break_status.task_exit = true; return result; } t->set_in_diversion(true); switch (command) { case RUN_CONTINUE: LOG(debug) << "Continuing to next syscall"; t->resume_execution(RESUME_SYSEMU, RESUME_WAIT, RESUME_UNLIMITED_TICKS, signal_to_deliver); break; case RUN_SINGLESTEP: LOG(debug) << "Stepping to next insn/syscall"; t->resume_execution(RESUME_SYSEMU_SINGLESTEP, RESUME_WAIT, RESUME_UNLIMITED_TICKS, signal_to_deliver); break; default: FATAL() << "Illegal run command " << command; } if (t->ptrace_event() == PTRACE_EVENT_EXIT) { handle_ptrace_exit_event(t); result.status = DIVERSION_EXITED; return result; } result.status = DIVERSION_CONTINUE; if (t->stop_sig()) { LOG(debug) << "Pending signal: " << t->get_siginfo(); result.break_status = diagnose_debugger_trap(t, command); if (!result.break_status.breakpoint_hit && result.break_status.watchpoints_hit.empty() && !result.break_status.singlestep_complete && (t->stop_sig() == SIGTRAP)) { result.break_status.signal = unique_ptr(new siginfo_t(t->get_siginfo())); result.break_status.signal->si_signo = t->stop_sig(); } LOG(debug) << "Diversion break at ip=" << (void*)t->ip().register_value() << "; break=" << result.break_status.breakpoint_hit << ", watch=" << !result.break_status.watchpoints_hit.empty() << ", singlestep=" << result.break_status.singlestep_complete; ASSERT(t, !result.break_status.singlestep_complete || command == RUN_SINGLESTEP); return result; } if (t->status().is_syscall()) { t->apply_syscall_entry_regs(); } process_syscall(t, t->regs().original_syscallno()); check_for_watchpoint_changes(t, result.break_status); return result; } } // namespace rr rr-5.5.0/src/DiversionSession.h000066400000000000000000000041311412202446200164060ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_DIVERSION_SESSION_H_ #define RR_DIVERSION_SESSION_H_ #include "EmuFs.h" #include "Session.h" namespace rr { class ReplaySession; /** * A DiversionSession lets you run task(s) forward without replay. * Clone a ReplaySession to a DiversionSession to execute some arbitrary * code for its side effects. * * Diversion allows tracees to execute freely, as in "recorder" * mode, but doesn't attempt to record any data. Diverter * emulates the syscalls it's able to (such as writes to stdio fds), * and essentially ignores the syscalls it doesn't know how to * implement. Tracees can easily get into inconsistent states within * diversion mode, and no attempt is made to detect or rectify that. * * Diverter mode is designed to support short-lived diversions from * "replayer" sessions, as required to support gdb's |call foo()| * feature. A diversion is created for the call frame, then discarded * when the call finishes (loosely speaking). */ class DiversionSession : public Session { public: DiversionSession(); typedef std::shared_ptr shr_ptr; ~DiversionSession(); EmuFs& emufs() const { return *emu_fs; } enum DiversionStatus { // Some execution was done. diversion_step() can be called again. DIVERSION_CONTINUE, // All tracees are dead. diversion_step() should not be called again. DIVERSION_EXITED }; struct DiversionResult { DiversionStatus status; BreakStatus break_status; }; /** * Try make progress in this diversion session. Run task t if possible. 
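   *
   * A minimal driver sketch (assuming 'session' and 't' were obtained by
   * cloning a ReplaySession into this diversion):
   *
   *   DiversionSession::DiversionResult result;
   *   do {
   *     result = session->diversion_step(t, RUN_CONTINUE);
   *   } while (result.status == DiversionSession::DIVERSION_CONTINUE &&
   *            !result.break_status.signal);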
*/ DiversionResult diversion_step(Task* t, RunCommand command = RUN_CONTINUE, int signal_to_deliver = 0); virtual DiversionSession* as_diversion() override { return this; } void set_tracee_fd_number(int fd_number) { tracee_socket_fd_number = fd_number; } void on_create(Task *t) override { this->Session::on_create(t); } private: friend class ReplaySession; std::shared_ptr emu_fs; }; } // namespace rr #endif // RR_DIVERSION_SESSION_H_ rr-5.5.0/src/DumpCommand.cc000066400000000000000000000312441412202446200154470ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "DumpCommand.h" #include #include #include #include #include #include #include "preload/preload_interface.h" #include "AddressSpace.h" #include "Command.h" #include "TraceStream.h" #include "core.h" #include "kernel_metadata.h" #include "log.h" #include "main.h" #include "util.h" using namespace std; namespace rr { class DumpCommand : public Command { public: virtual int run(vector& args) override; protected: DumpCommand(const char* name, const char* help) : Command(name, help) {} static DumpCommand singleton; }; DumpCommand DumpCommand::singleton( "dump", " rr dump [OPTIONS] [] [...]\n" " Event specs can be either an event number like `127', or a range\n" " like `1000-5000', or `end' for the last record in the trace.\n" " By default, all events are dumped.\n" " -b, --syscallbuf dump syscallbuf contents\n" " -e, --task-events dump task events\n" " -m, --recorded-metadata dump recorded data metadata\n" " -p, --mmaps dump mmap data\n" " -r, --raw dump trace frames in a more easily\n" " machine-parseable format instead of the\n" " default human-readable format\n" " -s, --statistics dump statistics about the trace\n" " -t, --tid= dump events only for the specified tid\n"); static bool parse_dump_arg(vector& args, DumpFlags& flags) { if (parse_global_option(args)) { return true; } static const OptionSpec options[] = { { 0, "socket-addresses", NO_PARAMETER }, { 'b', "syscallbuf", NO_PARAMETER }, { 'e', "task-events", NO_PARAMETER }, { 'm', "recorded-metadata", NO_PARAMETER }, { 'p', "mmaps", NO_PARAMETER }, { 'r', "raw", NO_PARAMETER }, { 's', "statistics", NO_PARAMETER }, { 't', "tid", HAS_PARAMETER }, }; ParsedOption opt; if (!Command::parse_option(args, options, &opt)) { return false; } switch (opt.short_name) { case 'b': flags.dump_syscallbuf = true; break; case 'e': flags.dump_task_events = true; break; case 'm': flags.dump_recorded_data_metadata = true; break; case 'p': flags.dump_mmaps = true; break; case 'r': flags.raw_dump = true; break; case 's': flags.dump_statistics = true; break; case 't': if (!opt.verify_valid_int(1, INT32_MAX)) { return false; } flags.only_tid = opt.int_value; break; case 0: flags.dump_socket_addrs = true; break; default: DEBUG_ASSERT(0 && "Unknown option"); } return true; } static void dump_syscallbuf_data(TraceReader& trace, FILE* out, const TraceFrame& frame) { if (frame.event().type() != EV_SYSCALLBUF_FLUSH) { return; } auto buf = trace.read_raw_data(); size_t bytes_remaining = buf.data.size() - sizeof(struct syscallbuf_hdr); auto flush_hdr = reinterpret_cast(buf.data.data()); if (flush_hdr->num_rec_bytes > bytes_remaining) { fprintf(stderr, "Malformed trace file (bad recorded-bytes count)\n"); notifying_abort(); } bytes_remaining = flush_hdr->num_rec_bytes; auto record_ptr = reinterpret_cast(flush_hdr + 1); auto end_ptr = record_ptr + bytes_remaining; while (record_ptr < end_ptr) { auto record = reinterpret_cast(record_ptr); // Buffered 
syscalls always use the task arch fprintf(out, " { syscall:'%s', ret:0x%lx, size:0x%lx%s%s }\n", syscall_name(record->syscallno, frame.regs().arch()).c_str(), (long)record->ret, (long)record->size, record->desched ? ", desched:1" : "", record->replay_assist ? ", replay_assist:1" : ""); if (record->size < sizeof(*record)) { fprintf(stderr, "Malformed trace file (bad record size)\n"); notifying_abort(); } record_ptr += stored_record_size(record->size); } } static void print_socket_addr(FILE* out, const struct NativeArch::sockaddr_storage& sa) { char buf[256]; auto sockaddr = reinterpret_cast(&sa); switch (sockaddr->ss_family) { case AF_INET: { auto sockaddr_in = reinterpret_cast(sockaddr); if (inet_ntop(AF_INET, &sockaddr_in->sin_addr, buf, sizeof(buf) - 1)) { fprintf(out, "%s:%d", buf, sockaddr_in->sin_port); } else { FATAL(); } break; } case AF_INET6: { auto sockaddr_in6 = reinterpret_cast(sockaddr); if (inet_ntop(AF_INET6, &sockaddr_in6->sin6_addr, buf, sizeof(buf) - 1)) { fprintf(out, "%s:%d", buf, sockaddr_in6->sin6_port); } else { FATAL(); } break; } default: fputs("", out); break; } } static void dump_socket_addrs(FILE* out, const TraceFrame& frame) { if (frame.event().type() != EV_SYSCALL) { return; } auto syscall = frame.event().Syscall(); if (syscall.socket_addrs) { fputs(" Local socket address '", out); print_socket_addr(out, (*syscall.socket_addrs.get())[0]); fputs("' Remote socket address '", out); print_socket_addr(out, (*syscall.socket_addrs.get())[1]); fputs("'\n", out); } } static void dump_task_event(FILE* out, const TraceTaskEvent& event) { switch (event.type()) { case TraceTaskEvent::CLONE: fprintf(out, " TraceTaskEvent::CLONE tid=%d parent=%d clone_flags=0x%x\n", event.tid(), event.parent_tid(), event.clone_flags()); break; case TraceTaskEvent::EXEC: fprintf(out, " TraceTaskEvent::EXEC tid=%d file=%s\n", event.tid(), event.file_name().c_str()); break; case TraceTaskEvent::EXIT: fprintf(out, " TraceTaskEvent::EXIT tid=%d status=%d\n", event.tid(), event.exit_status().get()); break; case TraceTaskEvent::DETACH: fprintf(out, " TraceTaskEvent::DETACH tid=%d\n", event.tid()); break; default: FATAL() << "Unknown TraceTaskEvent"; break; } } /** * Dump all events from the current to trace that match |spec| to * |out|. |spec| has the following syntax: /\d+(-\d+)?/, expressing * either a single event number of a range, and may be null to * indicate "dump all events". * * This function is side-effect-y, in that the trace file isn't * rewound in between matching each spec. Therefore specs should be * constructed so as to match properly on a serial linear scan; that * is, they should comprise disjoint and monotonically increasing * event sets. No attempt is made to enforce this or normalize specs. */ static void dump_events_matching(TraceReader& trace, const DumpFlags& flags, FILE* out, const string* spec, const unordered_map& task_events) { uint32_t start = 0, end = numeric_limits::max(); bool only_end = false; if (spec && *spec == "end") { only_end = true; } else { // Try to parse the "range" syntax '[start]-[end]'. if (spec && 2 > sscanf(spec->c_str(), "%u-%u", &start, &end)) { // Fall back on assuming the spec is a single event // number, however it parses out with atoi(). start = end = atoi(spec->c_str()); } } bool process_raw_data = flags.dump_syscallbuf || flags.dump_recorded_data_metadata; while (!trace.at_end()) { auto frame = trace.read_frame(); if (end < frame.time()) { return; } if (only_end ? 
trace.at_end() : (start <= frame.time() && frame.time() <= end && (!flags.only_tid || flags.only_tid == frame.tid()))) { if (flags.raw_dump) { frame.dump_raw(out); } else { frame.dump(out); } if (flags.dump_syscallbuf) { dump_syscallbuf_data(trace, out, frame); } if (flags.dump_task_events) { auto it = task_events.find(frame.time()); if (it != task_events.end()) { dump_task_event(out, it->second); } } while (true) { TraceReader::MappedData data; bool found; KernelMapping km = trace.read_mapped_region(&data, &found, TraceReader::DONT_VALIDATE); if (!found) { break; } if (flags.dump_mmaps) { char prot_flags[] = "rwxp"; if (!(km.prot() & PROT_READ)) { prot_flags[0] = '-'; } if (!(km.prot() & PROT_WRITE)) { prot_flags[1] = '-'; } if (!(km.prot() & PROT_EXEC)) { prot_flags[2] = '-'; } if (km.flags() & MAP_SHARED) { prot_flags[3] = 's'; } const char* fsname = km.fsname().c_str(); if (data.source == TraceReader::SOURCE_ZERO) { static const char source_zero[] = ""; fsname = source_zero; } fprintf(out, " { map_file:\"%s\", addr:%p, length:%p, " "prot_flags:\"%s\", file_offset:0x%llx, " "device:%lld, inode:%lld, " "data_file:\"%s\", data_offset:0x%llx, " "file_size:0x%llx }\n", fsname, (void*)km.start().as_int(), (void*)km.size(), prot_flags, (long long)km.file_offset_bytes(), (long long)km.device(), (long long)km.inode(), data.file_name.c_str(), (long long)data.data_offset_bytes, (long long)data.file_size_bytes); } } TraceReader::RawDataMetadata data; while (process_raw_data && trace.read_raw_data_metadata_for_frame(data)) { if (flags.dump_recorded_data_metadata) { fprintf(out, " { tid:%d, addr:%p, length:%p", data.rec_tid, (void*)data.addr.as_int(), (void*)data.size); if (!data.holes.empty()) { fputs(", holes:[", out); bool first = true; for (auto& h : data.holes) { if (!first) { fputs(", ", out); } fprintf(out, "%p-%p", (void*)h.offset, (void*)(h.offset + h.size)); } fputs("]", out); } fputs(" }\n", out); } } if (flags.dump_socket_addrs) { dump_socket_addrs(out, frame); } if (!flags.raw_dump) { fprintf(out, "}\n"); } } else { while (true) { TraceReader::MappedData data; KernelMapping km = trace.read_mapped_region(&data, nullptr, TraceReader::DONT_VALIDATE); if (km.size() == 0) { break; } } TraceReader::RawDataMetadata data; while (process_raw_data && trace.read_raw_data_metadata_for_frame(data)) { } } } } static void dump_statistics(const TraceReader& trace, FILE* out) { uint64_t uncompressed = trace.uncompressed_bytes(); uint64_t compressed = trace.compressed_bytes(); fprintf(out, "// Uncompressed bytes %" PRIu64 ", compressed bytes %" PRIu64 ", ratio %.2fx\n", uncompressed, compressed, double(uncompressed) / compressed); } void dump(const string& trace_dir, const DumpFlags& flags, const vector& specs, FILE* out) { TraceReader trace(trace_dir); if (flags.raw_dump) { fprintf(out, "global_time tid reason ticks " "hw_interrupts page_faults instructions " "eax ebx ecx edx esi edi ebp orig_eax esp eip eflags\n"); } unordered_map task_events; FrameTime last_time = 0; while (true) { FrameTime time; TraceTaskEvent r = trace.read_task_event(&time); if (time < last_time) { FATAL() << "TraceTaskEvent times non-monotonic"; } if (r.type() == TraceTaskEvent::NONE) { break; } task_events.insert(make_pair(time, r)); last_time = time; } if (specs.size() > 0) { for (size_t i = 0; i < specs.size(); ++i) { dump_events_matching(trace, flags, out, &specs[i], task_events); } } else { // No specs => dump all events. 
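    // (A null spec makes dump_events_matching() keep its defaults of
    // start = 0 and end = UINT32_MAX, i.e. it matches every frame.)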
dump_events_matching(trace, flags, out, nullptr /*all events*/, task_events); } if (flags.dump_statistics) { dump_statistics(trace, out); } } int DumpCommand::run(vector& args) { DumpFlags flags; while (parse_dump_arg(args, flags)) { } string trace_dir; if (!parse_optional_trace_dir(args, &trace_dir)) { print_help(stderr); return 1; } dump(trace_dir, flags, args, stdout); return 0; } } // namespace rr rr-5.5.0/src/DumpCommand.h000066400000000000000000000016471412202446200153150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_DUMP_COMMAND_H_ #define RR_DUMP_COMMAND_H_ #ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE 1 #endif #include #include #include #include namespace rr { struct DumpFlags { bool dump_syscallbuf; bool dump_recorded_data_metadata; bool dump_mmaps; bool dump_task_events; bool raw_dump; bool dump_statistics; bool dump_socket_addrs; int only_tid; DumpFlags() : dump_syscallbuf(false), dump_recorded_data_metadata(false), dump_mmaps(false), dump_task_events(false), raw_dump(false), dump_statistics(false), dump_socket_addrs(false), only_tid(0) {} }; void dump(const std::string& trace_dir, const DumpFlags& flags, const std::vector& specs, FILE* out); } // namespace rr #endif // RR_DUMP_COMMAND_H_ rr-5.5.0/src/Dwarf.cc000066400000000000000000000536171412202446200143160ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Dwarf.h" #include #include "log.h" using namespace std; namespace rr { struct Dwarf32 { typedef uint32_t Offset; static const uint8_t EntrySize = 4; struct CompilationUnitPreamble { uint32_t unit_length; }; }; struct Dwarf64 { typedef uint64_t Offset; static const uint8_t EntrySize = 8; struct __attribute__((packed)) CompilationUnitPreamble { uint32_t magic; /* 0xffffffff */ uint64_t unit_length; }; }; template struct __attribute__((packed)) Dwarf4CompilationUnitHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; typename D::Offset debug_abbrev_offset; uint8_t address_size; void install_dwo_id(DwarfCompilationUnit* unit) const { unit->set_dwo_id(0); } }; template struct __attribute__((packed)) Dwarf5CompilationUnitHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; uint8_t unit_type; uint8_t address_size; typename D::Offset debug_abbrev_offset; void install_dwo_id(DwarfCompilationUnit* unit) const { unit->set_dwo_id(0); } }; template struct __attribute__((packed)) Dwarf5SkeletonSplitCompilationUnitHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; uint8_t unit_type; uint8_t address_size; typename D::Offset debug_abbrev_offset; uint64_t dwo_id; void install_dwo_id(DwarfCompilationUnit* unit) const { unit->set_dwo_id(dwo_id); } }; template struct __attribute__((packed)) Dwarf2LineNumberTableHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; typename D::Offset header_length; uint8_t minimum_instruction_length; uint8_t default_is_stmt; int8_t line_base; uint8_t line_range; uint8_t opcode_base; bool read_directories(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, std::vector& directories, std::vector& files) const; }; template struct __attribute__((packed)) Dwarf4LineNumberTableHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; typename D::Offset header_length; uint8_t minimum_instruction_length; uint8_t 
maximum_operations_per_instruction; uint8_t default_is_stmt; int8_t line_base; uint8_t line_range; uint8_t opcode_base; bool read_directories(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, std::vector& directories, std::vector& files) const; }; template struct __attribute__((packed)) Dwarf5LineNumberTableHeader { typedef D Size; typename D::CompilationUnitPreamble preamble; uint16_t version; uint8_t address_size; uint8_t segment_selector_size; typename D::Offset header_length; uint8_t minimum_instruction_length; uint8_t maximum_operations_per_instruction; uint8_t default_is_stmt; int8_t line_base; uint8_t line_range; uint8_t opcode_base; bool read_directories(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, std::vector& directories, std::vector& files) const; }; uint64_t DwarfSpan::read_uleb(bool* ok) { uint64_t ret = 0; int shift = 0; while (start < end) { uint8_t b = *start; ++start; ret |= (b & 0x7f) << shift; if (!(b & 0x80)) { return ret; } shift += 7; if (shift >= 64) { *ok = false; return 0; } } *ok = false; return 0; } DwarfSpan DwarfSpan::read_leb_ref(bool* ok) { DwarfSpan ret(*this); while (start < end) { if (!(*start & 0x80)) { ++start; ret.end = start; return ret; } ++start; } *ok = false; return ret; } const char* DwarfSpan::read_null_terminated_string(bool* ok) { const void* p = memchr(start, 0, size()); if (!p) { LOG(warn) << "String was not null-terminated"; *ok = false; return nullptr; } const char* ret = reinterpret_cast(start); start = static_cast(p) + 1; return ret; } DwarfAbbrev* DwarfAbbrevSet::lookup(uint64_t code) { auto it = abbrevs.find(code); if (it != abbrevs.end()) { return it->second.get(); } while (!remaining_span.empty()) { bool ok = true; uint64_t abbrev_code = remaining_span.read_uleb(&ok); unique_ptr abbrev(new DwarfAbbrev); abbrev->tag = (DWTag)remaining_span.read_uleb(&ok); abbrev->children = (DWChildren)remaining_span.read_value(&ok); auto abbrev_raw = abbrev.get(); while (true) { uint64_t name = remaining_span.read_uleb(&ok); DWForm form = (DWForm)remaining_span.read_uleb(&ok); if (!name && !form) { break; } DwarfSpan constant; if (form == DW_FORM_implicit_const) { constant = remaining_span.read_leb_ref(&ok); } abbrev->attributes.push_back({ name, form, constant }); } if (!ok) { LOG(warn) << "Invalid DWARF abbrev table!"; return nullptr; } abbrevs.insert(make_pair(abbrev_code, move(abbrev))); if (code == abbrev_code) { return abbrev_raw; } } return nullptr; } DwarfAbbrevSet& DwarfAbbrevs::lookup(uint64_t offset) { auto it = abbrevs.find(offset); if (it != abbrevs.end()) { return *it->second; } unique_ptr set(new DwarfAbbrevSet(debug_abbrev.subspan(offset))); auto set_raw = set.get(); abbrevs.insert(make_pair(offset, move(set))); return *set_raw; } static DwarfAbbrev null_abbrev; DwarfDIE::DwarfDIE(DwarfSpan span, DwarfAbbrevSet& abbrevs, uint8_t dwarf_size, uint8_t address_size, bool* ok) : address_size(address_size), dwarf_size(dwarf_size) { uint64_t code = span.read_uleb(ok); if (!ok) { return; } if (code == 0) { abbrev = &null_abbrev; return; } abbrev = abbrevs.lookup(code); if (!abbrev) { LOG(warn) << "No abbrev found for DIE"; *ok = false; return; } attr_span = span; } static size_t form_size(DWForm form, size_t address_size, size_t dwarf_size, DwarfSpan* span, bool* ok) { if (form == DW_FORM_indirect) { form = (DWForm)span->read_uleb(ok); if (!ok) { return 0; } } if (form == DW_FORM_udata) { auto before = span->size(); DwarfSpan a_span(*span); a_span.read_uleb(ok); if (!ok) { return 
0; } return before - a_span.size(); } switch (form) { case DW_FORM_addr: return address_size; case DW_FORM_addrx: return dwarf_size; case DW_FORM_data1: return 1; case DW_FORM_data2: return 2; case DW_FORM_data4: return 4; case DW_FORM_data8: return 8; case DW_FORM_data16: return 16; case DW_FORM_flag: return 1; case DW_FORM_strp: return dwarf_size; case DW_FORM_line_strp: return dwarf_size; case DW_FORM_strx: return dwarf_size; case DW_FORM_strx1: return 1; case DW_FORM_strx2: return 2; case DW_FORM_strx3: return 3; case DW_FORM_strx4: return 4; case DW_FORM_string: { auto before = span->size(); DwarfSpan a_span(*span); a_span.read_null_terminated_string(ok); if (!ok) { return 0; } return before - a_span.size(); } case DW_FORM_sec_offset: return dwarf_size; case DW_FORM_flag_present: return 0; case DW_FORM_implicit_const: return 0; case DW_FORM_strp_sup: return dwarf_size; case DW_FORM_GNU_strp_alt: return dwarf_size; default: LOG(warn) << "form " << form << " not supported!"; *ok = false; return 0; } } DwarfSpan DwarfDIE::find_attribute(DWAttr attr, DWForm* form, bool* ok) const { DwarfSpan span = attr_span; for (auto& a : abbrev->attributes) { size_t size = form_size(a.form, address_size, dwarf_size, &span, ok); DwarfSpan a_span = span.consume(size); if (a.name == attr) { *form = a.form; if (a.form == DW_FORM_implicit_const) { a_span = a.constant; } return a_span; } } return DwarfSpan(); } static uint64_t decode_unsigned_literal(DwarfSpan span, bool* ok) { int shift = 0; uint64_t ret = 0; while (!span.empty()) { if (shift >= 64) { LOG(warn) << "Literal too large"; *ok = false; return 0; } ret |= (uint64_t)span.read_value(ok) << shift; shift += 8; } return ret; } static int64_t decode_section_ptr(DwarfSpan span, DWForm form, bool* ok) { switch (form) { case DW_FORM_data1: case DW_FORM_data2: case DW_FORM_data4: case DW_FORM_data8: case DW_FORM_sec_offset: { uint64_t ret = decode_unsigned_literal(span, ok); if (ret > INT64_MAX) { LOG(warn) << "section ptr out of range"; *ok = false; return 0; } return ret; } default: LOG(warn) << "Unknown section ptr form " << form; *ok = false; return 0; } } static uint64_t decode_unsigned(DwarfSpan span, DWForm form, bool* ok) { switch (form) { case DW_FORM_data1: case DW_FORM_data2: case DW_FORM_data4: case DW_FORM_data8: { return decode_unsigned_literal(span, ok); } case DW_FORM_udata: { return span.read_uleb(ok); } default: LOG(warn) << "Unknown unsigned form " << form; *ok = false; return 0; } } static const char* decode_string(const DwarfCompilationUnit& cu, DwarfSpan span, DWForm form, const DebugStrSpans& debug_strs, bool* ok) { switch (form) { case DW_FORM_strp: { uint64_t offset = decode_unsigned_literal(span, ok); if (!*ok) { return nullptr; } return debug_strs.debug_str.subspan(offset).read_null_terminated_string(ok); } case DW_FORM_strp_sup: case DW_FORM_GNU_strp_alt: { uint64_t offset = decode_unsigned_literal(span, ok); if (!*ok) { return nullptr; } return debug_strs.debug_str_sup.subspan(offset).read_null_terminated_string(ok); } case DW_FORM_line_strp: { uint64_t offset = decode_unsigned_literal(span, ok); if (!*ok) { return nullptr; } return debug_strs.debug_line_str.subspan(offset).read_null_terminated_string(ok); } case DW_FORM_strx: case DW_FORM_strx1: case DW_FORM_strx2: case DW_FORM_strx3: case DW_FORM_strx4: { uint64_t index = decode_unsigned_literal(span, ok) * cu.entry_size() + cu.str_offsets_base(); if (!*ok) { return nullptr; } uint64_t offset = cu.read_entry_sized_value(debug_strs.debug_str_offsets.subspan(index), ok); if 
(!*ok) { return nullptr; } return debug_strs.debug_str.subspan(offset).read_null_terminated_string(ok); } case DW_FORM_string: return span.read_null_terminated_string(ok); default: LOG(warn) << "Unknown string form " << form; *ok = false; return 0; } } int64_t DwarfDIE::section_ptr_attr(DWAttr attr, bool* ok) const { DWForm form; auto span = find_attribute(attr, &form, ok); if (span.empty() || !ok) { return -1; } return decode_section_ptr(span, form, ok); } uint64_t DwarfDIE::unsigned_attr(DWAttr attr, bool* found, bool* ok) const { DWForm form; auto span = find_attribute(attr, &form, ok); if (span.empty() || !ok) { *found = false; return 0; } *found = true; return decode_unsigned(span, form, ok); } const char* DwarfDIE::string_attr(const DwarfCompilationUnit& cu, DWAttr attr, const DebugStrSpans& debug_strs, bool* ok) const { DWForm form; auto span = find_attribute(attr, &form, ok); if (span.empty() || !ok) { return nullptr; } return decode_string(cu, span, form, debug_strs, ok); } DwarfCompilationUnit DwarfCompilationUnit::next(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok) { DwarfCompilationUnit ret; uint32_t word = DwarfSpan(*debug_info).read_value(ok); if (!ok) { return ret; } if (word == 0xFFFFFFFF) { ret.init_size(debug_info, abbrevs, ok); } else { ret.init_size(debug_info, abbrevs, ok); } return ret; } template void DwarfCompilationUnit::init_size(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok) { auto h = DwarfSpan(*debug_info).read>(ok); if (!ok) { return; } if (2 <= h->version && h->version <= 4) { init>(debug_info, abbrevs, ok); } else if (h->version == 5) { auto hh = DwarfSpan(*debug_info).read>(ok); if (!ok) { return; } if (hh->unit_type == DW_UT_skeleton || hh->unit_type == DW_UT_split_compile) { init>(debug_info, abbrevs, ok); } else { init>(debug_info, abbrevs, ok); } } else { LOG(warn) << "Unknown compilation unit version " << h->version; *ok = false; } } template void DwarfCompilationUnit::init(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok) { DwarfSpan span(*debug_info); auto h = span.read(ok); if (!*ok) { return; } uint64_t length = h->preamble.unit_length; if (length >= UINT64_MAX - 12) { LOG(warn) << "Invalid CU length"; *ok = false; return; } debug_info->consume(length + sizeof(h->preamble)); DwarfAbbrevSet& abbrev_set = abbrevs.lookup(h->debug_abbrev_offset); die_ = make_unique(span, abbrev_set, sizeof(typename H::Size::Offset), h->address_size, ok); if (!*ok) { return; } if (die_->tag() != DW_TAG_compile_unit && die_->tag() != DW_TAG_partial_unit && die_->tag() != DW_TAG_skeleton_unit) { LOG(warn) << "CU DIE is not DW_TAG_compilation_unit/DW_TAG_partial_unit/DW_TAG_skeleton_unit!"; *ok = false; return; } entry_size_ = H::Size::EntrySize; h->install_dwo_id(this); } uint64_t DwarfCompilationUnit::read_entry_sized_value(DwarfSpan span, bool* ok) const { if (entry_size() == 4) { return span.read_value(ok); } else if (entry_size() == 8) { return span.read_value(ok); } else { LOG(warn) << "Unknown entry size " << entry_size(); *ok = false; return 0; } } DwarfLineNumberTable::DwarfLineNumberTable(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, bool* ok) { uint32_t word = DwarfSpan(span).read_value(ok); if (!ok) { return; } if (word == 0xFFFFFFFF) { init_size(cu, span, debug_str, ok); } else { init_size(cu, span, debug_str, ok); } } template void DwarfLineNumberTable::init_size(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, bool* ok) { auto h = DwarfSpan(span).read>(ok); if (!ok) { 
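    // Note: 'ok' is a bool* here, so this condition tests the pointer
    // itself (which callers never pass as null) rather than the flag it
    // points to; '!*ok', as used elsewhere in this file, is presumably
    // what was intended.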
return; } if (2 <= h->version && h->version <= 3) { init>(cu, span, debug_str, ok); } else if (h->version == 4) { init>(cu, span, debug_str, ok); } else if (h->version == 5) { init>(cu, span, debug_str, ok); } else { LOG(warn) << "Unknown compilation unit version " << h->version; *ok = false; } } static bool read_dwarf2_directories(DwarfSpan span, std::vector& directories, std::vector& files) { bool ok = true; directories.push_back(nullptr); while (true) { const char* dir = span.read_null_terminated_string(&ok); if (!ok) { return ok; } if (!*dir) { break; } directories.push_back(dir); } files.push_back({ 0, nullptr }); while (true) { const char* file = span.read_null_terminated_string(&ok); if (!ok) { return ok; } if (!*file) { break; } uint64_t dir = span.read_uleb(&ok); if (dir >= directories.size()) { LOG(warn) << "Invalid directory index, bailing"; return false; } span.read_uleb(&ok); // timestamp span.read_uleb(&ok); // length if (!ok) { return ok; } files.push_back({ dir, file }); } return ok; } template bool Dwarf2LineNumberTableHeader::read_directories(const DwarfCompilationUnit&, DwarfSpan span, const DebugStrSpans&, std::vector& directories, std::vector& files) const { return read_dwarf2_directories(span, directories, files); } template bool Dwarf4LineNumberTableHeader::read_directories(const DwarfCompilationUnit&, DwarfSpan span, const DebugStrSpans&, std::vector& directories, std::vector& files) const { return read_dwarf2_directories(span, directories, files); } struct FileEntryFormat { DWLnct content_type; DWForm form; }; template bool Dwarf5LineNumberTableHeader::read_directories(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, std::vector& directories, std::vector& files) const { bool ok = true; uint64_t directory_entry_format_count = span.read_uleb(&ok); if (!ok) { return ok; } bool seen_lnct_path = false; std::vector directory_formats; for (uint64_t i = 0; i < directory_entry_format_count; ++i) { DWLnct content_type = (DWLnct)span.read_uleb(&ok); if (!ok) { return ok; } if (content_type == DW_LNCT_path) { if (seen_lnct_path) { LOG(warn) << "DW_LNCT_path appears twice in directories!"; return false; } seen_lnct_path = true; } DWForm form = (DWForm)span.read_uleb(&ok); if (!ok) { return ok; } directory_formats.push_back({ content_type, form }); } if (!seen_lnct_path) { LOG(warn) << "DW_LNCT_path does not appear in directories"; return false; } uint64_t directories_count = span.read_uleb(&ok); if (!ok) { return ok; } for (uint64_t i = 0; i < directories_count; ++i) { for (auto format: directory_formats) { switch (format.content_type) { case DW_LNCT_path: { size_t size = form_size(format.form, address_size, Size::EntrySize, &span, &ok); DwarfSpan a_span = span.consume(size); auto directory = decode_string(cu, a_span, format.form, debug_str, &ok); if (!ok) { return ok; } directories.push_back(directory); break; } default: LOG(warn) << "Unknown DW_LNCT " << format.content_type << " for directory"; return false; } } } uint64_t file_entry_format_count = span.read_uleb(&ok); if (!ok) { return ok; } seen_lnct_path = false; std::vector file_formats; for (uint64_t i = 0; i < file_entry_format_count; ++i) { DWLnct content_type = (DWLnct)span.read_uleb(&ok); if (!ok) { return ok; } if (content_type == DW_LNCT_path) { if (seen_lnct_path) { LOG(warn) << "DW_LNCT_path appears twice in files!"; return false; } seen_lnct_path = true; } DWForm form = (DWForm)span.read_uleb(&ok); if (!ok) { return ok; } file_formats.push_back({ content_type, form }); } if 
(!seen_lnct_path) { LOG(warn) << "DW_LNCT_path does not appear in files"; return false; } uint64_t files_count = span.read_uleb(&ok); if (!ok) { return ok; } for (uint64_t i = 0; i < files_count; ++i) { uint64_t directory_index = 0; const char* file_path = NULL; for (auto format: file_formats) { switch (format.content_type) { case DW_LNCT_path: { size_t size = form_size(format.form, address_size, Size::EntrySize, &span, &ok); DwarfSpan a_span = span.consume(size); file_path = decode_string(cu, a_span, format.form, debug_str, &ok); if (!ok) { return ok; } break; } case DW_LNCT_directory_index: { size_t size = form_size(format.form, address_size, Size::EntrySize, &span, &ok); DwarfSpan a_span = span.consume(size); directory_index = decode_unsigned(a_span, format.form, &ok); if (!ok) { return ok; } break; } case DW_LNCT_md5: { if (format.form != DW_FORM_data16) { LOG(warn) << "md5 has unexpected form " << format.form; return false; } size_t size = form_size(format.form, address_size, Size::EntrySize, &span, &ok); span.consume(size); break; } default: LOG(warn) << "Unknown DW_LNCT " << format.content_type << " for file"; return false; } } files.push_back({ directory_index, file_path }); } return true; } template void DwarfLineNumberTable::init(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_str, bool* ok) { auto h = span.read(ok); if (!ok) { return; } for (uint8_t i = 1; i < h->opcode_base; ++i) { span.read_uleb(ok); } if (!ok) { return; } *ok = h->read_directories(cu, span, debug_str, directories_, file_names_); } } // namespace rr rr-5.5.0/src/Dwarf.h000066400000000000000000000150041412202446200141440ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_DWARF_H_ #define RR_DWARF_H_ #include #include #include #include namespace rr { enum DWTag { DW_TAG_null = 0, DW_TAG_compile_unit = 0x11, DW_TAG_partial_unit = 0x3c, DW_TAG_skeleton_unit = 0x4a, }; enum DWAttr { DW_AT_name = 0x03, DW_AT_stmt_list = 0x10, DW_AT_comp_dir = 0x1b, DW_AT_str_offsets_base = 0x72, DW_AT_dwo_name = 0x76, DW_AT_GNU_dwo_name = 0x2130, DW_AT_GNU_dwo_id = 0x2131, }; enum DWChildren { DW_CHILDREN_no = 0x00, DW_CHILDREN_yes = 0x01 }; enum DWForm { DW_FORM_addr = 0x01, DW_FORM_block2 = 0x03, DW_FORM_block4 = 0x04, DW_FORM_data2 = 0x05, DW_FORM_data4 = 0x06, DW_FORM_data8 = 0x07, DW_FORM_string = 0x08, DW_FORM_data1 = 0x0b, DW_FORM_flag = 0x0c, DW_FORM_strp = 0x0e, DW_FORM_udata= 0x0f, DW_FORM_indirect = 0x16, DW_FORM_sec_offset = 0x17, DW_FORM_flag_present = 0x19, DW_FORM_strx = 0x1a, DW_FORM_addrx = 0x1b, DW_FORM_strp_sup = 0x1d, DW_FORM_data16 = 0x1e, DW_FORM_line_strp = 0x1f, DW_FORM_implicit_const = 0x21, DW_FORM_strx1 = 0x25, DW_FORM_strx2 = 0x26, DW_FORM_strx3 = 0x27, DW_FORM_strx4 = 0x28, DW_FORM_GNU_strp_alt = 0x1f21, }; enum DWLnct { DW_LNCT_path = 0x1, DW_LNCT_directory_index = 0x2, DW_LNCT_md5 = 0x5, }; enum DWUt { DW_UT_compile = 0x01, DW_UT_skeleton = 0x04, DW_UT_split_compile = 0x05, }; class DwarfSpan { public: DwarfSpan(const uint8_t* start, const uint8_t* end) : start(start), end(end) {} DwarfSpan(const DwarfSpan& other) = default; DwarfSpan() : start(nullptr), end(nullptr) {} size_t size() const { return end - start; } uint64_t read_uleb(bool* ok); DwarfSpan read_leb_ref(bool* ok); const char* read_null_terminated_string(bool* ok); template const T* read(bool *ok) { if (size() < sizeof(T)) { *ok = false; return nullptr; } auto ret = reinterpret_cast(start); start += sizeof(T); return ret; } template T 
read_value(bool *ok) { const T* r = read(ok); return r ? *r : T(); } bool empty() { return start == end; } DwarfSpan subspan(uint64_t offset, uint64_t sz = UINT64_MAX) const { DwarfSpan ret(*this); if (size() <= offset) { ret.start = end; return ret; } ret.start += offset; if (ret.size() <= sz) { return ret; } ret.end = ret.start + sz; return ret; } DwarfSpan consume(uint64_t sz) { DwarfSpan ret(*this); if (size() <= sz) { start = end; return ret; } ret.end = ret.start + sz; start = ret.end; return ret; } private: const uint8_t* start; const uint8_t* end; }; struct DebugStrSpans { DwarfSpan debug_str; DwarfSpan debug_str_sup; DwarfSpan debug_str_offsets; DwarfSpan debug_line_str; }; struct DwarfAbbrevAttribute { uint64_t name; DWForm form; DwarfSpan constant; // DWARF5 }; struct DwarfAbbrev { DwarfAbbrev() : tag(DW_TAG_null), children(DW_CHILDREN_no) {} std::vector attributes; DWTag tag; DWChildren children; }; class DwarfAbbrevSet { public: DwarfAbbrevSet(DwarfSpan span) : remaining_span(span) {} DwarfAbbrev* lookup(uint64_t code); private: std::unordered_map> abbrevs; DwarfSpan remaining_span; }; class DwarfAbbrevs { public: DwarfAbbrevs(DwarfSpan debug_abbrev) : debug_abbrev(debug_abbrev) {} DwarfAbbrevSet& lookup(uint64_t offset); private: DwarfSpan debug_abbrev; std::unordered_map> abbrevs; }; class DwarfCompilationUnit; class DwarfDIE { public: DwarfDIE(DwarfSpan span, DwarfAbbrevSet& abbrevs, uint8_t dwarf_size, uint8_t address_size, bool* ok); DWTag tag() const { return abbrev->tag; } // Returns empty span if not found DwarfSpan find_attribute(DWAttr attr, DWForm* form, bool* ok) const; // Returns -1 if no attr int64_t section_ptr_attr(DWAttr attr, bool* ok) const; // Sets *found to false if not found. uint64_t unsigned_attr(DWAttr attr, bool* found, bool* ok) const; // Returns nullptr if no attr const char* string_attr(const DwarfCompilationUnit& unit, DWAttr attr, const DebugStrSpans& debug_str, bool* ok) const; private: DwarfAbbrev* abbrev; DwarfSpan attr_span; uint8_t address_size; uint8_t dwarf_size; }; class DwarfCompilationUnit { public: // Consumes debug_info span and leaves rest behind static DwarfCompilationUnit next(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok); const DwarfDIE& die() const { return *die_; } uint64_t dwo_id() const { return dwo_id_; } void set_dwo_id(uint64_t dwo_id) { dwo_id_ = dwo_id; } uint64_t str_offsets_base() const { return str_offsets_base_; } void set_str_offsets_base(uint64_t str_offsets_base) { str_offsets_base_ = str_offsets_base; } uint8_t entry_size() const { return entry_size_; } uint64_t read_entry_sized_value(DwarfSpan span, bool* ok) const; private: DwarfCompilationUnit() {} template void init_size(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok); template void init(DwarfSpan* debug_info, DwarfAbbrevs& abbrevs, bool* ok); std::unique_ptr die_; uint64_t dwo_id_; uint64_t str_offsets_base_; uint8_t entry_size_; }; struct DwarfSourceFile { uint64_t directory_index; const char* file_name; }; class DwarfLineNumberTable { public: DwarfLineNumberTable(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_strs, bool* ok); // Null directory pointer means "compilation dir". The first entry is null. const std::vector& directories() const { return directories_; } // Null file name means "compilation unit name". The first entry is null. 
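  // For example, a hypothetical lookup that turns a DWARF file index into
  // a directory/file pair ('table' is a parsed DwarfLineNumberTable):
  //
  //   const DwarfSourceFile& f = table.file_names()[file_index];
  //   const char* dir = table.directories()[f.directory_index];
  //   // dir == nullptr means the compilation dir; f.file_name == nullptr
  //   // means the compilation unit name.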
const std::vector& file_names() const { return file_names_; } private: template void init_size(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_strs, bool* ok); template void init(const DwarfCompilationUnit& cu, DwarfSpan span, const DebugStrSpans& debug_strs, bool* ok); std::vector directories_; std::vector file_names_; }; #if __cplusplus == 201103L /** * Implementation of make_unique for C++11 (from https://herbsutter.com/gotw/_102/). */ template std::unique_ptr make_unique( Args&& ...args ) { return std::unique_ptr( new T( std::forward(args)... ) ); } #endif /* __cplusplus == 201103L */ } // namespace rr #endif /* RR_DWARF_H_ */ rr-5.5.0/src/ElfReader.cc000066400000000000000000000364361412202446200151040ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "ElfReader.h" #include #include #include #include "log.h" #include "util.h" using namespace std; namespace rr { class ElfReaderImplBase { public: ElfReaderImplBase(ElfReader& r) : r(r), ok_(false) {} virtual ~ElfReaderImplBase() {} virtual SymbolTable read_symbols(const char* symtab, const char* strtab) = 0; virtual DynamicSection read_dynamic() = 0; virtual Debuglink read_debuglink() = 0; virtual Debugaltlink read_debugaltlink() = 0; virtual string read_buildid() = 0; virtual bool addr_to_offset(uintptr_t addr, uintptr_t& offset) = 0; virtual SectionOffsets find_section_file_offsets(const char* name) = 0; bool ok() { return ok_; } protected: ElfReader& r; bool ok_; }; template class ElfReaderImpl : public ElfReaderImplBase { public: ElfReaderImpl(ElfReader& r); virtual SymbolTable read_symbols(const char* symtab, const char* strtab) override; virtual DynamicSection read_dynamic() override; virtual Debuglink read_debuglink() override; virtual Debugaltlink read_debugaltlink() override; virtual string read_buildid() override; virtual bool addr_to_offset(uintptr_t addr, uintptr_t& offset) override; virtual SectionOffsets find_section_file_offsets(const char* name) override; private: const typename Arch::ElfShdr* find_section(const char* n); const typename Arch::ElfEhdr* elfheader; const typename Arch::ElfShdr* sections; size_t sections_size; vector section_names; }; template unique_ptr elf_reader_impl_arch(ElfReader& r) { return unique_ptr(new ElfReaderImpl(r)); } unique_ptr elf_reader_impl(ElfReader& r, SupportedArch arch) { RR_ARCH_FUNCTION(elf_reader_impl_arch, arch, r); } template ElfReaderImpl::ElfReaderImpl(ElfReader& r) : ElfReaderImplBase(r) { elfheader = r.read(0); if (!elfheader || memcmp(elfheader, ELFMAG, SELFMAG) != 0 || elfheader->e_ident[EI_CLASS] != Arch::elfclass || elfheader->e_ident[EI_DATA] != Arch::elfendian || elfheader->e_machine != Arch::elfmachine || elfheader->e_shentsize != sizeof(typename Arch::ElfShdr) || elfheader->e_shstrndx >= elfheader->e_shnum) { LOG(debug) << "Invalid ELF file: invalid header"; return; } sections = r.read(elfheader->e_shoff, elfheader->e_shnum); if (!sections || !elfheader->e_shnum) { LOG(debug) << "Invalid ELF file: no sections"; return; } sections_size = elfheader->e_shnum; auto& section_names_section = sections[elfheader->e_shstrndx]; const char* section_names_ptr = r.read(section_names_section.sh_offset, section_names_section.sh_size); if (!section_names_ptr || !section_names_section.sh_size) { LOG(debug) << "Invalid ELF file: can't read section names"; return; } // Ensure final 0 section_names.resize(section_names_section.sh_size); memcpy(section_names.data(), section_names_ptr, 
section_names.size());
  section_names[section_names.size() - 1] = 0;
  ok_ = true;
}

template <typename Arch>
const typename Arch::ElfShdr* ElfReaderImpl<Arch>::find_section(const char* n) {
  const typename Arch::ElfShdr* section = nullptr;
  for (size_t i = 0; i < sections_size; ++i) {
    auto& s = sections[i];
    if (s.sh_name >= section_names.size()) {
      LOG(debug) << "Invalid ELF file: invalid name offset for section " << i;
      continue;
    }
    const char* name = section_names.data() + s.sh_name;
    if (strcmp(name, n) == 0) {
      if (section) {
        LOG(debug) << "Invalid ELF file: duplicate symbol section " << n;
        return nullptr;
      }
      section = &s;
    }
  }
  if (!section) {
    LOG(debug) << "Missing section " << n;
  }
  return section;
}

template <typename Arch>
SectionOffsets ElfReaderImpl<Arch>::find_section_file_offsets(
    const char* name) {
  SectionOffsets offsets = { 0, 0 };
  const typename Arch::ElfShdr* section = find_section(name);
  if (!section) {
    return offsets;
  }
  offsets.start = section->sh_offset;
  offsets.end = section->sh_offset + section->sh_size;
  return offsets;
}

template <typename Arch>
SymbolTable ElfReaderImpl<Arch>::read_symbols(const char* symtab,
                                              const char* strtab) {
  SymbolTable result;
  if (!ok()) {
    return result;
  }
  const typename Arch::ElfShdr* symbols = find_section(symtab);
  if (!symbols) {
    return result;
  }
  const typename Arch::ElfShdr* strings = find_section(strtab);
  if (!strings) {
    return result;
  }
  if (symbols->sh_entsize != sizeof(typename Arch::ElfSym)) {
    LOG(debug) << "Invalid ELF file: incorrect symbol size "
               << symbols->sh_entsize;
    return result;
  }
  if (symbols->sh_size % symbols->sh_entsize) {
    LOG(debug) << "Invalid ELF file: incorrect symbol section size "
               << symbols->sh_size;
    return result;
  }
  if (strings->sh_size == 0) {
    LOG(debug) << "Invalid ELF file: empty string table";
    return result;
  }
  size_t symbol_list_size = symbols->sh_size / symbols->sh_entsize;
  auto symbol_list = r.read<typename Arch::ElfSym>(symbols->sh_offset,
                                                   symbol_list_size);
  if (!symbol_list) {
    LOG(debug) << "Invalid ELF file: can't read symbols " << symtab;
    return result;
  }
  auto strtab_ptr = r.read<char>(strings->sh_offset, strings->sh_size);
  if (!strtab_ptr) {
    LOG(debug) << "Invalid ELF file: can't read strings " << strtab;
    return result;
  }
  result.strtab.resize(strings->sh_size);
  memcpy(result.strtab.data(), strtab_ptr, result.strtab.size());
  result.strtab[result.strtab.size() - 1] = 0;
  result.symbols.resize(symbol_list_size);
  for (size_t i = 0; i < symbol_list_size; ++i) {
    auto& s = symbol_list[i];
    if (s.st_shndx >= sections_size) {
      // Don't leave this entry uninitialized
      result.symbols[i] = SymbolTable::Symbol(0, 0);
      continue;
    }
    result.symbols[i] = SymbolTable::Symbol(s.st_value, s.st_name);
  }
  return result;
}

template <typename Arch>
DynamicSection ElfReaderImpl<Arch>::read_dynamic() {
  DynamicSection result;
  if (!ok()) {
    return result;
  }
  const typename Arch::ElfShdr* dynamic = find_section(".dynamic");
  if (!dynamic) {
    return result;
  }
  const typename Arch::ElfShdr* dynstr = find_section(".dynstr");
  if (!dynstr) {
    return result;
  }
  if (dynamic->sh_entsize != sizeof(typename Arch::ElfDyn)) {
    LOG(debug) << "Invalid ELF file: incorrect .dynamic size "
               << dynamic->sh_entsize;
    return result;
  }
  if (!dynamic->sh_size) {
    return result;
  }
  if (dynamic->sh_size % dynamic->sh_entsize) {
    LOG(debug) << "Invalid ELF file: incorrect .dynamic section size "
               << dynamic->sh_size;
    return result;
  }
  if (dynstr->sh_size == 0) {
    LOG(debug) << "Invalid ELF file: empty string table";
    return result;
  }
  size_t dyn_list_size = dynamic->sh_size / dynamic->sh_entsize;
  auto dyn_list = r.read<typename Arch::ElfDyn>(dynamic->sh_offset,
                                                dyn_list_size);
  if (!dyn_list) {
    LOG(debug) << "Invalid ELF file: can't read 
.dynamic"; return result; } auto strtab = r.read(dynstr->sh_offset, dynstr->sh_size); if (!strtab) { LOG(debug) << "Invalid ELF file: can't read .dynstr"; } result.strtab.resize(dynstr->sh_size); memcpy(result.strtab.data(), strtab, result.strtab.size()); result.strtab[result.strtab.size() - 1] = 0; result.entries.resize(dyn_list_size); for (size_t i = 0; i < dyn_list_size; ++i) { auto& s = dyn_list[i]; result.entries[i] = DynamicSection::Entry(s.d_tag, s.d_val); } return result; } static bool null_terminated(const char* p, size_t size, string& out) { size_t len = strnlen(p, size); if (len == size) { LOG(warn) << "Invalid file name"; return false; } out = string(p, len); return true; } template Debuglink ElfReaderImpl::read_debuglink() { Debuglink result; if (!ok()) { return result; } const typename Arch::ElfShdr* debuglink = find_section(".gnu_debuglink"); if (!debuglink) { return result; } if (debuglink->sh_size < 8) { LOG(warn) << "Invalid ELF file: unexpected .gnu_debuglink length"; return result; } size_t crc_offset = debuglink->sh_size - 4; if (!r.read_into(debuglink->sh_offset + crc_offset, &result.crc)) { LOG(warn) << "Invalid ELF file: can't read .gnu_debuglink crc checksum"; return result; } const char* file_name = r.read(debuglink->sh_offset, crc_offset); if (!file_name) { LOG(warn) << "Invalid ELF file: can't read .gnu_debuglink file_name"; return result; } null_terminated(file_name, crc_offset, result.file_name); return result; } template Debugaltlink ElfReaderImpl::read_debugaltlink() { Debugaltlink result; if (!ok()) { return result; } const typename Arch::ElfShdr* debuglink = find_section(".gnu_debugaltlink"); if (!debuglink) { return result; } // Last 20 bytes are the build ID of the target file. Ignore for now. if (debuglink->sh_size < 21) { LOG(warn) << "Invalid ELF file: unexpected .gnu_debugaltlink length"; return result; } size_t build_id_offset = debuglink->sh_size - 20; const char* file_name = r.read(debuglink->sh_offset, build_id_offset); if (!file_name) { LOG(warn) << "Invalid ELF file: can't read .gnu_debugaltlink file_name"; return result; } null_terminated(file_name, build_id_offset, result.file_name); return result; } template string ElfReaderImpl::read_buildid() { string result; if (!ok()) { return result; } for (size_t i = 0; i < sections_size; ++i) { auto& s = sections[i]; if (s.sh_type != SHT_NOTE) { continue; } auto offset = s.sh_offset; auto nhdr = r.read(offset); if (!nhdr) { LOG(error) << "Failed to read ELF note"; return result; } offset += sizeof(*nhdr); char name[4] = { 0 }; if (!(nhdr->n_namesz == 4 && r.read_into(offset, &name) && memcmp("GNU", name, 4) == 0 && nhdr->n_descsz > 0)) { continue; } // Note members are 4 byte aligned, twiddle bits to round up if necessary. offset += (nhdr->n_namesz + 3) & ~0x3; if (nhdr->n_type != NT_GNU_BUILD_ID) { continue; } const uint8_t* id = r.read(offset, nhdr->n_descsz); if (!id) { LOG(error) << "Failed to read ELF note contents"; return result; } result.reserve(nhdr->n_descsz); for (unsigned i = 0; i < nhdr->n_descsz; ++i) { char byte[3] = { 0 }; snprintf(&byte[0], 3, "%02x", id[i]); result.append(byte); } break; } return result; } template bool ElfReaderImpl::addr_to_offset(uintptr_t addr, uintptr_t& offset) { for (size_t i = 0; i < sections_size; ++i) { const auto& section = sections[i]; // Skip the section if it either "occupies no space in the file" or // doesn't have a valid address because it does not "occupy memory // during process execution". 
if (section.sh_type == SHT_NOBITS || !(section.sh_flags & SHF_ALLOC)) { continue; } if (addr >= section.sh_addr && addr - section.sh_addr < section.sh_size) { offset = addr - section.sh_addr + section.sh_offset; return true; } } return false; } ElfReader::ElfReader(SupportedArch arch) : arch(arch), map(nullptr), size(0) {} ElfReader::~ElfReader() {} ElfReaderImplBase& ElfReader::impl() { if (!impl_) { impl_ = elf_reader_impl(*this, arch); } return *impl_; } SymbolTable ElfReader::read_symbols(const char* symtab, const char* strtab) { return impl().read_symbols(symtab, strtab); } DynamicSection ElfReader::read_dynamic() { return impl().read_dynamic(); } Debuglink ElfReader::read_debuglink() { return impl().read_debuglink(); } Debugaltlink ElfReader::read_debugaltlink() { return impl().read_debugaltlink(); } SectionOffsets ElfReader::find_section_file_offsets(const char* name) { return impl().find_section_file_offsets(name); } DwarfSpan ElfReader::dwarf_section(const char* name) { SectionOffsets offsets = impl().find_section_file_offsets(name); return DwarfSpan(map + offsets.start, map + offsets.end); } string ElfReader::read_buildid() { return impl().read_buildid(); } bool ElfReader::addr_to_offset(uintptr_t addr, uintptr_t& offset) { return impl().addr_to_offset(addr, offset); } bool ElfReader::ok() { return impl().ok(); } ElfFileReader::ElfFileReader(ScopedFd& fd, SupportedArch arch) : ElfReader(arch) { struct stat st; if (fstat(fd, &st) < 0) { FATAL() << "Can't stat fd"; } if (st.st_size > 0) { map = static_cast(mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0)); if (map == MAP_FAILED) { FATAL() << "Can't map fd"; } } size = st.st_size; } ElfFileReader::~ElfFileReader() { if (map) { munmap(map, size); } } ScopedFd ElfFileReader::open_debug_file(const std::string& elf_file_name) { if (elf_file_name.empty() || elf_file_name[0] != '/') { return ScopedFd(); } Debuglink debuglink = read_debuglink(); if (debuglink.file_name.empty()) { return ScopedFd(); } size_t last_slash = elf_file_name.find_last_of('/'); string debug_path = "/usr/lib/debug/"; debug_path += elf_file_name.substr(0, last_slash) + '/' + debuglink.file_name; ScopedFd debug_fd(debug_path.c_str(), O_RDONLY); if (!debug_fd.is_open()) { return ScopedFd(); } // Verify that the CRC checksum matches, in case the debuginfo and text file // are in separate packages that are out of sync. uint32_t crc = 0xffffffff; while (true) { unsigned char buf[4096]; ssize_t ret = ::read(debug_fd.get(), buf, sizeof(buf)); if (ret < 0) { if (errno != EINTR) { LOG(debug) << "Error reading " << debug_path; return ScopedFd(); } } else if (ret == 0) { break; } else { crc = crc32(crc, buf, ret); } } if ((crc ^ 0xffffffff) == debuglink.crc) { return debug_fd; } return ScopedFd(); } SupportedArch ElfFileReader::identify_arch(ScopedFd& fd) { /** * This code is quite lax. That's OK because this is only used to create * a specific ElfReaderImpl, which does much more thorough checking of the * header. 
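   * For reference, the header bytes consulted below are, per the ELF spec:
   * buf[5] is e_ident[EI_DATA] (1 = ELFDATA2LSB, little-endian), and
   * buf[18..19] hold the little-endian e_machine field (0x03 = EM_386,
   * 0x3e = EM_X86_64).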
 */
  static const int header_prefix_size = 20;
  char buf[header_prefix_size];
  ssize_t ret = read_to_end(fd, 0, buf, sizeof(buf));
  if (ret != (ssize_t)sizeof(buf) || buf[5] != 1) {
    return NativeArch::arch();
  }
  switch (buf[18] | (buf[19] << 8)) {
    case 0x03:
      return x86;
    case 0x3e:
      return x86_64;
    default:
      return NativeArch::arch();
  }
}

bool ElfFileReader::is_x32_abi(__attribute__((unused)) ScopedFd& fd) {
#if defined(__x86_64__)
  static const int header_prefix_size = 20;
  char buf[header_prefix_size];
  ssize_t ret = read_to_end(fd, 0, buf, sizeof(buf));
  if (ret != (ssize_t)sizeof(buf) || buf[5] != 1) {
    // Who knows what this is.
    return false;
  }
  if ((buf[18] | (buf[19] << 8)) == 0x3e) {
    // x32 ABI programs declare themselves with the amd64 architecture but
    // only 4 byte wide pointers.
    return buf[4] == 1;
  }
#endif
  return false;
}

} // namespace rr
rr-5.5.0/src/ElfReader.h000066400000000000000000000065321412202446200147400ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#ifndef RR_ELF_READER_H_
#define RR_ELF_READER_H_

#include <memory>
#include <string>
#include <vector>

#include "Dwarf.h"
#include "ScopedFd.h"
#include "kernel_abi.h"

namespace rr {

class ElfReaderImplBase;

class SymbolTable {
public:
  const char* name(size_t i) const {
    size_t offset = symbols[i].name_index;
    return offset < strtab.size() ? &strtab[offset] : nullptr;
  }
  bool is_name(size_t i, const char* name) const {
    size_t offset = symbols[i].name_index;
    return offset < strtab.size() && strcmp(&strtab[offset], name) == 0;
  }
  uintptr_t addr(size_t i) const { return symbols[i].addr; }
  size_t size() const { return symbols.size(); }

  struct Symbol {
    Symbol(uintptr_t addr, size_t name_index)
        : addr(addr), name_index(name_index) {}
    Symbol() {}
    uintptr_t addr;
    size_t name_index;
  };
  std::vector<Symbol> symbols;
  // Last character is always null
  std::vector<char> strtab;
};

class DynamicSection {
public:
  struct Entry {
  public:
    Entry(uint64_t tag, uint64_t val) : tag(tag), val(val) {}
    Entry() {}
    uint64_t tag;
    uint64_t val;
  };
  std::vector<Entry> entries;
  // Last character is always null
  std::vector<char> strtab;
};

class Debuglink {
public:
  std::string file_name;
  uint32_t crc;
};

class Debugaltlink {
public:
  std::string file_name;
};

struct SectionOffsets {
  uint64_t start;
  uint64_t end;
};

class ElfReader {
public:
  ElfReader(SupportedArch arch);
  virtual ~ElfReader();
  const void* read_bytes(size_t offset, size_t size) {
    if (offset + size > this->size) {
      return nullptr;
    }
    return map + offset;
  }
  template <typename T> const T* read(size_t offset, size_t count = 1) {
    return static_cast<const T*>(read_bytes(offset, sizeof(T) * count));
  }
  template <typename T> bool read_into(size_t offset, T* out) {
    auto r = read<T>(offset);
    if (!r) {
      return false;
    }
    memcpy(out, r, sizeof(*out));
    return true;
  }
  bool ok();
  SymbolTable read_symbols(const char* symtab, const char* strtab);
  DynamicSection read_dynamic();
  Debuglink read_debuglink();
  Debugaltlink read_debugaltlink();
  std::string read_buildid();
  // Returns true and sets file |offset| if ELF address |addr| is mapped from
  // a section in the ELF file. Returns false if no section maps to
  // |addr|. |addr| is an address indicated by the ELF file, not its
  // relocated address in memory.
bool addr_to_offset(uintptr_t addr, uintptr_t& offset); SectionOffsets find_section_file_offsets(const char* name); DwarfSpan dwarf_section(const char* name); private: ElfReaderImplBase& impl(); std::unique_ptr impl_; SupportedArch arch; protected: uint8_t* map; size_t size; }; class ElfFileReader : public ElfReader { public: ElfFileReader(ScopedFd& fd, SupportedArch arch); ElfFileReader(ScopedFd& fd) : ElfFileReader(fd, identify_arch(fd)) {} ~ElfFileReader(); // Finds and opens the debug file corresponding to this reader. // |elf_file_name| is the name of the file already opened by this reader. ScopedFd open_debug_file(const std::string& elf_file_name); static SupportedArch identify_arch(ScopedFd& fd); static bool is_x32_abi(ScopedFd& fd); }; } // namespace rr #endif /* RR_ELF_READER_H_ */ rr-5.5.0/src/EmuFs.cc000066400000000000000000000137111412202446200142610ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "EmuFs.h" #include #include #include #include #include #include "AddressSpace.h" #include "ReplaySession.h" #include "core.h" #include "kernel_abi.h" #include "kernel_metadata.h" #include "log.h" using namespace std; namespace rr { EmuFile::~EmuFile() { LOG(debug) << " EmuFs::~File(einode:" << inode_ << ")"; owner.destroyed_file(*this); } EmuFile::shr_ptr EmuFile::clone(EmuFs& owner) { auto f = EmuFile::create(owner, orig_path.c_str(), device(), inode(), size_); // We could try using FICLONE but tmpfs doesn't support that yet so let's just // not bother for now. // Avoid copying holes. vector buf; uint64_t offset = 0; while (offset < size_) { ssize_t ret = lseek(fd(), offset, SEEK_HOLE); if (ret < 0) { ret = size_; } else { if (uint64_t(ret) < offset) { FATAL() << "lseek returned hole before requested offset"; } } uint64_t hole = ret; // Copy data while (offset < hole) { loff_t off_in = offset; loff_t off_out = offset; ssize_t ncopied = syscall(NativeArch::copy_file_range, file.get(), &off_in, f->fd().get(), &off_out, hole - offset, 0); if (ncopied >= 0) { if (ncopied == 0) { FATAL() << "Didn't copy anything"; } offset += ncopied; continue; } ssize_t amount = min(hole - offset, 4*1024*1024); buf.resize(amount); ret = pread64(fd(), buf.data(), amount, offset); if (ret <= 0) { FATAL() << "Couldn't read all the data"; } ssize_t written = pwrite_all_fallible(f->fd(), buf.data(), ret, offset); if (written < ret) { FATAL() << "Couldn't write all the data"; } offset += written; } if (offset < size_) { // Look for the end of the hole, if any ret = lseek(fd(), offset, SEEK_DATA); if (ret < 0) { if (errno != ENXIO) { FATAL() << "Couldn't find data"; } break; } if (uint64_t(ret) <= offset) { FATAL() << "Zero sized hole?"; } // Skip the hole offset = ret; } } return f; } string EmuFile::proc_path() const { stringstream ss; ss << "/proc/" << getpid() << "/fd/" << fd().get(); return ss.str(); } void EmuFile::update(dev_t device, ino_t inode, uint64_t size) { DEBUG_ASSERT(device_ == device && inode_ == inode); ensure_size(size); } void EmuFile::ensure_size(uint64_t size) { if (size_ < size) { resize_shmem_segment(file, size); size_ = size; } } std::string make_temp_name(const string& orig_path, dev_t orig_device, ino_t orig_inode) { stringstream name; name << "rr-emufs-" << getpid() << "-dev-" << orig_device << "-inode-" << orig_inode << "-" << orig_path; // The linux man page for memfd_create says the length limit for the name // argument is 249 bytes, evidently because it prepends "memfd:" to the // parameter before using it. 
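  // For example (hypothetical values): a shared mapping of "/tmp/data" with
  // device 2049 and inode 123, cloned by rr pid 1000, yields
  // "rr-emufs-1000-dev-2049-inode-123-/tmp/data" before truncation.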
return name.str().substr(0, 249); } /*static*/ EmuFile::shr_ptr EmuFile::create(EmuFs& owner, const string& orig_path, dev_t orig_device, ino_t orig_inode, uint64_t orig_file_size) { string real_name = make_temp_name(orig_path, orig_device, orig_inode); ScopedFd fd(open_memory_file(real_name)); if (!fd.is_open()) { FATAL() << "Failed to create shmem segment for " << real_name; } resize_shmem_segment(fd, orig_file_size); shr_ptr f(new EmuFile(owner, std::move(fd), orig_path, real_name, orig_device, orig_inode, orig_file_size)); LOG(debug) << "created emulated file for " << orig_path << " as " << real_name; return f; } EmuFile::EmuFile(EmuFs& owner, ScopedFd&& fd, const string& orig_path, const string& real_path, dev_t orig_device, ino_t orig_inode, uint64_t orig_file_size) : orig_path(orig_path), tmp_path(real_path), file(std::move(fd)), owner(owner), size_(orig_file_size), device_(orig_device), inode_(orig_inode) {} EmuFile::shr_ptr EmuFs::at(const KernelMapping& recorded_map) const { return files.at(FileId(recorded_map)).lock(); } bool EmuFs::has_file_for(const KernelMapping& recorded_map) const { return files.find(FileId(recorded_map)) != files.end(); } EmuFile::shr_ptr EmuFs::clone_file(EmuFile::shr_ptr file) { DEBUG_ASSERT(file); auto c = file->clone(*this); files[FileId(*file)] = c; return c; } EmuFile::shr_ptr EmuFs::get_or_create(const KernelMapping& recorded_km) { FileId id(recorded_km); auto it = files.find(id); uint64_t min_file_size = recorded_km.file_offset_bytes() + recorded_km.size(); if (it != files.end()) { it->second.lock()->update(recorded_km.device(), recorded_km.inode(), min_file_size); return it->second.lock(); } auto vf = EmuFile::create(*this, recorded_km.fsname(), recorded_km.device(), recorded_km.inode(), min_file_size); files[id] = vf; return vf; } EmuFile::shr_ptr EmuFs::find(dev_t device, ino_t inode) { FileId id(device, inode); auto it = files.find(id); if (it == files.end()) { return EmuFile::shr_ptr(); } return it->second.lock(); } void EmuFs::log() const { LOG(error) << "EmuFs " << this << " with " << files.size() << " files:"; for (auto& kv : files) { auto file = kv.second.lock(); LOG(error) << " " << file->emu_path(); } } /*static*/ EmuFs::shr_ptr EmuFs::create() { return shr_ptr(new EmuFs()); } EmuFs::EmuFs() {} FileId::FileId(const KernelMapping& recorded_map) : device(recorded_map.device()), inode(recorded_map.inode()) {} FileId::FileId(const EmuFile& emu_file) : device(emu_file.device()), inode(emu_file.inode()) {} } // namespace rr rr-5.5.0/src/EmuFs.h000066400000000000000000000132601412202446200141220ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_EMUFS_H_ #define RR_EMUFS_H_ #include #include #include #include #include "ScopedFd.h" namespace rr { class AddressSpace; class EmuFs; class KernelMapping; class ReplaySession; class Session; class Task; /** * Implement an "emulated file system" consisting of files that were * mmap'd shared during recording. These files require special * treatment because (i) they were most likely modified during * recording, so (ii) the original file contents only exist as * snapshots in the trace, but (iii) all mappings of the file must * point at the same underling resource, so that modifications are * seen by all mappees. * * The rr EmuFs creates "emulated files" in shared memory during * replay. Each efile is uniquely identified at a given event in the * trace by |(edev, einode)| (i.e., the recorded device ID and inode). 
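 * (Illustrative use, not new API: replay code obtains the backing file for
 * a recorded mapping with
 *   EmuFile::shr_ptr f = emu_fs->get_or_create(recorded_km);
 * so any two mappings that agree on (device, inode) at that event share a
 * single shmem file.)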
* "What about inode recycling", you're probably thinking to yourself. * This scheme can cope with inode recycling, given a very important * assumption discussed below. * * Why is inode recycling not a problem? Assume that an mmap'd file * F_0 at trace time t_0 has the same (device, inode) ID as a * different file F_1 at trace time t_1. By definition, if the inode * ID was recycled in [t_0, t_1), then all references to F_0 must have * been dropped in that interval. A corollary of that is that all * memory mappings of F_0 must have been fully unmapped in the * interval. As per the first long comment in |gc()| below, an * emulated file can only be "live" during replay if some tracee still * has a mapping of it. Tracees' mappings of emulated files is a * subset of the ways they can create references to real files during * recording. Therefore the event during replay that drops the last * reference to the emulated F_0 must be a tracee unmapping of F_0. * * So as long as we GC emulated F_0 at the event of its fatal * unmapping, the lifetimes of emulated F_0 and emulated F_1 must be * disjoint. And F_0 being GC'd at that point is the important * assumption mentioned above. */ class EmuFile; struct FileId { FileId(const KernelMapping& recorded_map); FileId(const EmuFile& emu_file); FileId(dev_t device, ino_t inode) : device(device), inode(inode) {} bool operator==(const FileId& other) const { return (device == other.device && inode == other.inode); } bool operator<(const FileId& other) const { return device < other.device || (device == other.device && inode < other.inode); } dev_t device; ino_t inode; }; /** * A file within an EmuFs. The file is real, but it's mapped to file * ID that was recorded during replay. */ class EmuFile { public: typedef std::shared_ptr shr_ptr; ~EmuFile(); /** * Return the fd of the real file backing this. */ const ScopedFd& fd() const { return file; } /** * Return a pathname referring to the fd of this in this * tracer's address space. For example, "/proc/12345/fd/5". */ std::string proc_path() const; /** * Return the path of the original file from recording, the * one this is emulating. */ const std::string emu_path() const { return orig_path; } const std::string real_path() const { return tmp_path; } dev_t device() const { return device_; } ino_t inode() const { return inode_; } void ensure_size(uint64_t size); private: friend class EmuFs; EmuFile(EmuFs& owner, ScopedFd&& fd, const std::string& orig_path, const std::string& real_path, dev_t device, ino_t inode, uint64_t file_size); /** * Return a copy of this file. See |create()| for the meaning * of |fs_tag|. */ shr_ptr clone(EmuFs& owner); /** * Ensure that the emulated file is sized to match a later * stat() of it. */ void update(dev_t device, ino_t inode, uint64_t size); /** * Create a new emulated file for |orig_path| that will * emulate the recorded attributes |est|. |tag| is used to * uniquely identify this file among multiple EmuFs's that * might exist concurrently in this tracer process. */ static shr_ptr create(EmuFs& owner, const std::string& orig_path, dev_t orig_device, ino_t orig_inode, uint64_t orig_file_size); std::string orig_path; std::string tmp_path; ScopedFd file; EmuFs& owner; uint64_t size_; dev_t device_; ino_t inode_; EmuFile(const EmuFile&) = delete; EmuFile operator=(const EmuFile&) = delete; }; class EmuFs { public: typedef std::shared_ptr shr_ptr; /** * Return the EmuFile for |recorded_map|, which must exist or this won't * return. 
*/ EmuFile::shr_ptr at(const KernelMapping& recorded_map) const; bool has_file_for(const KernelMapping& recorded_map) const; EmuFile::shr_ptr clone_file(EmuFile::shr_ptr emu_file); /** * Return an emulated file representing the recorded shared mapping * |recorded_km|. */ EmuFile::shr_ptr get_or_create(const KernelMapping& recorded_km); /** * Return an already-existing emulated file for the given device/inode. * Returns null if not found. */ EmuFile::shr_ptr find(dev_t device, ino_t inode); /** * Dump information about this emufs to the "error" log. */ void log() const; size_t size() const { return files.size(); } /** Create and return a new emufs. */ static shr_ptr create(); void destroyed_file(EmuFile& emu_file) { files.erase(FileId(emu_file)); } private: EmuFs(); typedef std::map> FileMap; FileMap files; EmuFs(const EmuFs&) = delete; EmuFs& operator=(const EmuFs&) = delete; }; } // namespace rr #endif // RR_EMUFS_H rr-5.5.0/src/Event.cc000066400000000000000000000130031412202446200143150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Event.h" #include #include #include #include "kernel_abi.h" #include "kernel_metadata.h" #include "log.h" #include "util.h" using namespace std; namespace rr { Event::Event(const Event& o) : event_type(o.event_type) { switch (event_type) { case EV_DESCHED: new (&Desched()) DeschedEvent(o.Desched()); return; case EV_PATCH_SYSCALL: new (&PatchSyscall()) PatchSyscallEvent(o.PatchSyscall()); return; case EV_SIGNAL: case EV_SIGNAL_DELIVERY: case EV_SIGNAL_HANDLER: new (&Signal()) SignalEvent(o.Signal()); return; case EV_SYSCALL: case EV_SYSCALL_INTERRUPTION: new (&Syscall()) SyscallEvent(o.Syscall()); return; case EV_SYSCALLBUF_FLUSH: new (&SyscallbufFlush()) SyscallbufFlushEvent(o.SyscallbufFlush()); return; default: return; } } Event::~Event() { switch (event_type) { case EV_DESCHED: Desched().~DeschedEvent(); return; case EV_PATCH_SYSCALL: PatchSyscall().~PatchSyscallEvent(); return; case EV_SIGNAL: case EV_SIGNAL_DELIVERY: case EV_SIGNAL_HANDLER: Signal().~SignalEvent(); return; case EV_SYSCALL: case EV_SYSCALL_INTERRUPTION: Syscall().~SyscallEvent(); return; case EV_SYSCALLBUF_FLUSH: SyscallbufFlush().~SyscallbufFlushEvent(); return; default: return; } } Event& Event::operator=(const Event& o) { if (this == &o) { return *this; } this->~Event(); new (this) Event(o); return *this; } bool Event::record_regs() const { switch (type()) { case EV_INSTRUCTION_TRAP: case EV_PATCH_SYSCALL: case EV_SCHED: case EV_SYSCALL: case EV_SIGNAL: case EV_SIGNAL_DELIVERY: case EV_SIGNAL_HANDLER: return true; default: return false; } } bool Event::record_extra_regs() const { switch (type()) { case EV_SYSCALL: { const SyscallEvent& sys_ev = Syscall(); // sigreturn/rt_sigreturn restores register state return sys_ev.state == EXITING_SYSCALL && (is_sigreturn(sys_ev.number, sys_ev.arch()) || is_execve_syscall(sys_ev.number, sys_ev.arch())); } case EV_SIGNAL_HANDLER: // entering a signal handler seems to clear FP/SSE regs, // so record these effects. 
return true; default: return false; } } bool Event::has_ticks_slop() const { switch (type()) { case EV_SYSCALLBUF_ABORT_COMMIT: case EV_SYSCALLBUF_FLUSH: case EV_SYSCALLBUF_RESET: case EV_DESCHED: case EV_GROW_MAP: return true; default: return false; } } bool Event::is_signal_event() const { switch (event_type) { case EV_SIGNAL: case EV_SIGNAL_DELIVERY: case EV_SIGNAL_HANDLER: return true; default: return false; } } bool Event::is_syscall_event() const { switch (event_type) { case EV_SYSCALL: case EV_SYSCALL_INTERRUPTION: return true; default: return false; } } string Event::str() const { stringstream ss; ss << type_name(); switch (event_type) { case EV_SIGNAL: case EV_SIGNAL_DELIVERY: case EV_SIGNAL_HANDLER: ss << ": " << signal_name(Signal().siginfo.si_signo) << "(" << (const char*)(Signal().deterministic == DETERMINISTIC_SIG ? "det" : "async") << ")"; break; case EV_SYSCALL: case EV_SYSCALL_INTERRUPTION: ss << ": " << syscall_name(Syscall().number, Syscall().regs.arch()); break; default: // No auxiliary information. break; } return ss.str(); } void Event::transform(EventType new_type) { switch (event_type) { case EV_SIGNAL: DEBUG_ASSERT(EV_SIGNAL_DELIVERY == new_type); break; case EV_SIGNAL_DELIVERY: DEBUG_ASSERT(EV_SIGNAL_HANDLER == new_type); break; case EV_SYSCALL: DEBUG_ASSERT(EV_SYSCALL_INTERRUPTION == new_type); break; case EV_SYSCALL_INTERRUPTION: DEBUG_ASSERT(EV_SYSCALL == new_type); break; default: FATAL() << "Can't transform immutable " << *this << " into " << new_type; } event_type = new_type; } std::string Event::type_name() const { switch (event_type) { case EV_SENTINEL: return "(none)"; #define CASE(_t) \ case EV_##_t: \ return #_t CASE(EXIT); CASE(NOOP); CASE(SCHED); CASE(SECCOMP_TRAP); CASE(INSTRUCTION_TRAP); CASE(SYSCALLBUF_FLUSH); CASE(SYSCALLBUF_ABORT_COMMIT); CASE(SYSCALLBUF_RESET); CASE(PATCH_SYSCALL); CASE(GROW_MAP); CASE(DESCHED); CASE(SIGNAL); CASE(SIGNAL_DELIVERY); CASE(SIGNAL_HANDLER); CASE(SYSCALL); CASE(SYSCALL_INTERRUPTION); CASE(TRACE_TERMINATION); #undef CASE default: FATAL() << "Unknown event type " << event_type; return ""; // not reached } } const char* state_name(SyscallState state) { switch (state) { #define CASE(_id) \ case _id: \ return #_id CASE(NO_SYSCALL); CASE(ENTERING_SYSCALL_PTRACE); CASE(ENTERING_SYSCALL); CASE(PROCESSING_SYSCALL); CASE(EXITING_SYSCALL); #undef CASE default: return "???state"; } } } // namespace rr rr-5.5.0/src/Event.h000066400000000000000000000320351412202446200141650ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_EVENT_H_ #define RR_EVENT_H_ #include #include #include #include #include #include "Registers.h" #include "core.h" #include "kernel_abi.h" #include "kernel_metadata.h" #include "preload/preload_interface.h" struct syscallbuf_record; namespace rr { /** * During recording, sometimes we need to ensure that an iteration of * RecordSession::record_step schedules the same task as in the previous * iteration. The PREVENT_SWITCH value indicates that this is required. * For example, the futex operation FUTEX_WAKE_OP modifies userspace * memory; those changes are only recorded after the system call completes; * and they must be replayed before we allow a context switch to a woken-up * task (because the kernel guarantees those effects are seen by woken-up * tasks). * Entering a potentially blocking system call must use ALLOW_SWITCH, or * we risk deadlock. 
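 * (For instance, a read() of an empty pipe can only complete after some
 * other task writes to the pipe; recording it with PREVENT_SWITCH would
 * leave rr waiting forever on a writer that never gets scheduled.)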
Most non-blocking system calls could use PREVENT_SWITCH * or ALLOW_SWITCH; for simplicity we use ALLOW_SWITCH to indicate a call could * block and PREVENT_SWITCH otherwise. * Note that even if a system call uses PREVENT_SWITCH, as soon as we've * recorded the completion of the system call, we can switch to another task. */ enum Switchable { PREVENT_SWITCH, ALLOW_SWITCH }; /** * Events serve two purposes: tracking Task state during recording, and * being stored in traces to guide replay. Some events are only used during * recording and are never actually stored in traces (and are thus irrelevant * to replay). */ enum EventType { EV_UNASSIGNED, EV_SENTINEL, // TODO: this is actually a pseudo-pseudosignal: it will never // appear in a trace, but is only used to communicate between // different parts of the recorder code that should be // refactored to not have to do that. EV_NOOP, EV_DESCHED, EV_SECCOMP_TRAP, EV_SYSCALL_INTERRUPTION, // Not stored in trace, but synthesized when we reach the end of the trace. EV_TRACE_TERMINATION, // Events present in traces: // No associated data. EV_EXIT, // Scheduling signal interrupted the trace. EV_SCHED, // A disabled RDTSC or CPUID instruction. EV_INSTRUCTION_TRAP, // Recorded syscallbuf data for one or more buffered syscalls. EV_SYSCALLBUF_FLUSH, EV_SYSCALLBUF_ABORT_COMMIT, // The syscallbuf was reset to the empty state. We record this event // later than it really happens, because during replay we must proceed to // the event *after* a syscallbuf flush and then reset the syscallbuf, // to ensure we don't reset it while preload code is still using the data. EV_SYSCALLBUF_RESET, // Syscall was entered, the syscall instruction was patched, and the // syscall was aborted. Resume execution at the patch. EV_PATCH_SYSCALL, // Map memory pages due to a (future) memory access. This is associated // with a mmap entry for the new pages. EV_GROW_MAP, // Use .signal. EV_SIGNAL, EV_SIGNAL_DELIVERY, EV_SIGNAL_HANDLER, // Use .syscall. EV_SYSCALL, EV_LAST }; /** * Desched events track the fact that a tracee's desched-event * notification fired during a may-block buffered syscall, which rr * interprets as the syscall actually blocking (for a potentially * unbounded amount of time). After the syscall exits, rr advances * the tracee to where the desched is "disarmed" by the tracee. */ struct DeschedEvent { /** Desched of |rec|. */ DeschedEvent(remote_ptr rec) : rec(rec) {} // Record of the syscall that was interrupted by a desched // notification. It's legal to reference this memory /while // the desched is being processed only/, because |t| is in the // middle of a desched, which means it's successfully // allocated (but not yet committed) this syscall record. remote_ptr rec; }; struct PatchSyscallEvent { PatchSyscallEvent() : patch_after_syscall(false) {} // If true, this patch event comes after a syscall (whereas usually they // come before). We assume the trace has put us in the correct place // and don't try to execute any code to reach this event. 
bool patch_after_syscall;
  // If true, this patch is for the caller of a vsyscall entry point
  bool patch_vsyscall;
};

struct SyscallbufFlushEvent {
  SyscallbufFlushEvent() {}
  std::vector<mprotect_record> mprotect_records;
};

enum SignalDeterministic { NONDETERMINISTIC_SIG = 0, DETERMINISTIC_SIG = 1 };
enum SignalResolvedDisposition {
  DISPOSITION_FATAL = 0,
  DISPOSITION_USER_HANDLER = 1,
  DISPOSITION_IGNORED = 2,
};

struct SignalEvent {
  /**
   * Signal |signo| is the signum, and |deterministic| is true
   * for deterministically-delivered signals (see
   * record_signal.cc).
   */
  SignalEvent(const siginfo_t& siginfo, SignalDeterministic deterministic,
              SignalResolvedDisposition disposition)
      : siginfo(siginfo),
        deterministic(deterministic),
        disposition(disposition) {}
  // Signal info
  siginfo_t siginfo;
  // True if this signal will be deterministically raised as the
  // side effect of retiring an instruction during replay, for
  // example |load $r 0x0| deterministically raises SIGSEGV.
  SignalDeterministic deterministic;
  SignalResolvedDisposition disposition;
};

/**
 * Syscall events track syscalls through entry into the kernel,
 * processing in the kernel, and exit from the kernel.
 *
 * This also models interrupted syscalls. During recording, only
 * descheduled buffered syscalls /push/ syscall interruptions; all
 * others are detected at exit time and transformed into syscall
 * interruptions from the original, normal syscalls.
 *
 * Normal system calls (interrupted or not) record two events: ENTERING_SYSCALL
 * and EXITING_SYSCALL. If the process exits before the syscall exit (because
 * this is an exit/exit_group syscall or the process gets SIGKILL), there's no
 * syscall exit event.
 *
 * When PTRACE_SYSCALL is used, there will be three events:
 * ENTERING_SYSCALL_PTRACE to run the process until it gets into the kernel,
 * then ENTERING_SYSCALL and EXITING_SYSCALL. We need three events to handle
 * PTRACE_SYSCALL with clone/fork/vfork and execve. The tracee must run to
 * the ENTERING_SYSCALL_PTRACE state, allow a context switch so the ptracer
 * can modify tracee registers, then perform ENTERING_SYSCALL (which actually
 * creates the new task or does the exec), allow a context switch so the
 * ptracer can modify the new task or post-exec state in a PTRACE_EVENT_EXEC/
 * CLONE/FORK/VFORK, then perform EXITING_SYSCALL to get into the correct
 * post-syscall state.
 *
 * When PTRACE_SYSEMU is used, there will only be one event: an
 * ENTERING_SYSCALL_PTRACE.
 */
enum SyscallState {
  // Not present in trace. Just a dummy value.
  NO_SYSCALL,
  // Run to the given register state and enter the kernel but don't
  // perform any system call processing yet.
  ENTERING_SYSCALL_PTRACE,
  // Run to the given register state and enter the kernel, if not already
  // there due to an ENTERING_SYSCALL_PTRACE, and then perform the initial part
  // of the system call (any work required before issuing a during-system-call
  // ptrace event).
  ENTERING_SYSCALL,
  // Not present in trace.
  PROCESSING_SYSCALL,
  // Already in the kernel. Perform the final part of the system call and exit
  // with the recorded system call result.
  EXITING_SYSCALL
};

struct OpenedFd {
  std::string path;
  int fd;
  dev_t device;
  ino_t inode;
};

struct SyscallEvent {
  /** Syscall |syscallno| is the syscall number. 
*/ SyscallEvent(int syscallno, SupportedArch arch) : arch_(arch), regs(arch), desched_rec(nullptr), write_offset(-1), state(NO_SYSCALL), number(syscallno), switchable(PREVENT_SWITCH), is_restart(false), failed_during_preparation(false), in_sysemu(false) {} std::string syscall_name() const { return rr::syscall_name(number, arch()); } SupportedArch arch() const { return arch_; } /** Change the architecture for this event. */ void set_arch(SupportedArch a) { arch_ = a; } SupportedArch arch_; // The original (before scratch is set up) arguments to the // syscall passed by the tracee. These are used to detect // restarted syscalls. Registers regs; // If this is a descheduled buffered syscall, points at the // record for that syscall. remote_ptr desched_rec; // Extra data for specific syscalls. Only used for exit events currently. // -1 to indicate there isn't one int64_t write_offset; std::vector exec_fds_to_close; std::vector opened; std::shared_ptr> socket_addrs; SyscallState state; // Syscall number. int number; // Records the switchable state when this syscall was prepared Switchable switchable; // True when this syscall was restarted after a signal interruption. bool is_restart; // True when this syscall failed during preparation: syscall entry events // that were interrupted by a user seccomp filter forcing SIGSYS or errno, // and clone system calls that failed. These system calls failed no matter // what the syscall-result register says. bool failed_during_preparation; // Syscall is being emulated via PTRACE_SYSEMU. bool in_sysemu; }; struct syscall_interruption_t { syscall_interruption_t(){}; }; static const syscall_interruption_t interrupted; /** * Sum type for all events (well, a C++ approximation thereof). An * Event always has a definted EventType. It can be down-casted to * one of the leaf types above iff the type tag is correct. */ struct Event { Event() : event_type(EV_UNASSIGNED) {} Event(const DeschedEvent& ev) : event_type(EV_DESCHED), desched(ev) {} Event(EventType type, const SignalEvent& ev) : event_type(type), signal(ev) {} Event(const SyscallbufFlushEvent& ev) : event_type(EV_SYSCALLBUF_FLUSH), syscallbuf_flush(ev) {} Event(const SyscallEvent& ev) : event_type(EV_SYSCALL), syscall(ev) {} Event(const syscall_interruption_t&, const SyscallEvent& ev) : event_type(EV_SYSCALL_INTERRUPTION), syscall(ev) {} Event(const Event& o); ~Event(); Event& operator=(const Event& o); DeschedEvent& Desched() { DEBUG_ASSERT(EV_DESCHED == event_type); return desched; } const DeschedEvent& Desched() const { DEBUG_ASSERT(EV_DESCHED == event_type); return desched; } PatchSyscallEvent& PatchSyscall() { DEBUG_ASSERT(EV_PATCH_SYSCALL == event_type); return patch; } const PatchSyscallEvent& PatchSyscall() const { DEBUG_ASSERT(EV_PATCH_SYSCALL == event_type); return patch; } SyscallbufFlushEvent& SyscallbufFlush() { DEBUG_ASSERT(EV_SYSCALLBUF_FLUSH == event_type); return syscallbuf_flush; } const SyscallbufFlushEvent& SyscallbufFlush() const { DEBUG_ASSERT(EV_SYSCALLBUF_FLUSH == event_type); return syscallbuf_flush; } SignalEvent& Signal() { DEBUG_ASSERT(is_signal_event()); return signal; } const SignalEvent& Signal() const { DEBUG_ASSERT(is_signal_event()); return signal; } SyscallEvent& Syscall() { DEBUG_ASSERT(is_syscall_event()); return syscall; } const SyscallEvent& Syscall() const { DEBUG_ASSERT(is_syscall_event()); return syscall; } bool record_regs() const; bool record_extra_regs() const; bool has_ticks_slop() const; /** * Return true if this is one of the indicated type of events. 
*/ bool is_signal_event() const; bool is_syscall_event() const; /** Return a string describing this. */ std::string str() const; /** * Dynamically change the type of this. Only a small number * of type changes are allowed. */ void transform(EventType new_type); /** Return the current type of this. */ EventType type() const { return event_type; } /** Return a string naming |ev|'s type. */ std::string type_name() const; static Event noop() { return Event(EV_NOOP); } static Event trace_termination() { return Event(EV_TRACE_TERMINATION); } static Event instruction_trap() { return Event(EV_INSTRUCTION_TRAP); } static Event patch_syscall() { auto ev = Event(EV_PATCH_SYSCALL); ev.PatchSyscall().patch_after_syscall = false; ev.PatchSyscall().patch_vsyscall = false; return ev; } static Event sched() { return Event(EV_SCHED); } static Event seccomp_trap() { return Event(EV_SECCOMP_TRAP); } static Event syscallbuf_abort_commit() { return Event(EV_SYSCALLBUF_ABORT_COMMIT); } static Event syscallbuf_reset() { return Event(EV_SYSCALLBUF_RESET); } static Event grow_map() { return Event(EV_GROW_MAP); } static Event exit() { return Event(EV_EXIT); } static Event sentinel() { return Event(EV_SENTINEL); } private: Event(EventType type) : event_type(type) {} EventType event_type; union { DeschedEvent desched; PatchSyscallEvent patch; SignalEvent signal; SyscallEvent syscall; SyscallbufFlushEvent syscallbuf_flush; }; }; inline static std::ostream& operator<<(std::ostream& o, const Event& ev) { return o << ev.str(); } const char* state_name(SyscallState state); } // namespace rr #endif // EVENT_H_ rr-5.5.0/src/ExtraRegisters.cc000066400000000000000000000520051412202446200162140ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "ExtraRegisters.h" #include #include "core.h" #include "log.h" #include "util.h" using namespace std; namespace rr { // This is the byte offset at which the ST0-7 register data begins // with an xsave (or fxsave) block. static const int st_regs_offset = 32; // NB: each STx register holds 10 bytes of actual data, but each // occupies 16 bytes of space within (f)xsave, presumably for // alignment purposes. static const int st_reg_space = 16; // Byte offset at which the XMM0-15 register data begins with (f)xsave. static const int xmm_regs_offset = 160; static const int xmm_reg_space = 16; static const int xsave_feature_pkru = 9; static const uint8_t fxsave_387_ctrl_offsets[] = { // The Intel documentation says that the following layout is only valid in // 32-bit mode, or when fxsave is executed in 64-bit mode without an // appropriate REX prefix. The kernel seems to only use fxsave with the // REX prefix, so one would think these offsets would be different. But // GDB seems happy to use these offsets, so that's what we use too. 
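// To illustrate the mapping: a gdb read of DREG_64_FSTAT is served from
// fxsave_387_ctrl_offsets[DREG_64_FSTAT - DREG_64_FCTRL] = byte offset 2 of
// the (f)xsave image, widened to the 4 bytes gdb expects (see
// xsave_register_data below).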
0, // DREG_64_FCTRL 2, // DREG_64_FSTAT 4, // DREG_64_FTAG 12, // DREG_64_FISEG 8, // DREG_64_FIOFF 20, // DREG_64_FOSEG 16, // DREG_64_FOOFF 6, // DREG_64_FOP }; static const int fip_offset = 8; static const int fdp_offset = 16; static const int mxcsr_offset = 24; struct RegData { int offset; int size; int xsave_feature_bit; RegData(int offset = -1, int size = 0) : offset(offset), size(size), xsave_feature_bit(-1) {} }; static bool reg_in_range(GdbRegister regno, GdbRegister low, GdbRegister high, int offset_base, int offset_stride, int size, RegData* out) { if (regno < low || regno > high) { return false; } out->offset = offset_base + offset_stride * (regno - low); out->size = size; return true; } static const int AVX_FEATURE_BIT = 2; static const size_t xsave_header_offset = 512; static const size_t xsave_header_size = 64; static const size_t xsave_header_end = xsave_header_offset + xsave_header_size; // This is always at 576 since AVX is always the first optional feature, // if present. static const size_t AVX_xsave_offset = 576; // Return the size and data location of register |regno|. // If we can't read the register, returns -1 in 'offset'. static RegData xsave_register_data(SupportedArch arch, GdbRegister regno) { // Check regno is in range, and if it's 32-bit then convert it to the // equivalent 64-bit register. switch (arch) { case x86: // Convert regno to the equivalent 64-bit version since the XSAVE layout // is compatible if (regno >= DREG_XMM0 && regno <= DREG_XMM7) { regno = (GdbRegister)(regno - DREG_XMM0 + DREG_64_XMM0); break; } if (regno >= DREG_YMM0H && regno <= DREG_YMM7H) { regno = (GdbRegister)(regno - DREG_YMM0H + DREG_64_YMM0H); break; } if (regno < DREG_FIRST_FXSAVE_REG || regno > DREG_LAST_FXSAVE_REG) { return RegData(); } if (regno == DREG_MXCSR) { regno = DREG_64_MXCSR; } else { regno = (GdbRegister)(regno - DREG_FIRST_FXSAVE_REG + DREG_64_FIRST_FXSAVE_REG); } break; case x86_64: break; default: DEBUG_ASSERT(0 && "Unknown arch"); return RegData(); } RegData result; if (reg_in_range(regno, DREG_64_ST0, DREG_64_ST7, st_regs_offset, st_reg_space, 10, &result)) { return result; } if (reg_in_range(regno, DREG_64_XMM0, DREG_64_XMM15, xmm_regs_offset, xmm_reg_space, 16, &result)) { return result; } if (reg_in_range(regno, DREG_64_YMM0H, DREG_64_YMM15H, AVX_xsave_offset, 16, 16, &result)) { result.xsave_feature_bit = AVX_FEATURE_BIT; return result; } if (regno < DREG_64_FIRST_FXSAVE_REG || regno > DREG_64_LAST_FXSAVE_REG) { return RegData(); } if (regno == DREG_64_MXCSR) { return RegData(24, 4); } DEBUG_ASSERT(regno >= DREG_64_FCTRL && regno <= DREG_64_FOP); // NB: most of these registers only occupy 2 bytes of space in // the (f)xsave region, but gdb's default x86 target // config expects us to send back 4 bytes of data for // each. return RegData(fxsave_387_ctrl_offsets[regno - DREG_64_FCTRL], 4); } static uint64_t xsave_features(const vector& data) { // If this is just FXSAVE(64) data then we we have no XSAVE header and no // XSAVE(64) features enabled. return data.size() < xsave_header_offset + xsave_header_size ? 
0 : *reinterpret_cast(data.data() + xsave_header_offset); } size_t ExtraRegisters::read_register(uint8_t* buf, GdbRegister regno, bool* defined) const { if (format_ == NT_FPR) { if (arch() != aarch64) { *defined = false; return 0; } RegData reg_data; if (DREG_V0 <= regno && regno <= DREG_V31) { reg_data = RegData(offsetof(ARM64Arch::user_fpsimd_state, vregs[0]) + ((regno - DREG_V0) * 16), 16); } else if (regno == DREG_FPSR) { reg_data = RegData(offsetof(ARM64Arch::user_fpsimd_state, fpsr), sizeof(uint32_t)); } else if (regno == DREG_FPCR) { reg_data = RegData(offsetof(ARM64Arch::user_fpsimd_state, fpcr), sizeof(uint32_t)); } else { *defined = false; return 0; } DEBUG_ASSERT(size_t(reg_data.offset + reg_data.size) <= data_.size()); *defined = true; memcpy(buf, data_.data() + reg_data.offset, reg_data.size); return reg_data.size; } if (format_ != XSAVE) { *defined = false; return 0; } auto reg_data = xsave_register_data(arch(), regno); if (reg_data.offset < 0 || empty()) { *defined = false; return reg_data.size; } DEBUG_ASSERT(reg_data.size > 0); *defined = true; // Apparently before any AVX registers are used, the feature bit is not set // in the XSAVE data, so we'll just return 0 for them here. if (reg_data.xsave_feature_bit >= 0 && !(xsave_features(data_) & (1 << reg_data.xsave_feature_bit))) { memset(buf, 0, reg_data.size); } else { DEBUG_ASSERT(size_t(reg_data.offset + reg_data.size) <= data_.size()); memcpy(buf, data_.data() + reg_data.offset, reg_data.size); } return reg_data.size; } static const int xinuse_offset = 512; uint64_t ExtraRegisters::read_xinuse(bool* defined) const { uint64_t ret; if (format_ != XSAVE || data_.size() < 512 + sizeof(ret)) { *defined = false; return 0; } memcpy(&ret, data_.data() + xinuse_offset, sizeof(ret)); return ret; } uint64_t ExtraRegisters::read_fip(bool* defined) const { if (format_ != XSAVE) { *defined = false; return 0; } uint64_t ret; memcpy(&ret, data_.data() + fip_offset, sizeof(ret)); return ret; } uint32_t ExtraRegisters::read_mxcsr(bool* defined) const { if (format_ != XSAVE) { *defined = false; return 0; } uint32_t ret; memcpy(&ret, data_.data() + mxcsr_offset, sizeof(ret)); return ret; } bool ExtraRegisters::clear_fip_fdp() { if (format_ != XSAVE) { return false; } bool ret = false; uint64_t v; memcpy(&v, data_.data() + fip_offset, sizeof(v)); if (v != 0) { ret = true; memset(data_.data() + fip_offset, 0, 8); } memcpy(&v, data_.data() + fdp_offset, sizeof(v)); if (v != 0) { ret = true; memset(data_.data() + fdp_offset, 0, 8); } return ret; } void ExtraRegisters::validate(Task* t) { if (format_ != XSAVE) { return; } ASSERT(t, data_.size() >= 512); uint32_t offset = 512; if (data_.size() > offset) { ASSERT(t, data_.size() >= offset + 64); offset += 64; uint64_t features = xsave_features(data_); if (features & (1 << AVX_FEATURE_BIT)) { ASSERT(t, data_.size() >= offset + 256); } } } static void print_reg(const ExtraRegisters& r, GdbRegister low, GdbRegister hi, const char* name, FILE* f) { uint8_t buf[128]; bool defined = false; size_t len = r.read_register(buf, low, &defined); DEBUG_ASSERT(defined && len <= 64); if (hi != GdbRegister(0)) { size_t len2 = r.read_register(buf + len, hi, &defined); if (defined) { DEBUG_ASSERT(len == len2); len += len2; } } char out[257]; bool printed_digit = false; char* p = out; for (int i = len - 1; i >= 0; --i) { if (!printed_digit && !buf[i] && i > 0) { continue; } p += sprintf(p, printed_digit ? 
"%02x" : "%x", buf[i]); printed_digit = true; } fprintf(f, "%s:0x%s", name, out); } static void print_regs(const ExtraRegisters& r, GdbRegister low, GdbRegister hi, int num_regs, const char* name_base, FILE* f) { for (int i = 0; i < num_regs; ++i) { char buf[80]; sprintf(buf, "%s%d", name_base, i); print_reg(r, (GdbRegister)(low + i), hi == GdbRegister(0) ? hi : (GdbRegister)(hi + i), buf, f); if (i < num_regs - 1) { fputc(' ', f); } } } void ExtraRegisters::print_register_file_compact(FILE* f) const { switch (arch_) { case x86: print_regs(*this, DREG_ST0, GdbRegister(0), 8, "st", f); fputc(' ', f); print_regs(*this, DREG_XMM0, DREG_YMM0H, 8, "ymm", f); break; case x86_64: print_regs(*this, DREG_64_ST0, GdbRegister(0), 8, "st", f); fputc(' ', f); print_regs(*this, DREG_64_XMM0, DREG_64_YMM0H, 16, "ymm", f); break; case aarch64: DEBUG_ASSERT(format_ == NT_FPR); print_regs(*this, DREG_V0, GdbRegister(0), 32, "v", f); fputc(' ', f); print_reg(*this, DREG_FPSR, GdbRegister(0), "fpsr", f); fputc(' ', f); print_reg(*this, DREG_FPCR, GdbRegister(0), "fpcr", f); break; default: DEBUG_ASSERT(0 && "Unknown arch"); break; } } static X86Arch::user_fpregs_struct convert_fxsave_to_x86_fpregs( const X86Arch::user_fpxregs_struct& buf) { X86Arch::user_fpregs_struct result; for (int i = 0; i < 8; ++i) { memcpy(reinterpret_cast(result.st_space) + i * 10, &buf.st_space[i * 4], 10); } result.cwd = buf.cwd | 0xffff0000; result.swd = buf.swd | 0xffff0000; // XXX Computing the correct twd is a pain. It probably doesn't matter to us // in practice. result.twd = 0; result.fip = buf.fip; result.fcs = buf.fcs; result.foo = buf.foo; result.fos = buf.fos; return result; } static void convert_x86_fpregs_to_fxsave(const X86Arch::user_fpregs_struct& buf, X86Arch::user_fpxregs_struct* result) { for (int i = 0; i < 8; ++i) { memcpy(&result->st_space[i * 4], reinterpret_cast(buf.st_space) + i * 10, 10); } result->cwd = buf.cwd; result->swd = buf.swd; // XXX Computing the correct twd is a pain. It probably doesn't matter to us // in practice. 
result->fip = buf.fip; result->fcs = buf.fcs; result->foo = buf.foo; result->fos = buf.fos; } template static vector to_vector(const T& v) { vector result; result.resize(sizeof(T)); memcpy(result.data(), &v, sizeof(T)); return result; } static bool all_zeroes(const uint8_t* data, size_t size) { for (size_t i = 0; i < size; ++i) { if (data[i]) { return false; } } return true; } static uint32_t features_used(const uint8_t* data, const XSaveLayout& layout) { uint64_t features; memcpy(&features, data + xsave_header_offset, sizeof(features)); uint64_t pkru_bit = uint64_t(1) << xsave_feature_pkru; if ((features & pkru_bit) && xsave_feature_pkru < layout.feature_layouts.size()) { // Check if it's really used const XSaveFeatureLayout& fl = layout.feature_layouts[xsave_feature_pkru]; if (uint64_t(fl.offset) + fl.size <= layout.full_size && all_zeroes(data + fl.offset, fl.size)) { features &= ~pkru_bit; } } return features; } template bool memcpy_fpr_regs_arch(std::vector& dest, const uint8_t* src, size_t data_size) { if (data_size != sizeof(typename Arch::user_fpregs_struct)) { LOG(error) << "Invalid FPR data length: " << data_size << " for architecture " << arch_name(Arch::arch()) << ", expected " << sizeof(typename Arch::user_fpregs_struct); return false; } dest.resize(sizeof(typename Arch::user_fpregs_struct)); memcpy(dest.data(), src, sizeof(typename Arch::user_fpregs_struct)); return true; } bool memcpy_fpr_regs_arch(SupportedArch arch, std::vector& dest, const uint8_t* src, size_t data_size) { RR_ARCH_FUNCTION(memcpy_fpr_regs_arch, arch, dest, src, data_size) } bool ExtraRegisters::set_to_raw_data(SupportedArch a, Format format, const uint8_t* data, size_t data_size, const XSaveLayout& layout) { arch_ = a; format_ = NONE; if (format == NONE) { return true; } else if (format == NT_FPR) { if (!memcpy_fpr_regs_arch(a, data_, data, data_size)) { return false; } format_ = NT_FPR; return true; } if (format != XSAVE) { LOG(error) << "Unknown ExtraRegisters format: " << format; return false; } format_ = XSAVE; // Now we have to convert from the input XSAVE format to our // native XSAVE format. Be careful to handle possibly-corrupt input data. const XSaveLayout& native_layout = xsave_native_layout(); if (data_size != layout.full_size) { LOG(error) << "Invalid XSAVE data length: " << data_size << ", expected " << layout.full_size; return false; } data_.resize(native_layout.full_size); DEBUG_ASSERT(data_.size() >= xsave_header_offset); if (layout.full_size < xsave_header_offset) { LOG(error) << "Invalid XSAVE layout size: " << layout.full_size; return false; } memcpy(data_.data(), data, xsave_header_offset); memset(data_.data() + xsave_header_offset, 0, data_.size() - xsave_header_offset); // Check for unsupported features being used if (layout.full_size >= xsave_header_end) { uint64_t features = features_used(data, layout); if (features & ~native_layout.supported_feature_bits) { LOG(error) << "Unsupported CPU features found: got " << HEX(features) << " (" << xsave_feature_string(features) << "), supported: " << HEX(native_layout.supported_feature_bits) << " (" << xsave_feature_string(native_layout.supported_feature_bits) << "); Consider using `rr cpufeatures` and " << "`rr record --disable-cpuid-features-(ext)`"; return false; } } if (native_layout.full_size < xsave_header_end) { // No XSAVE supported here, we're done! return true; } if (layout.full_size < xsave_header_end) { // Degenerate XSAVE format without an actual XSAVE header. Assume x87+XMM // are in use. 
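    // (Bit 0 of the XSTATE_BV bitmap is x87 state and bit 1 is SSE/XMM
    // state, so 0x3 marks exactly those two components as present.)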
uint64_t assume_features_used = 0x3; memcpy(data_.data() + xsave_header_offset, &assume_features_used, sizeof(assume_features_used)); return true; } uint64_t features = features_used(data, layout); // OK, now both our native layout and the input layout are using the full // XSAVE header. Copy the header. Make sure to use our updated `features`. memcpy(data_.data() + xsave_header_offset, &features, sizeof(features)); memcpy(data_.data() + xsave_header_offset + sizeof(features), data + xsave_header_offset + sizeof(features), xsave_header_size - sizeof(features)); // Now copy each optional and present area into the right place in our struct for (size_t i = 2; i < 64; ++i) { if (features & (uint64_t(1) << i)) { if (i >= layout.feature_layouts.size()) { LOG(error) << "Invalid feature " << i << " beyond max layout " << layout.feature_layouts.size(); return false; } const XSaveFeatureLayout& feature = layout.feature_layouts[i]; if (uint64_t(feature.offset) + feature.size > layout.full_size) { LOG(error) << "Invalid feature region: " << feature.offset << "+" << feature.size << " > " << layout.full_size; return false; } const XSaveFeatureLayout& native_feature = native_layout.feature_layouts[i]; if (feature.size != native_feature.size) { LOG(error) << "Feature " << i << " has wrong size " << feature.size << ", expected " << native_feature.size; return false; } // The CPU should guarantee these DEBUG_ASSERT(native_feature.offset > 0); DEBUG_ASSERT(native_feature.offset + native_feature.size <= native_layout.full_size); memcpy(data_.data() + native_feature.offset, data + feature.offset, feature.size); } } return true; } vector ExtraRegisters::get_user_fpregs_struct( SupportedArch arch) const { DEBUG_ASSERT(format_ == XSAVE); switch (arch) { case x86: DEBUG_ASSERT(format_ == XSAVE); DEBUG_ASSERT(data_.size() >= sizeof(X86Arch::user_fpxregs_struct)); return to_vector(convert_fxsave_to_x86_fpregs( *reinterpret_cast( data_.data()))); case x86_64: DEBUG_ASSERT(format_ == XSAVE); DEBUG_ASSERT(data_.size() >= sizeof(X64Arch::user_fpregs_struct)); return to_vector( *reinterpret_cast(data_.data())); case aarch64: DEBUG_ASSERT(format_ == NT_FPR); DEBUG_ASSERT(data_.size() == sizeof(ARM64Arch::user_fpregs_struct)); return to_vector( *reinterpret_cast(data_.data())); default: DEBUG_ASSERT(0 && "Unknown arch"); return vector(); } } void ExtraRegisters::set_user_fpregs_struct(Task* t, SupportedArch arch, void* data, size_t size) { DEBUG_ASSERT(format_ == XSAVE); switch (arch) { case x86: ASSERT(t, size >= sizeof(X86Arch::user_fpregs_struct)); ASSERT(t, data_.size() >= sizeof(X86Arch::user_fpxregs_struct)); convert_x86_fpregs_to_fxsave( *static_cast(data), reinterpret_cast(data_.data())); return; case x86_64: ASSERT(t, data_.size() >= sizeof(X64Arch::user_fpregs_struct)); ASSERT(t, size >= sizeof(X64Arch::user_fpregs_struct)); memcpy(data_.data(), data, sizeof(X64Arch::user_fpregs_struct)); return; default: DEBUG_ASSERT(0 && "Unknown arch"); } } X86Arch::user_fpxregs_struct ExtraRegisters::get_user_fpxregs_struct() const { DEBUG_ASSERT(format_ == XSAVE); DEBUG_ASSERT(arch_ == x86); DEBUG_ASSERT(data_.size() >= sizeof(X86Arch::user_fpxregs_struct)); return *reinterpret_cast(data_.data()); } void ExtraRegisters::set_user_fpxregs_struct( Task* t, const X86Arch::user_fpxregs_struct& regs) { ASSERT(t, format_ == XSAVE); ASSERT(t, arch_ == x86); ASSERT(t, data_.size() >= sizeof(X86Arch::user_fpxregs_struct)); memcpy(data_.data(), ®s, sizeof(regs)); } static void set_word(SupportedArch arch, vector& v, GdbRegister r, int 
                     int word) {
  RegData d = xsave_register_data(arch, r);
  DEBUG_ASSERT(d.size == 4);
  DEBUG_ASSERT(d.offset + d.size <= (int)v.size());
  DEBUG_ASSERT(-1 == d.xsave_feature_bit);
  *reinterpret_cast<int*>(v.data() + d.offset) = word;
}

void ExtraRegisters::reset() {
  memset(data_.data(), 0, data_.size());
  if (is_x86ish(arch())) {
    DEBUG_ASSERT(format_ == XSAVE);
    if (arch() == x86_64) {
      set_word(arch(), data_, DREG_64_MXCSR, 0x1f80);
      set_word(arch(), data_, DREG_64_FCTRL, 0x37f);
    } else {
      set_word(arch(), data_, DREG_MXCSR, 0x1f80);
      set_word(arch(), data_, DREG_FCTRL, 0x37f);
    }
    uint64_t xinuse;
    if (data_.size() >= xinuse_offset + sizeof(xinuse)) {
      memcpy(&xinuse, data_.data() + xinuse_offset, sizeof(xinuse));
      /* We have observed (Skylake, Linux 4.10) the system setting XINUSE's 0
       * bit to indicate x87-in-use, at times unrelated to x87 actually being
       * used. Work around this by setting the bit unconditionally after exec.
       */
      xinuse |= 1;
      /* If the system supports the PKRU feature, the PKRU feature bit must be
       * set in order to get the kernel to properly update the PKRU register
       * value. If this is not set, it has been observed that the PKRU register
       * may occasionally contain "stale" values, particularly after
       * involuntary context switches.
       * Avoid this issue by setting the bit if the feature is supported by
       * the CPU. */
      uint64_t pkru_bit = uint64_t(1) << xsave_feature_pkru;
      if (xcr0() & pkru_bit) {
        xinuse |= pkru_bit;
      }
      memcpy(data_.data() + xinuse_offset, &xinuse, sizeof(xinuse));
    }
  } else {
    DEBUG_ASSERT(format_ == NT_FPR);
    DEBUG_ASSERT(arch() == aarch64 &&
                 "Ensure that nothing is required here for your architecture.");
  }
}

} // namespace rr
rr-5.5.0/src/ExtraRegisters.h000066400000000000000000000103471412202446200160610ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#ifndef RR_EXTRA_REGISTERS_H_
#define RR_EXTRA_REGISTERS_H_

#include <stddef.h>
#include <stdio.h>

#include <vector>

#include "GdbRegister.h"
#include "kernel_abi.h"

namespace rr {

struct XSaveLayout;

/**
 * An ExtraRegisters object contains values for all user-space-visible
 * registers other than those in Registers.
 *
 * Task is responsible for creating meaningful values of this class.
 *
 * The only reason this class has an arch() is to enable us to
 * interpret GdbRegister.
 */
class ExtraRegisters {
public:
  // Create empty (uninitialized/unknown registers) value
  ExtraRegisters(SupportedArch arch = SupportedArch(-1))
      : format_(NONE), arch_(arch) {}

  enum Format {
    NONE,
    /**
     * The XSAVE format is x86(_64) only.
     * On a x86 64-bit kernel, these structures are initialized by an XSAVE64
     * or FXSAVE64.
     * On a x86 32-bit kernel, they are initialized by an XSAVE or FXSAVE.
     *
     * The layouts are basically the same in the first 512 bytes --- an
     * FXSAVE(64) area. The differences are:
     * -- On a 64-bit kernel, registers XMM8-XMM15 are saved, but on a 32-bit
     * kernel they are not (that space is reserved).
     * -- On a 64-bit kernel, bytes 8-15 store a 64-bit "FPU IP" address,
     * but on a 32-bit kernel they store "FPU IP/CS". Likewise,
     * bytes 16-23 store "FPU DP" or "FPU DP/DS".
     * We basically ignore these differences. If gdb requests 32-bit-specific
     * registers, we return them, assuming that the data there is valid.
     *
     * XSAVE/XSAVE64 have extra information after the first 512 bytes, which
     * we currently save and restore but do not otherwise use. If the data
     * record has more than 512 bytes then it's an XSAVE(64) area, otherwise
     * it's just the FXSAVE(64) area.
     *
     * The data always uses our CPU's native XSAVE layout.
     * When reading a trace, we need to convert from the trace's CPU's XSAVE
     * layout to our layout.
     */
    XSAVE,
    /**
     * Stores the content of the NT_FPREGS regset. The format depends on the
     * architecture. It is given by Arch::user_fpregs_struct for the
     * appropriate architecture.
     */
    NT_FPR
  };

  // Set values from raw data, with the given XSAVE layout. Returns false
  // if this could not be done.
  bool set_to_raw_data(SupportedArch a, Format format, const uint8_t* data,
                       size_t data_size, const XSaveLayout& layout);

  Format format() const { return format_; }
  SupportedArch arch() const { return arch_; }
  const std::vector<uint8_t> data() const { return data_; }
  int data_size() const { return data_.size(); }
  const uint8_t* data_bytes() const { return data_.data(); }
  bool empty() const { return data_.empty(); }

  /**
   * Read XSAVE `xinuse` field
   */
  uint64_t read_xinuse(bool* defined) const;

  /**
   * Read FIP field
   */
  uint64_t read_fip(bool* defined) const;

  /**
   * Read MXCSR field
   */
  uint32_t read_mxcsr(bool* defined) const;

  /**
   * Clear FIP and FDP registers if they're present.
   * Returns true if the registers changed.
   */
  bool clear_fip_fdp();

  /**
   * Like |Registers::read_register()|, except attempts to read
   * the value of an "extra register" (floating point / vector).
   */
  size_t read_register(uint8_t* buf, GdbRegister regno, bool* defined) const;

  /**
   * Get a user_fpregs_struct for a particular Arch from these ExtraRegisters.
   */
  std::vector<uint8_t> get_user_fpregs_struct(SupportedArch arch) const;

  /**
   * Update registers from a user_fpregs_struct.
   */
  void set_user_fpregs_struct(Task* t, SupportedArch arch, void* data,
                              size_t size);

  /**
   * Get a user_fpxregs_struct from these ExtraRegisters.
   */
  X86Arch::user_fpxregs_struct get_user_fpxregs_struct() const;

  /**
   * Update registers from a user_fpxregs_struct.
   */
  void set_user_fpxregs_struct(Task* t,
                               const X86Arch::user_fpxregs_struct& regs);

  void print_register_file_compact(FILE* f) const;

  /**
   * Reset to post-exec initial state
   */
  void reset();

  void validate(Task* t);

private:
  friend class Task;

  Format format_;
  SupportedArch arch_;
  std::vector<uint8_t> data_;
};

} // namespace rr

#endif /* RR_EXTRA_REGISTERS_H_ */
rr-5.5.0/src/FdTable.cc000066400000000000000000000143631412202446200145470ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "FdTable.h"

#include <limits.h>

#include <unordered_set>

#include "rr/rr.h"

#include "AddressSpace.h"
#include "RecordTask.h"
#include "ReplayTask.h"
#include "Session.h"
#include "core.h"
#include "log.h"

using namespace std;

namespace rr {

void FdTable::add_monitor(Task* t, int fd, FileMonitor* monitor) {
  // In the future we could support multiple monitors on an fd, but we don't
  // need to yet.
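  // A minimal sketch of the bookkeeping invariant maintained here and kept
  // consistent by did_dup()/did_close() below (standalone illustration, not
  // rr's real types; SIZE stands in for SYSCALLBUF_FDS_DISABLED_SIZE):
  //
  //   std::unordered_map<int, std::shared_ptr<FileMonitor>> fds;
  //   uint32_t beyond_limit = 0;  // monitored fds with no table slot
  //   void add(int fd, std::shared_ptr<FileMonitor> m) {
  //     if (fd >= SIZE && !fds.count(fd)) {
  //       ++beyond_limit;  // all such fds share the table's last slot
  //     }
  //     fds[fd] = std::move(m);
  //   }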
ASSERT(t, !is_monitoring(fd)) << "Task " << t->rec_tid << " already monitoring fd " << fd; if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE && fds.count(fd) == 0) { fd_count_beyond_limit++; } fds[fd] = FileMonitor::shr_ptr(monitor); update_syscallbuf_fds_disabled(fd); } bool FdTable::is_rr_fd(int fd) { auto it = fds.find(fd); if (it == fds.end()) { return false; } return it->second->is_rr_fd(); } bool FdTable::emulate_ioctl(int fd, RecordTask* t, uint64_t* result) { auto it = fds.find(fd); if (it == fds.end()) { return false; } return it->second->emulate_ioctl(t, result); } bool FdTable::emulate_fcntl(int fd, RecordTask* t, uint64_t* result) { auto it = fds.find(fd); if (it == fds.end()) { return false; } return it->second->emulate_fcntl(t, result); } bool FdTable::emulate_read(int fd, RecordTask* t, const std::vector& ranges, FileMonitor::LazyOffset& offset, uint64_t* result) { auto it = fds.find(fd); if (it == fds.end()) { return false; } return it->second->emulate_read(t, ranges, offset, result); } void FdTable::filter_getdents(int fd, RecordTask* t) { auto it = fds.find(fd); if (it == fds.end()) { return; } it->second->filter_getdents(t); } Switchable FdTable::will_write(Task* t, int fd) { auto it = fds.find(fd); if (it == fds.end()) { return ALLOW_SWITCH; } return it->second->will_write(t); } void FdTable::did_write(Task* t, int fd, const std::vector& ranges, FileMonitor::LazyOffset& offset) { auto it = fds.find(fd); if (it != fds.end()) { it->second->did_write(t, ranges, offset); } } void FdTable::did_dup(int from, int to) { if (fds.count(from)) { if (to >= SYSCALLBUF_FDS_DISABLED_SIZE && fds.count(to) == 0) { fd_count_beyond_limit++; } fds[to] = fds[from]; } else { if (to >= SYSCALLBUF_FDS_DISABLED_SIZE && fds.count(to) > 0) { fd_count_beyond_limit--; } fds.erase(to); } update_syscallbuf_fds_disabled(to); } void FdTable::did_close(int fd) { LOG(debug) << "Close fd " << fd; if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE && fds.count(fd) > 0) { fd_count_beyond_limit--; } fds.erase(fd); update_syscallbuf_fds_disabled(fd); } FileMonitor* FdTable::get_monitor(int fd) { auto it = fds.find(fd); if (it == fds.end()) { return nullptr; } return it->second.get(); } static syscallbuf_fd_classes join_fd_classes_over_tasks(AddressSpace* vm, int fd) { syscallbuf_fd_classes cls = FD_CLASS_UNTRACED; for (Task* t : vm->task_set()) { auto table = t->fd_table(); if (table->is_monitoring(fd)) { if (cls != FD_CLASS_UNTRACED) { return FD_CLASS_TRACED; } cls = table->get_monitor(fd)->get_syscallbuf_class(); } else if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE - 1 && table->count_beyond_limit() > 0) { return FD_CLASS_TRACED; } } return cls; } void FdTable::update_syscallbuf_fds_disabled(int fd) { DEBUG_ASSERT(fd >= 0); DEBUG_ASSERT(task_set().size() > 0); unordered_set vms_updated; // It's possible for tasks with different VMs to share this fd table. // But tasks with the same VM might have different fd tables... 
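  // Illustration of the fd clamping performed below (fd values are made up):
  //
  //   fd 100     -> syscallbuf_fd_class[100]
  //   fd 999999  -> syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE - 1]
  //
  // Every fd >= SYSCALLBUF_FDS_DISABLED_SIZE - 1 shares that final slot, so
  // it must fall back to FD_CLASS_TRACED whenever any such fd is monitored
  // anywhere in the address space (see join_fd_classes_over_tasks above).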
for (Task* t : task_set()) { if (!t->session().is_recording()) { return; } RecordTask* rt = static_cast(t); if (rt->already_exited()) { continue; } AddressSpace* vm = rt->vm().get(); if (vms_updated.find(vm) != vms_updated.end()) { continue; } vms_updated.insert(vm); if (!rt->preload_globals.is_null()) { if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE) { fd = SYSCALLBUF_FDS_DISABLED_SIZE - 1; } char disable = (char)join_fd_classes_over_tasks(vm, fd); auto addr = REMOTE_PTR_FIELD(t->preload_globals, syscallbuf_fd_class[0]) + fd; rt->write_mem(addr, disable); rt->record_local(addr, &disable); } } } void FdTable::init_syscallbuf_fds_disabled(Task* t) { if (!t->session().is_recording()) { return; } RecordTask* rt = static_cast(t); ASSERT(rt, has_task(rt)); if (rt->preload_globals.is_null()) { return; } char disabled[SYSCALLBUF_FDS_DISABLED_SIZE]; memset(disabled, 0, sizeof(disabled)); // It's possible that some tasks in this address space have a different // FdTable. We need to disable syscallbuf for an fd if any tasks for this // address space are monitoring the fd. for (Task* vm_t : rt->vm()->task_set()) { for (auto& it : vm_t->fd_table()->fds) { int fd = it.first; DEBUG_ASSERT(fd >= 0); if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE) { fd = SYSCALLBUF_FDS_DISABLED_SIZE - 1; } if (disabled[fd] == FD_CLASS_UNTRACED) { disabled[fd] = it.second->get_syscallbuf_class(); } else { disabled[fd] = FD_CLASS_TRACED; } } } auto addr = REMOTE_PTR_FIELD(t->preload_globals, syscallbuf_fd_class[0]); rt->write_mem(addr, disabled, SYSCALLBUF_FDS_DISABLED_SIZE); rt->record_local(addr, disabled, SYSCALLBUF_FDS_DISABLED_SIZE); } void FdTable::close_after_exec(ReplayTask* t, const vector& fds_to_close) { ASSERT(t, has_task(t)); for (auto fd : fds_to_close) { did_close(fd); } } static bool is_fd_open(Task* t, int fd) { char path[PATH_MAX]; sprintf(path, "/proc/%d/fd/%d", t->tid, fd); struct stat st; return 0 == lstat(path, &st); } vector FdTable::fds_to_close_after_exec(RecordTask* t) { ASSERT(t, has_task(t)); vector fds_to_close; for (auto& it : fds) { if (!is_fd_open(t, it.first)) { fds_to_close.push_back(it.first); } } for (auto fd : fds_to_close) { did_close(fd); } return fds_to_close; } } // namespace rr rr-5.5.0/src/FdTable.h000066400000000000000000000047261412202446200144130ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_FD_TABLE_H_ #define RR_FD_TABLE_H_ #include #include #include #include "FileMonitor.h" #include "HasTaskSet.h" namespace rr { class RecordTask; class ReplayTask; class Task; class FdTable : public HasTaskSet { public: typedef std::shared_ptr shr_ptr; void add_monitor(Task* t, int fd, FileMonitor* monitor); bool emulate_ioctl(int fd, RecordTask* t, uint64_t* result); bool emulate_fcntl(int fd, RecordTask* t, uint64_t* result); bool emulate_read(int fd, RecordTask* t, const std::vector& ranges, FileMonitor::LazyOffset& offset, uint64_t* result); void filter_getdents(int fd, RecordTask* t); bool is_rr_fd(int fd); Switchable will_write(Task* t, int fd); void did_write(Task* t, int fd, const std::vector& ranges, FileMonitor::LazyOffset& offset); void did_dup(int from, int to); void did_close(int fd); shr_ptr clone() const { return shr_ptr(new FdTable(*this)); } static shr_ptr create(Task* t) { shr_ptr fds(new FdTable()); fds->insert_task(t); return fds; } bool is_monitoring(int fd) const { return fds.count(fd) > 0; } uint32_t count_beyond_limit() const { return fd_count_beyond_limit; } FileMonitor* get_monitor(int fd); /** * Regenerate 
syscallbuf_fds_disabled in task |t|. * Called during initialization of the preload library. */ void init_syscallbuf_fds_disabled(Task* t); /** * Get list of fds that have been closed after |t| has done an execve. * Rather than tracking CLOEXEC flags (which would be complicated), we just * scan /proc//fd during recording and note any monitored fds that have * been closed. * This also updates our table to match reality. */ std::vector fds_to_close_after_exec(RecordTask* t); /** * Close fds in list after an exec. */ void close_after_exec(ReplayTask* t, const std::vector& fds_to_close); private: FdTable() : fd_count_beyond_limit(0) {} FdTable(const FdTable& other) : fds(other.fds), fd_count_beyond_limit(other.fd_count_beyond_limit) {} void update_syscallbuf_fds_disabled(int fd); std::unordered_map fds; // Number of elements of `fds` that are >= SYSCALLBUF_FDS_DISABLED_SIZE uint32_t fd_count_beyond_limit; }; } // namespace rr #endif /* RR_FD_TABLE_H_ */ rr-5.5.0/src/FileMonitor.cc000066400000000000000000000054771412202446200155030ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "FileMonitor.h" #include #include #include #include "RecordTask.h" #include "ReplayTask.h" #include "Session.h" #include "log.h" namespace rr { using namespace std; template static bool is_implicit_offset_syscall_arch(int syscallno) { return syscallno == Arch::writev || syscallno == Arch::write || syscallno == Arch::readv || syscallno == Arch::read; } template static bool is_write_syscall_arch(int syscallno) { return syscallno == Arch::writev || syscallno == Arch::write || syscallno == Arch::pwrite64 || syscallno == Arch::pwritev; } static bool is_implict_offset_syscall(SupportedArch arch, int syscallno) { RR_ARCH_FUNCTION(is_implicit_offset_syscall_arch, arch, syscallno); } template static int64_t retrieve_offset_arch(Task* t, int syscallno, const Registers& regs) { switch (syscallno) { case Arch::pwrite64: case Arch::pwritev: case Arch::pread64: case Arch::preadv: { if (sizeof(typename Arch::unsigned_word) == 4) { return regs.arg4() | (uint64_t(regs.arg5_signed()) << 32); } return regs.arg4_signed(); } case Arch::readv: case Arch::read: case Arch::writev: case Arch::write: { ASSERT(t, t->session().is_recording()) << "Can only read a file descriptor's offset while recording"; int fd = regs.orig_arg1_signed(); int64_t offset = t->fd_offset(fd); return is_write_syscall_arch(syscallno) ? // The pos we just read, was after the write completed. Luckily, we do // know how many bytes were written. offset - regs.syscall_result() : offset; } default: { ASSERT(t, false) << "Can not retrieve offset for this system call."; return -1; } } } static int64_t retrieve_offset(Task* t, int syscallno, const Registers& regs) { RR_ARCH_FUNCTION(retrieve_offset_arch, t->arch(), t, syscallno, regs); } int64_t FileMonitor::LazyOffset::retrieve(bool needed_for_replay) { bool is_replay = t->session().is_replaying(); bool is_implicit_offset = is_implict_offset_syscall(t->arch(), syscallno); ASSERT(t, needed_for_replay || !is_replay); // There is no way we can figure out this information now, so retrieve it // from the trace (we record it below under the same circumstance). 
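  // Worked example of the implicit-offset arithmetic in retrieve_offset()
  // above (the numbers are made up): a write(fd, buf, 100) that returned 100
  // leaves the fd's file position at, say, 350. The position we can observe
  // is post-write, so the offset the write actually started at is
  // 350 - 100 = 250, i.e. fd_offset(fd) - regs.syscall_result().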
if (is_replay && is_implicit_offset) { return static_cast(t) ->current_trace_frame() .event() .Syscall() .write_offset; } int64_t offset = retrieve_offset(t, syscallno, regs); if (needed_for_replay && is_implicit_offset) { static_cast(t)->ev().Syscall().write_offset = offset; } return offset; } } rr-5.5.0/src/FileMonitor.h000066400000000000000000000070731412202446200153370ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_FILE_MONITOR_H_ #define RR_FILE_MONITOR_H_ class Task; #include #include #include #include #include "preload/preload_interface.h" #include "util.h" namespace rr { class RecordTask; class Registers; class FileMonitor { public: typedef std::shared_ptr shr_ptr; virtual ~FileMonitor() {} enum Type { Base, MagicSaveData, Mmapped, Preserve, ProcFd, ProcMem, Stdio, VirtualPerfCounter, SysCpu, ProcStat, RRPage }; virtual Type type() { return Base; } /** * Overriding this to return true will cause close() (and related fd-smashing * operations such as dup2) to return EBADF, and hide it from the tracee's * /proc/pid/fd/ */ virtual bool is_rr_fd() { return false; } /** * Notification that task |t| is about to write |data| bytes of length * |length| to the file. * In general writes can block, and concurrent blocking writes to the same * file may race so that the kernel performs writes out of order * with respect to will_write notifications. * If it is known that the write cannot block (or that blocking all of rr * on it is OK), this notification can return PREVENT_SWITCH to make the * write a blocking write. This ensures that writes are performed in the order * of will_write notifications. */ virtual Switchable will_write(Task*) { return ALLOW_SWITCH; } /** * Notification that task |t| wrote to the file descriptor. * Due to races, if will_write did not return PREVENT_SWITCH, it's possible * that the data in the buffers is not what was actually written. */ struct Range { remote_ptr data; size_t length; Range(remote_ptr data, size_t length) : data(data), length(length) {} }; /** * Encapsulates the offset at which to read or write. Computing this may be * an expensive operation if the offset is implicit (i.e. is taken from the * file descriptor), so we only do it if we actually need to look at the * offset. */ class LazyOffset { public: LazyOffset(Task* t, const Registers& regs, int64_t syscallno) : t(t), regs(regs), syscallno(syscallno) {} int64_t retrieve(bool needed_for_replay); private: Task* t; const Registers& regs; int64_t syscallno; }; virtual void did_write(Task*, const std::vector&, LazyOffset&) {} /** * Return true if the ioctl should be fully emulated. If so the result * is stored in the last parameter. * Only called during recording. */ virtual bool emulate_ioctl(RecordTask*, uint64_t*) { return false; } /** * Return true if the fcntl should should be fully emulated. If so the * result is stored in the last parameter. * Only called during recording. */ virtual bool emulate_fcntl(RecordTask*, uint64_t*) { return false; } /** * Return true if the read should should be fully emulated. If so the * result is stored in the last parameter. The emulation should write to the * task's memory ranges. * Only called during recording. */ virtual bool emulate_read(RecordTask*, const std::vector&, LazyOffset&, uint64_t*) { return false; } /** * Allows the FileMonitor to rewrite the output of a getdents/getdents64 call * if desired. 
*/ virtual void filter_getdents(RecordTask*) {} virtual enum syscallbuf_fd_classes get_syscallbuf_class() { return FD_CLASS_TRACED; } }; } // namespace rr #endif /* RR_FILE_MONITOR_H_ */ rr-5.5.0/src/FileNameCommand.cc000066400000000000000000000042571412202446200162260ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include #include "AddressSpace.h" #include "Command.h" #include "TraceStream.h" #include "core.h" #include "main.h" #include "util.h" using namespace std; namespace rr { class FileNameCommand : public Command { public: virtual int run(vector& args) override; protected: FileNameCommand(const char* name, const char* help) : Command(name, help) {} bool parse_file_name(vector& args, string* out); static FileNameCommand singleton; }; FileNameCommand FileNameCommand::singleton( "filename", " rr filename \n" " Prints the original filename for a given trace file name.\n"); static void print_original_file_name(const string& trace_dir, const string& file_name, FILE* out) { TraceReader trace(trace_dir); unordered_set original_files; string full_file_name = trace.dir() + "/" + file_name; while (true) { TraceReader::MappedData data; bool found; KernelMapping km = trace.read_mapped_region( &data, &found, TraceReader::VALIDATE, TraceReader::ANY_TIME); if (!found) { break; } if (data.source == TraceReader::SOURCE_FILE && data.file_name == full_file_name && !km.fsname().empty() && original_files.find(km.fsname()) == original_files.end()) { fprintf(out, "%s\n", km.fsname().c_str()); original_files.insert(km.fsname()); } } } bool FileNameCommand::parse_file_name(vector& args, string* out) { if (args.empty() || !verify_not_option(args)) { return false; } *out = args[0]; args.erase(args.begin()); return true; } int FileNameCommand::run(vector& args) { string file_name; if (!parse_file_name(args, &file_name) || !args.empty()) { print_help(stderr); return 1; } string trace_dir; size_t last_slash = file_name.rfind('/'); if (last_slash == string::npos) { trace_dir = "."; } else { trace_dir = file_name.substr(0, last_slash); file_name = file_name.substr(last_slash + 1); } print_original_file_name(trace_dir, file_name, stdout); return 0; } } // namespace rr rr-5.5.0/src/Flags.cc000066400000000000000000000003251412202446200142730ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Flags.h" namespace rr { Flags& Flags::get_for_init() { return singleton; } Flags Flags::singleton; } // namespace rr rr-5.5.0/src/Flags.h000066400000000000000000000047001412202446200141360ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_FLAGS_H_ #define RR_FLAGS_H_ #include #include #include #include "Ticks.h" #include "TraceFrame.h" namespace rr { /** * Command line arguments for rr */ struct Flags { enum { CHECKSUM_NONE = -3, CHECKSUM_SYSCALL = -2, CHECKSUM_ALL = -1 }; /* When to generate or check memory checksums. One of CHECKSUM_NONE, * CHECKSUM_SYSCALL or CHECKSUM_ALL, or a positive integer representing the * event time at which to start checksumming. */ FrameTime checksum; enum { DUMP_ON_ALL = 10000, DUMP_ON_RDTSC = 10001, DUMP_ON_NONE = -DUMP_ON_ALL }; int dump_on; enum { DUMP_AT_NONE = -1 }; /* time at which to create memory dump */ FrameTime dump_at; // global time // Force rr to do some things that it otherwise wouldn't, for // example launching an emergency debugger when the output // doesn't seem to be a tty. 
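  // (Worked example for the `checksum` field above, using an illustrative
  // event number: CHECKSUM_ALL checksums tracee memory after every event,
  // CHECKSUM_SYSCALL only after syscalls, CHECKSUM_NONE never; a positive
  // value such as 5000 starts checksumming at trace event 5000.)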
bool force_things; /* Mark the trace global time along with tracee writes to * stdio. */ bool mark_stdio; // Check that cached mmaps match /proc/maps after each event. bool check_cached_mmaps; // Suppress warnings related to environmental features outside rr's // control. bool suppress_environment_warnings; // Any warning or error that would be printed is treated as fatal bool fatal_errors_and_warnings; // Pretend CPUID faulting support doesn't exist bool disable_cpuid_faulting; // Don't listen for PTRACE_EVENT_EXIT events, to test how rr handles // missing PTRACE_EVENT_EXITs. bool disable_ptrace_exit_events; // User override for architecture detection, e.g. when running // under valgrind. std::string forced_uarch; // User override for the path to page files and other resources. std::string resource_path; Flags() : checksum(CHECKSUM_NONE), dump_on(DUMP_ON_NONE), dump_at(DUMP_AT_NONE), force_things(false), mark_stdio(false), check_cached_mmaps(false), suppress_environment_warnings(false), fatal_errors_and_warnings(false), disable_cpuid_faulting(false), disable_ptrace_exit_events(false) {} static const Flags& get() { return singleton; } /** * Get a reference that can be used to initialize the global Flags. * Can only be called once. */ static Flags& get_for_init(); private: static Flags singleton; }; } // namespace rr #endif /* RR_FLAGS_H_ */ rr-5.5.0/src/GdbCommand.cc000066400000000000000000000134571412202446200152440ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "GdbCommand.h" #include "ReplayTask.h" #include "log.h" using namespace std; namespace rr { static SimpleGdbCommand elapsed_time( "elapsed-time", "Print elapsed time (in seconds) since the start of the trace, in the" " 'record' timeline.", [](GdbServer&, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } ReplayTask* replay_t = static_cast(t); double elapsed_time = replay_t->current_trace_frame().monotonic_time() - replay_t->session().get_trace_start_time(); return string("Elapsed Time (s): ") + to_string(elapsed_time); }); static SimpleGdbCommand when( "when", "Print the current rr event number.", [](GdbServer&, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } return string("Current event: ") + to_string( static_cast(t)->current_trace_frame().time()); }); static SimpleGdbCommand when_ticks( "when-ticks", "Print the current rr tick count for the current thread.", [](GdbServer&, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } return string("Current tick: ") + to_string(t->tick_count()); }); static SimpleGdbCommand when_tid( "when-tid", "Print the real tid for the current thread.", [](GdbServer&, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } return string("Current tid: ") + to_string(t->tid); }); static std::vector back_stack; static ReplayTimeline::Mark current_history_cp; static std::vector forward_stack; static SimpleGdbCommand rr_history_push( "rr-history-push", "Push an entry into the rr history.", [](GdbServer& gdb_server, Task* t, const vector&) { if (!t->session().is_replaying()) { // Don't create new history state inside a diversion return string(); } if (current_history_cp) { back_stack.push_back(current_history_cp); } current_history_cp = gdb_server.get_timeline().mark(); forward_stack.clear(); return 
string(); }); static SimpleGdbCommand back( "back", "Go back one entry in the rr history.", [](GdbServer& gdb_server, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } if (back_stack.size() == 0) { return string("Can't go back. No more history entries."); } forward_stack.push_back(current_history_cp); current_history_cp = back_stack.back(); back_stack.pop_back(); gdb_server.get_timeline().seek_to_mark(current_history_cp); return string(); }); static SimpleGdbCommand forward( "forward", "Go forward one entry in the rr history.", [](GdbServer& gdb_server, Task* t, const vector&) { if (!t->session().is_replaying()) { return GdbCommandHandler::cmd_end_diversion(); } if (forward_stack.size() == 0) { return string("Can't go forward. No more history entries."); } back_stack.push_back(current_history_cp); current_history_cp = forward_stack.back(); forward_stack.pop_back(); gdb_server.get_timeline().seek_to_mark(current_history_cp); return string(); }); static int gNextCheckpointId = 0; string invoke_checkpoint(GdbServer& gdb_server, Task*, const vector& args) { const string& where = args[1]; int checkpoint_id = ++gNextCheckpointId; GdbServer::Checkpoint::Explicit e; if (gdb_server.timeline.can_add_checkpoint()) { e = GdbServer::Checkpoint::EXPLICIT; } else { e = GdbServer::Checkpoint::NOT_EXPLICIT; } gdb_server.checkpoints[checkpoint_id] = GdbServer::Checkpoint( gdb_server.timeline, gdb_server.last_continue_tuid, e, where); return string("Checkpoint ") + to_string(checkpoint_id) + " at " + where; } static SimpleGdbCommand checkpoint( "checkpoint", "create a checkpoint representing a point in the execution\n" "use the 'restart' command to return to the checkpoint", invoke_checkpoint); string invoke_delete_checkpoint(GdbServer& gdb_server, Task*, const vector& args) { int id = stoi(args[1]); auto it = gdb_server.checkpoints.find(id); if (it != gdb_server.checkpoints.end()) { if (it->second.is_explicit == GdbServer::Checkpoint::EXPLICIT) { gdb_server.timeline.remove_explicit_checkpoint(it->second.mark); } gdb_server.checkpoints.erase(it); return string("Deleted checkpoint ") + to_string(id) + "."; } else { return string("No checkpoint number ") + to_string(id) + "."; } } static SimpleGdbCommand delete_checkpoint( "delete checkpoint", "remove a checkpoint created with the 'checkpoint' command", invoke_delete_checkpoint); string invoke_info_checkpoints(GdbServer& gdb_server, Task*, const vector&) { if (gdb_server.checkpoints.size() == 0) { return "No checkpoints."; } string out = "ID\tWhen\tWhere"; for (auto& c : gdb_server.checkpoints) { out += string("\n") + to_string(c.first) + "\t" + to_string(c.second.mark.time()) + "\t" + c.second.where; } return out; } static SimpleGdbCommand info_checkpoints( "info checkpoints", "list all checkpoints created with the 'checkpoint' command", invoke_info_checkpoints); /*static*/ void GdbCommand::init_auto_args() { checkpoint.add_auto_arg("rr-where"); } } // namespace rr rr-5.5.0/src/GdbCommand.h000066400000000000000000000041751412202446200151030ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_COMMAND_H_ #define RR_GDB_COMMAND_H_ #include "GdbCommandHandler.h" #include "GdbServer.h" #include #include #include namespace rr { class GdbCommand { protected: GdbCommand(const std::string& cmd_name, const std::string& documentation) : cmd_name(cmd_name), documentation(documentation) { GdbCommandHandler::register_command(*this); } public: 
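  // A minimal sketch of defining a new command via the SimpleGdbCommand
  // subclass below (a hypothetical command, mirroring the ones defined in
  // GdbCommand.cc); constructing the static object registers it with
  // GdbCommandHandler through this base-class constructor:
  //
  //   static SimpleGdbCommand hello(
  //       "hello", "Print a greeting.",
  //       [](GdbServer&, Task*, const std::vector<std::string>&) {
  //         return std::string("hello from rr\n");
  //       });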
  virtual ~GdbCommand() {}

  const std::string& name() const { return cmd_name; }
  const std::string& docs() const { return documentation; }

  /**
   * Handle the rr command and return a string response to be echoed
   * to the user.
   *
   * NOTE: args[0] is the command name
   */
  virtual std::string invoke(GdbServer& gdb_server, Task* t,
                             const std::vector<std::string>& args) = 0;

  /**
   * When called, gdb will automatically run gdb.execute() on this string and
   * pass it as an argument to the rr command. This is useful to pass gdb
   * state alongside the command invocation.
   */
  void add_auto_arg(const std::string& auto_arg) {
    cmd_auto_args.push_back(auto_arg);
  }
  const std::vector<std::string>& auto_args() const { return cmd_auto_args; }

  /**
   * Set up all the automatic auto_args for our commands.
   */
  static void init_auto_args();

private:
  const std::string cmd_name;
  const std::string documentation;
  std::vector<std::string> cmd_auto_args;
};

class SimpleGdbCommand : public GdbCommand {
public:
  SimpleGdbCommand(
      const std::string& cmd_name, const std::string& documentation,
      const std::function<std::string(
          GdbServer&, Task* t, const std::vector<std::string>&)>& invoker)
      : GdbCommand(cmd_name, documentation), invoker(invoker) {}

  virtual std::string invoke(GdbServer& gdb_server, Task* t,
                             const std::vector<std::string>& args) override {
    return invoker(gdb_server, t, args);
  }

  std::function<std::string(GdbServer&, Task* t,
                            const std::vector<std::string>&)>
      invoker;
};

} // namespace rr

#endif
gdb.write(rv) RRWhere() class RRCmd(gdb.Command): def __init__(self, name, auto_args): gdb.Command.__init__(self, name, gdb.COMMAND_USER, gdb.COMPLETE_NONE, False) self.cmd_name = name self.auto_args = auto_args def invoke(self, arg, from_tty): args = gdb.string_to_argv(arg) self.rr_cmd(args) def rr_cmd(self, args): cmd_prefix = "maint packet qRRCmd:" + gdb_escape(self.cmd_name) argStr = "" for auto_arg in self.auto_args: argStr += ":" + gdb_escape(gdb.execute(auto_arg, to_string=True)) for arg in args: argStr += ":" + gdb_escape(arg) rv = gdb.execute(cmd_prefix + argStr, to_string=True); rv_match = re.search('received: "(.*)"', rv, re.MULTILINE); if not rv_match: gdb.write("Response error: " + rv) return response = gdb_unescape(rv_match.group(1)) gdb.write(response) def history_push(p): gdb.execute("rr-history-push", to_string=True) rr_suppress_run_hook = False class RRHookRun(gdb.Command): def __init__(self): gdb.Command.__init__(self, 'rr-hook-run', gdb.COMMAND_USER, gdb.COMPLETE_NONE, False) def invoke(self, arg, from_tty): thread = int(gdb.parse_and_eval("$_thread")) if thread != 0 and not rr_suppress_run_hook: gdb.execute("stepi") class RRSetSuppressRunHook(gdb.Command): def __init__(self): gdb.Command.__init__(self, 'rr-set-suppress-run-hook', gdb.COMMAND_USER, gdb.COMPLETE_NONE, False) def invoke(self, arg, from_tty): rr_suppress_run_hook = arg == '1' RRHookRun() RRSetSuppressRunHook() #Automatically push an history entry when the program execution stops #(signal, breakpoint).This is fired before an interactive prompt is shown. #Disabled for now since it's not fully working. #gdb.events.stop.connect(history_push) end )Delimiter"); if (gdb_command_list) { for (auto& it : *gdb_command_list) { ss << gdb_macro_binding(*it); } } ss << string(R"Delimiter( define hookpost-back frame end define hookpost-forward frame end )Delimiter"); return ss.str(); } /*static*/ GdbCommand* GdbCommandHandler::command_for_name(const string& name) { if (!gdb_command_list) { return nullptr; } for (auto& it : *gdb_command_list) { if (it->name() == name) { return it; } } return nullptr; } void GdbCommandHandler::register_command(GdbCommand& cmd) { LOG(debug) << "registering command: " << cmd.name(); if (!gdb_command_list) { gdb_command_list = new vector(); } gdb_command_list->push_back(&cmd); } // applies the simplest two hex character by byte encoding static string gdb_escape(const string& str) { stringstream ss; ss << hex; const size_t len = str.size(); const char *data = str.data(); for (size_t i = 0; i < len; i++) { int chr = data[i]; if (chr < 16) { ss << "0"; } ss << chr; } return ss.str(); } // undo the two hex character byte encoding, // in case of error returns an empty string static string gdb_unescape(const string& str) { const size_t len = str.size(); // check for unexpected string length if (len % 2) { return ""; } stringstream ss; for (size_t i = 0; i < len; i += 2) { string substr = str.substr(i, 2); const char *hex_str = substr.c_str(); char *ptr = nullptr; ss << (char)strtoul(hex_str, &ptr, 16); // check for unexpected character if (*ptr) { return ""; } } return ss.str(); } static vector parse_cmd(string& str) { vector args; size_t pos = 0; string delimiter = ":"; while ((pos = str.find(delimiter)) != string::npos) { args.push_back(gdb_unescape(str.substr(0, pos))); str.erase(0, pos + delimiter.length()); } args.push_back(gdb_unescape(str)); return args; } /* static */ string GdbCommandHandler::process_command(GdbServer& gdb_server, Task* t, string payload) { const vector args = 
parse_cmd(payload); GdbCommand* cmd = command_for_name(args[0]); if (!cmd) { return gdb_escape(string() + "Command '" + args[0] + "' not found.\n"); } LOG(debug) << "invoking command: " << cmd->name(); string resp = cmd->invoke(gdb_server, t, args); if (resp == GdbCommandHandler::cmd_end_diversion()) { LOG(debug) << "cmd must run outside of diversion (" << resp << ")"; return resp; } LOG(debug) << "cmd response: " << resp; return gdb_escape(resp + "\n"); } } // namespace rr rr-5.5.0/src/GdbCommandHandler.h000066400000000000000000000020541412202446200163730ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_COMMAND_HANDLER_H_ #define RR_GDB_COMMAND_HANDLER_H_ #include namespace rr { class GdbCommand; class GdbServer; class Task; class GdbCommandHandler { public: // Declare any registered command with supporting // wrapper code. static std::string gdb_macros(); static void register_command(GdbCommand& cmd); /** * Process an incoming GDB payload of the following form: * :::... * * NOTE: RR Command are typically sent with the qRRCmd: prefix which * should of been striped already. */ static std::string process_command(GdbServer& gdb_server, Task* t, std::string payload); static GdbCommand* command_for_name(const std::string& name); /** * Special return value for commands that immediatly end a diversion session */ static std::string cmd_end_diversion() { return std::string("RRCmd_EndDiversion"); } private: }; } // namespace rr #endif rr-5.5.0/src/GdbConnection.cc000066400000000000000000001427211412202446200157620ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #define REVERSE_EXECUTION /** * Much of this implementation is based on the documentation at * * http://sourceware.org/gdb/onlinedocs/gdb/Packets.html */ #include "GdbConnection.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "GdbCommandHandler.h" #include "ReplaySession.h" #include "ScopedFd.h" #include "core.h" #include "log.h" using namespace std; namespace rr { static const char INTERRUPT_CHAR = '\x03'; #define UNHANDLED_REQ() \ write_packet(""); \ LOG(info) const GdbThreadId GdbThreadId::ANY(0, 0); const GdbThreadId GdbThreadId::ALL(-1, -1); #ifdef DEBUG static bool request_needs_immediate_response(const GdbRequest* req) { switch (req->type) { case DREQ_NONE: case DREQ_CONT: return false; default: return true; } } #endif GdbConnection::GdbConnection(pid_t tgid, const Features& features) : tgid(tgid), cpu_features_(0), no_ack(false), features_(features), connection_alive_(true) { #ifndef REVERSE_EXECUTION features_.reverse_execution = false; #endif } void GdbConnection::await_debugger(ScopedFd& listen_fd) { sock_fd = ScopedFd(accept(listen_fd, nullptr, nullptr)); // We might restart this debugging session, so don't set the // socket fd CLOEXEC. } /** * Poll for data to or from gdb, waiting |timeoutMs|. 0 means "don't * wait", and -1 means "wait forever". Return true if data is ready. 
*/ static bool poll_socket(const ScopedFd& sock_fd, short events, int timeoutMs) { struct pollfd pfd; memset(&pfd, 0, sizeof(pfd)); pfd.fd = sock_fd; pfd.events = events; int ret = poll(&pfd, 1, timeoutMs); if (ret < 0 && errno != EINTR) { LOG(info) << "gdb socket has been closed"; } return ret > 0; } static bool poll_incoming(const ScopedFd& sock_fd, int timeoutMs) { return poll_socket(sock_fd, POLLIN /* TODO: |POLLERR */, timeoutMs); } static void poll_outgoing(const ScopedFd& sock_fd, int timeoutMs) { poll_socket(sock_fd, POLLOUT /* TODO: |POLLERR */, timeoutMs); } /** * read() incoming data exactly one time, successfully. May block. */ void GdbConnection::read_data_once() { ssize_t nread; /* Wait until there's data, instead of busy-looping on * EAGAIN. */ poll_incoming(sock_fd, -1 /* wait forever */); uint8_t buf[4096]; nread = read(sock_fd, buf, sizeof(buf)); if (nread <= 0) { LOG(info) << "Could not read data from gdb socket, " "marking connection as closed"; connection_alive_ = false; } else { inbuf.insert(inbuf.end(), buf, buf + nread); } } void GdbConnection::write_flush() { size_t write_index = 0; outbuf.push_back(0); LOG(debug) << "write_flush: '" << outbuf.data() << "'"; outbuf.pop_back(); while (write_index < outbuf.size()) { ssize_t nwritten; poll_outgoing(sock_fd, -1 /*wait forever*/); nwritten = write(sock_fd, outbuf.data() + write_index, outbuf.size() - write_index); if (nwritten < 0) { LOG(info) << "Could not write data to gdb socket, " "marking connection as closed"; connection_alive_ = false; outbuf.clear(); return; } else { write_index += nwritten; } } outbuf.clear(); } void GdbConnection::write_data_raw(const uint8_t* data, ssize_t len) { outbuf.insert(outbuf.end(), data, data + len); } void GdbConnection::write_hex(unsigned long hex) { char buf[32]; size_t len; len = snprintf(buf, sizeof(buf) - 1, "%02lx", hex); write_data_raw((uint8_t*)buf, len); } void GdbConnection::write_packet_bytes(const uint8_t* data, size_t num_bytes) { uint8_t checksum; size_t i; write_data_raw((uint8_t*)"$", 1); for (i = 0, checksum = 0; i < num_bytes; ++i) { checksum += data[i]; } write_data_raw((uint8_t*)data, num_bytes); write_data_raw((uint8_t*)"#", 1); write_hex(checksum); } void GdbConnection::write_packet(const char* data) { return write_packet_bytes((const uint8_t*)data, strlen(data)); } void GdbConnection::write_binary_packet(const char* pfx, const uint8_t* data, ssize_t num_bytes) { ssize_t pfx_num_chars = strlen(pfx); vector buf; buf.resize(2 * num_bytes + pfx_num_chars); ssize_t buf_num_bytes = 0; int i; memcpy((char*)buf.data(), pfx, pfx_num_chars); buf_num_bytes += pfx_num_chars; for (i = 0; i < num_bytes; ++i) { uint8_t b = data[i]; if (buf_num_bytes + 2 > ssize_t(buf.size())) { break; } switch (b) { case '#': case '$': case '}': case '*': buf.data()[buf_num_bytes++] = '}'; buf.data()[buf_num_bytes++] = b ^ 0x20; break; default: buf.data()[buf_num_bytes++] = b; break; } } LOG(debug) << " ***** NOTE: writing binary data, upcoming debug output may " "be truncated"; return write_packet_bytes(buf.data(), buf_num_bytes); } void GdbConnection::write_hex_bytes_packet(const char* prefix, const uint8_t* bytes, size_t len) { if (prefix[0] == '\0' && 0 == len) { write_packet(""); return; } ssize_t pfx_num_chars = strlen(prefix); vector buf; buf.resize(pfx_num_chars + 2 * len + 1); memcpy(buf.data(), prefix, pfx_num_chars); for (size_t i = 0; i < len; ++i) { unsigned long b = bytes[i]; snprintf(&buf.data()[pfx_num_chars + 2 * i], 3, "%02lx", b); } write_packet(buf.data()); } void 
GdbConnection::write_hex_bytes_packet(const uint8_t* bytes, size_t len) { write_hex_bytes_packet("", bytes, len); } static void parser_assert(bool cond) { if (!cond) { fputs("Failed to parse gdb request\n", stderr); DEBUG_ASSERT(false); exit(2); } } static string decode_ascii_encoded_hex_str(const char* encoded) { ssize_t enc_len = strlen(encoded); parser_assert(enc_len % 2 == 0); string str; for (int i = 0; i < enc_len / 2; ++i) { char enc_byte[] = { encoded[2 * i], encoded[2 * i + 1], '\0' }; char* endp; int c = strtol(enc_byte, &endp, 16); parser_assert(c < 128); str += static_cast(c); } return str; } bool GdbConnection::skip_to_packet_start() { ssize_t end = -1; /* XXX we want memcspn() here ... */ for (size_t i = 0; i < inbuf.size(); ++i) { if (inbuf[i] == '$' || inbuf[i] == INTERRUPT_CHAR) { end = i; break; } } if (end < 0) { /* Discard all read bytes, which we don't care * about. */ inbuf.clear(); return false; } /* Discard bytes up to start-of-packet. */ inbuf.erase(inbuf.begin(), inbuf.begin() + end); parser_assert(1 <= inbuf.size()); parser_assert('$' == inbuf[0] || INTERRUPT_CHAR == inbuf[0]); return true; } bool GdbConnection::sniff_packet() { if (skip_to_packet_start()) { /* We've already seen a (possibly partial) packet. */ return true; } parser_assert(inbuf.empty()); return poll_incoming(sock_fd, 0 /*don't wait*/); } void GdbConnection::read_packet() { /* Read and discard bytes until we see the start of a * packet. * * NB: we're ignoring "+/-" responses from gdb. There doesn't * seem to be any sane reason why we would send a damaged * packet to gdb over TCP, then see a "-" reply from gdb and * somehow magically fix our bug that led to the malformed * packet in the first place. */ while (!skip_to_packet_start() && connection_alive_) { read_data_once(); } if (!connection_alive_) { return; } if (inbuf[0] == INTERRUPT_CHAR) { /* Interrupts are kind of an ugly duckling in the gdb * protocol ... */ packetend = 1; return; } /* Read until we see end-of-packet. */ size_t checkedlen = 0; while (true) { uint8_t* p = (uint8_t*)memchr(inbuf.data() + checkedlen, '#', inbuf.size() - checkedlen); if (p) { packetend = p - inbuf.data(); break; } checkedlen = inbuf.size(); read_data_once(); if (!connection_alive_) { return; } } /* NB: we're ignoring the gdb packet checksums here too. If * gdb is corrupted enough to garble a checksum over TCP, it's * not really clear why asking for the packet again might make * the bug go away. */ parser_assert('$' == inbuf[0] && packetend < inbuf.size()); /* Acknowledge receipt of the packet. */ if (!no_ack) { write_data_raw((uint8_t*)"+", 1); write_flush(); } } static void read_binary_data(const uint8_t* payload, const uint8_t* payload_end, vector& data) { data.clear(); while (payload < payload_end) { uint8_t b = *payload++; if ('}' == b) { parser_assert(payload < payload_end); b = 0x20 ^ *payload++; } data.push_back(b); } } /** * Parse and return a gdb thread-id from |str|. |endptr| points to * the character just after the last character in the thread-id. It * may be nullptr. */ static GdbThreadId parse_threadid(const char* str, char** endptr) { GdbThreadId t; char* endp; bool multiprocess = false; if ('p' == *str) { multiprocess = true; ++str; } t.pid = strtol(str, &endp, 16); parser_assert(endp); if ('\0' == *endp) { if (multiprocess) { t.tid = -1; } else { t.tid = t.pid; t.pid = -1; } *endptr = endp; return t; } parser_assert('.' 
== *endp); str = endp + 1; t.tid = strtol(str, &endp, 16); *endptr = endp; return t; } void GdbConnection::write_xfer_response(const void* data, size_t size, uint64_t offset, uint64_t len) { if (offset > size) { write_packet("E01"); return; } if (offset == size) { write_packet("l"); return; } if (offset + len < size) { write_binary_packet("m", static_cast(data) + offset, len); return; } write_binary_packet("l", static_cast(data) + offset, size - offset); } static string read_target_desc(const char* file_name) { string path = resource_path() + "share/rr/" + string(file_name); stringstream ss; FILE* f = fopen(path.c_str(), "r"); DEBUG_ASSERT(f); while (true) { int ch = getc(f); if (ch == EOF) { break; } ss << (char)ch; } fclose(f); return ss.str(); } static const char* target_description_name(uint32_t cpu_features) { // This doesn't scale, but it's what gdb does... switch (cpu_features) { case 0: return "i386-linux.xml"; case GdbConnection::CPU_X86_64: return "amd64-linux.xml"; case GdbConnection::CPU_AVX: return "i386-avx-linux.xml"; case GdbConnection::CPU_X86_64 | GdbConnection::CPU_AVX: return "amd64-avx-linux.xml"; case GdbConnection::CPU_AARCH64: return "aarch64-core.xml"; default: FATAL() << "Unknown features"; return nullptr; } } bool GdbConnection::xfer(const char* name, char* args) { const char* mode = args; args = strchr(args, ':'); parser_assert(args); *args++ = '\0'; if (strcmp(mode, "read") && strcmp(mode, "write")) { write_packet(""); return false; } const char* annex = args; args = strchr(args, ':'); parser_assert(args); *args++ = '\0'; uint64_t offset = strtoul(args, &args, 16); uint64_t len = 0; if (!strcmp(mode, "read")) { parser_assert(',' == *args++); len = strtoul(args, &args, 16); parser_assert(!*args); } else { parser_assert(*args == ':'); ++args; } LOG(debug) << "gdb asks us to transfer " << name << " mode=" << mode << ", annex=" << annex << ", offset=" << offset << " len=" << len; if (!strcmp(name, "auxv")) { if (strcmp(annex, "")) { write_packet("E00"); return false; } if (strcmp(mode, "read")) { write_packet(""); return false; } req = GdbRequest(DREQ_GET_AUXV); req.target = query_thread; // XXX handle offset/len here! return true; } if (!strcmp(name, "exec-file")) { if (strcmp(mode, "read")) { write_packet(""); return false; } req = GdbRequest(DREQ_GET_EXEC_FILE); req.target.pid = req.target.tid = strtoul(annex, nullptr, 16); // XXX handle offset/len here! return true; } if (!strcmp(name, "siginfo")) { if (strcmp(annex, "")) { write_packet("E00"); return false; } if (!strcmp(mode, "read")) { req = GdbRequest(DREQ_READ_SIGINFO); req.target = query_thread; req.mem().addr = offset; req.mem().len = len; return true; } req = GdbRequest(DREQ_WRITE_SIGINFO); req.target = query_thread; return true; } if (!strcmp(name, "features")) { if (strcmp(mode, "read")) { write_packet(""); return false; } string target_desc = read_target_desc((strcmp(annex, "") && strcmp(annex, "target.xml")) ? annex : target_description_name(cpu_features_)); write_xfer_response(target_desc.c_str(), target_desc.size(), offset, len); return false; } write_packet(""); return false; } /** * Format |value| into |buf| in the manner gdb expects. |buf| must * point at a buffer with at least |1 + 2*DBG_MAX_REG_SIZE| bytes * available. Fewer bytes than that may be written, but |buf| is * guaranteed to be null-terminated. 
*/ static size_t print_reg_value(const GdbRegisterValue& reg, char* buf) { parser_assert(reg.size <= GdbRegisterValue::MAX_SIZE); if (reg.defined) { /* gdb wants the register value in native endianness. * reg.value read in native endianness is exactly that. */ for (size_t i = 0; i < reg.size; ++i) { snprintf(&buf[2 * i], 3, "%02lx", (unsigned long)reg.value[i]); } } else { for (size_t i = 0; i < reg.size; ++i) { strcpy(&buf[2 * i], "xx"); } } return reg.size * 2; } /** * Read the encoded register value in |strp| into |reg|. |strp| may * be mutated. */ static void read_reg_value(char** strp, GdbRegisterValue* reg) { char* str = *strp; if ('x' == str[0]) { reg->defined = false; reg->size = 0; return; } reg->defined = true; reg->size = strlen(str) / 2; for (size_t i = 0; i < reg->size; ++i) { char tmp = str[2]; str[2] = '\0'; reg->value[i] = strtoul(str, &str, 16); parser_assert('\0' == *str); str[0] = tmp; } *strp = str; } bool GdbConnection::query(char* payload) { const char* name; char* args; args = strchr(payload, ':'); if (args) { *args++ = '\0'; } name = payload; if (strstr(name, "RRCmd") == name) { LOG(debug) << "gdb requests rr cmd: " << name; req = GdbRequest(DREQ_RR_CMD); req.text_ = args; return true; } if (!strcmp(name, "C")) { LOG(debug) << "gdb requests current thread ID"; req = GdbRequest(DREQ_GET_CURRENT_THREAD); return true; } if (!strcmp(name, "Attached")) { LOG(debug) << "gdb asks if this is a new or existing process"; /* Tell gdb this is an existing process; it might be * (see emergency_debug()). */ write_packet("1"); return false; } if (!strcmp(name, "fThreadInfo")) { LOG(debug) << "gdb asks for thread list"; req = GdbRequest(DREQ_GET_THREAD_LIST); return true; } if (!strcmp(name, "sThreadInfo")) { write_packet("l"); /* "end of list" */ return false; } if (!strcmp(name, "GetTLSAddr")) { LOG(debug) << "gdb asks for TLS addr"; req = GdbRequest(DREQ_TLS); req.target = parse_threadid(args, &args); parser_assert(*args == ','); ++args; size_t offset = strtoul(args, &args, 16); parser_assert(*args == ','); ++args; remote_ptr load_module = strtoul(args, &args, 16); parser_assert(*args == '\0'); req.tls().offset = offset; req.tls().load_module = load_module; return true; } if (!strcmp(name, "Offsets")) { LOG(debug) << "gdb asks for section offsets"; req = GdbRequest(DREQ_GET_OFFSETS); req.target = query_thread; return true; } if ('P' == name[0]) { /* The docs say not to use this packet ... 
*/ write_packet(""); return false; } if (!strcmp(name, "Supported")) { /* TODO process these */ LOG(debug) << "gdb supports " << args; multiprocess_supported_ = strstr(args, "multiprocess+") != nullptr; hwbreak_supported_ = strstr(args, "hwbreak+") != nullptr; swbreak_supported_ = strstr(args, "swbreak+") != nullptr; stringstream supported; // Encourage gdb to use very large packets since we support any packet size supported << "PacketSize=1048576" ";QStartNoAckMode+" ";qXfer:features:read+" ";qXfer:auxv:read+" ";qXfer:exec-file:read+" ";qXfer:siginfo:read+" ";qXfer:siginfo:write+" ";multiprocess+" ";hwbreak+" ";swbreak+" ";ConditionalBreakpoints+" ";vContSupported+"; if (features().reverse_execution) { supported << ";ReverseContinue+" ";ReverseStep+"; } write_packet(supported.str().c_str()); return false; } if (!strcmp(name, "Symbol")) { LOG(debug) << "gdb is ready for symbol lookups"; const char* colon = strchr(args, ':'); parser_assert(colon != nullptr); req = GdbRequest(DREQ_QSYMBOL); if (*args == ':') { req.sym().has_address = false; } else { req.sym().has_address = true; req.sym().address = strtoul(args, &args, 16); } parser_assert(*args == ':'); ++args; req.sym().name = decode_ascii_encoded_hex_str(args); return true; } if (strstr(name, "ThreadExtraInfo") == name) { // ThreadExtraInfo is a special snowflake that // delimits its args with ','. parser_assert(!args); args = payload; args = 1 + strchr(args, ',' /*sic*/); req = GdbRequest(DREQ_GET_THREAD_EXTRA_INFO); req.target = parse_threadid(args, &args); parser_assert('\0' == *args); return true; } if (!strcmp(name, "TStatus")) { LOG(debug) << "gdb asks for trace status"; /* XXX from the docs, it appears that we should reply * with "T0" here. But if we do, gdb keeps bothering * us with trace queries. So pretend we don't know * what it's talking about. 
*/ write_packet(""); return false; } if (!strcmp(name, "Xfer")) { name = args; args = strchr(args, ':'); parser_assert(args); *args++ = '\0'; return xfer(name, args); } if (!strcmp(name, "Search")) { name = args; args = strchr(args, ':'); if (args) { *args++ = '\0'; } if (!strcmp(name, "memory") && args) { req = GdbRequest(DREQ_SEARCH_MEM); req.target = query_thread; req.mem().addr = strtoul(args, &args, 16); parser_assert(';' == *args++); req.mem().len = strtoull(args, &args, 16); parser_assert(';' == *args++); read_binary_data((const uint8_t*)args, inbuf.data() + packetend, req.mem().data); LOG(debug) << "gdb searching memory (addr=" << HEX(req.mem().addr) << ", len=" << req.mem().len << ")"; return true; } write_packet(""); return false; } UNHANDLED_REQ() << "Unhandled gdb query: q" << name; return false; } bool GdbConnection::set_var(char* payload) { const char* name; char* args; args = strchr(payload, ':'); if (args) { *args++ = '\0'; } name = payload; if (!strcmp(name, "StartNoAckMode")) { write_packet("OK"); no_ack = true; return false; } UNHANDLED_REQ() << "Unhandled gdb set: Q" << name; return false; } void GdbConnection::consume_request() { req = GdbRequest(); write_flush(); } bool GdbConnection::process_bpacket(char* payload) { if (strcmp(payload, "c") == 0) { req = GdbRequest(DREQ_CONT); req.cont().run_direction = RUN_BACKWARD; req.cont().actions.push_back(GdbContAction(ACTION_CONTINUE, resume_thread)); return true; } else if (strcmp(payload, "s") == 0) { req = GdbRequest(DREQ_CONT); req.cont().run_direction = RUN_BACKWARD; req.cont().actions.push_back(GdbContAction(ACTION_STEP, resume_thread)); return true; } else { UNHANDLED_REQ() << "Unhandled gdb bpacket: b" << payload; return false; } } static int gdb_open_flags_to_system_flags(int64_t flags) { int ret; switch (flags & 3) { case 0: ret = O_RDONLY; break; case 1: ret = O_WRONLY; break; case 2: ret = O_RDWR; break; default: parser_assert(false); return 0; } parser_assert(!(flags & ~int64_t(3 | 0x8 | 0x200 | 0x400 | 0x800))); if (flags & 0x8) { ret |= O_APPEND; } if (flags & 0x200) { ret |= O_CREAT; } if (flags & 0x400) { ret |= O_TRUNC; } if (flags & 0x800) { ret |= O_EXCL; } return ret; } bool GdbConnection::process_vpacket(char* payload) { const char* name; char* args; args = strchr(payload, ';'); if (args) { *args++ = '\0'; } name = payload; if (!strcmp("Cont", name)) { vector actions; bool has_default_action = false; GdbContAction default_action; while (args) { char* cmd = args; while (*args != ':' && *args != ';') { if (!*args) { args = nullptr; break; } ++args; } bool is_default = true; GdbThreadId target; if (args) { if (*args == ':') { is_default = false; *args = '\0'; target = parse_threadid(args + 1, &args); } args = strchr(args, ';'); if (args) { *args = '\0'; ++args; } } GdbActionType action; int signal_to_deliver = 0; char* endptr = NULL; switch (cmd[0]) { case 'C': action = ACTION_CONTINUE; signal_to_deliver = strtol(cmd + 1, &endptr, 16); break; case 'c': action = ACTION_CONTINUE; break; case 'S': action = ACTION_STEP; signal_to_deliver = strtol(cmd + 1, &cmd, 16); break; case 's': action = ACTION_STEP; break; default: UNHANDLED_REQ() << "Unhandled vCont command " << cmd << "(" << args << ")"; return false; } if (endptr && *endptr) { UNHANDLED_REQ() << "Unhandled vCont command parameters " << cmd; return false; } if (is_default) { if (has_default_action) { UNHANDLED_REQ() << "Unhandled vCont command with multiple default actions"; return false; } has_default_action = true; default_action = 
GdbContAction(action, GdbThreadId::ALL, signal_to_deliver); } else { actions.push_back(GdbContAction(action, target, signal_to_deliver)); } } if (has_default_action) { actions.push_back(default_action); } req = GdbRequest(DREQ_CONT); req.cont().run_direction = RUN_FORWARD; req.cont().actions = move(actions); return true; } if (!strcmp("Cont?", name)) { LOG(debug) << "gdb queries which continue commands we support"; write_packet("vCont;c;C;s;S;"); return false; } if (!strcmp("Kill", name)) { // We can't kill tracees or replay can diverge. We // assume that this kill request is being made because // a "vRun" restart is coming right up. We know how // to implement vRun, so we'll ignore this one. LOG(debug) << "gdb asks us to kill tracee(s); ignoring"; write_packet("OK"); return false; } if (!strcmp("Run", name)) { req = GdbRequest(DREQ_RESTART); const char* filename = args; args = strchr(args, ';'); if (args) { *args++ = '\0'; } if (strlen(filename)) { FATAL() << "gdb wants us to run the exe image `" << filename << "', but we don't support that."; } if (!args) { req.restart().type = RESTART_FROM_PREVIOUS; return true; } const char* arg1 = args; args = strchr(args, ';'); if (args) { *args++ = 0; LOG(debug) << "Ignoring extra parameters " << args; } string event_str = decode_ascii_encoded_hex_str(arg1); char* endp; if (event_str[0] == 'c') { int64_t param = strtoll(event_str.c_str() + 1, &endp, 0); req.restart().type = RESTART_FROM_CHECKPOINT; req.restart().param_str = event_str.substr(1); req.restart().param = param; LOG(debug) << "next replayer restarting from checkpoint " << param; } else if (event_str[0] == 't') { int64_t param = strtoll(event_str.c_str() + 1, &endp, 0); req.restart().type = RESTART_FROM_TICKS; req.restart().param_str = event_str.substr(1); req.restart().param = param; LOG(debug) << "next replayer restarting from tick count " << param; } else { req.restart().type = RESTART_FROM_EVENT; req.restart().param = strtoll(event_str.c_str(), &endp, 0); LOG(debug) << "next replayer advancing to event " << req.restart().param; } if (!endp || *endp != '\0') { LOG(debug) << "Couldn't parse event string `" << event_str << "'" << "; restarting from previous"; req.restart().type = RESTART_FROM_PREVIOUS; req.restart().param = -1; } return true; } if (name == strstr(name, "File:")) { char* operation = payload + 5; if (operation == strstr(operation, "open:")) { char* file_name_end = strchr(operation + 5, ','); parser_assert(file_name_end != NULL); *file_name_end = 0; req = GdbRequest(DREQ_FILE_OPEN); req.file_open().file_name = decode_ascii_encoded_hex_str(operation + 5); char* flags_end; int64_t flags = strtol(file_name_end + 1, &flags_end, 16); parser_assert(*flags_end == ','); req.file_open().flags = gdb_open_flags_to_system_flags(flags); char* mode_end; int64_t mode = strtol(flags_end + 1, &mode_end, 16); parser_assert(*mode_end == 0); parser_assert((mode & ~(int64_t)0777) == 0); req.file_open().mode = mode; return true; } else if (operation == strstr(operation, "close:")) { char* endptr; int64_t fd = strtol(operation + 6, &endptr, 16); parser_assert(*endptr == 0); req = GdbRequest(DREQ_FILE_CLOSE); req.file_close().fd = fd; parser_assert(req.file_close().fd == fd); return true; } else if (operation == strstr(operation, "pread:")) { char* fd_end; int64_t fd = strtol(operation + 6, &fd_end, 16); parser_assert(*fd_end == ','); req = GdbRequest(DREQ_FILE_PREAD); req.file_pread().fd = fd; parser_assert(req.file_pread().fd == fd); char* size_end; int64_t size = strtol(fd_end + 1, &size_end, 
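// Illustrative vFile sequence ("/etc/hosts" hex-encoded; the fd value is
// made up):
//   vFile:setfs:0                          -> F0
//   vFile:open:2f6574632f686f737473,0,0    -> F5
//   vFile:pread:5,4000,0                   -> F<len>;<binary data>
//   vFile:close:5                          -> F0
// The flags/mode fields use gdb's File-I/O numbering, which is why they
// are translated by gdb_open_flags_to_system_flags() above.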
16); parser_assert(*size_end == ','); parser_assert(size >= 0); req.file_pread().size = size; char* offset_end; int64_t offset = strtol(size_end + 1, &offset_end, 16); parser_assert(*offset_end == 0); parser_assert(offset >= 0); req.file_pread().offset = offset; return true; } else if (operation == strstr(operation, "setfs:")) { char* endptr; int64_t pid = strtol(operation + 6, &endptr, 16); parser_assert(*endptr == 0); req = GdbRequest(DREQ_FILE_SETFS); req.file_setfs().pid = pid; parser_assert(req.file_setfs().pid == pid); return true; } else { write_packet(""); return false; } } UNHANDLED_REQ() << "Unhandled gdb vpacket: v" << name; return false; } static string to_string(const vector& bytes, size_t max_len) { stringstream ss; for (size_t i = 0; i < bytes.size(); ++i) { if (i >= max_len) { ss << "..."; break; } char buf[3]; sprintf(buf, "%02x", bytes[i]); ss << buf; } return ss.str(); } bool GdbConnection::process_packet() { parser_assert( INTERRUPT_CHAR == inbuf[0] || ('$' == inbuf[0] && (uint8_t*)memchr(inbuf.data(), '#', inbuf.size()) == inbuf.data() + packetend)); if (INTERRUPT_CHAR == inbuf[0]) { LOG(debug) << "gdb requests interrupt"; req = GdbRequest(DREQ_INTERRUPT); inbuf.erase(inbuf.begin()); return true; } char request = inbuf[1]; char* payload = (char*)&inbuf[2]; inbuf[packetend] = '\0'; LOG(debug) << "raw request " << request << payload; bool ret; switch (request) { case 'b': ret = process_bpacket(payload); break; case 'c': LOG(debug) << "gdb is asking to continue"; req = GdbRequest(DREQ_CONT); req.cont().run_direction = RUN_FORWARD; req.cont().actions.push_back(GdbContAction(ACTION_CONTINUE)); ret = true; break; case 'D': LOG(debug) << "gdb is detaching from us"; req = GdbRequest(DREQ_DETACH); ret = true; break; case 'g': req = GdbRequest(DREQ_GET_REGS); req.target = query_thread; LOG(debug) << "gdb requests registers"; ret = true; break; case 'G': /* XXX we can't let gdb spray registers in general, * because it may cause replay to diverge. But some * writes may be OK. Let's see how far we can get * with ignoring these requests. */ write_packet(""); ret = false; break; case 'H': if ('c' == *payload++) { req = GdbRequest(DREQ_SET_CONTINUE_THREAD); } else { req = GdbRequest(DREQ_SET_QUERY_THREAD); } req.target = parse_threadid(payload, &payload); parser_assert('\0' == *payload); LOG(debug) << "gdb selecting " << req.target; ret = true; break; case 'k': LOG(info) << "gdb requests kill, exiting"; write_packet("OK"); exit(0); case 'm': req = GdbRequest(DREQ_GET_MEM); req.target = query_thread; req.mem().addr = strtoul(payload, &payload, 16); parser_assert(',' == *payload++); req.mem().len = strtoul(payload, &payload, 16); parser_assert('\0' == *payload); LOG(debug) << "gdb requests memory (addr=" << HEX(req.mem().addr) << ", len=" << req.mem().len << ")"; ret = true; break; case 'M': /* We can't allow the debugger to write arbitrary data * to memory, or the replay may diverge. 
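     * (Concrete illustration: if gdb patched a flag the tracee later
     * branches on, execution could take a path for which the trace has no
     * recorded syscall results, and replay would desync.)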
*/ // TODO: parse this packet in case some oddball gdb // decides to send it instead of 'X' write_packet(""); ret = false; break; case 'p': req = GdbRequest(DREQ_GET_REG); req.target = query_thread; req.reg().name = GdbRegister(strtoul(payload, &payload, 16)); parser_assert('\0' == *payload); LOG(debug) << "gdb requests register value (" << req.reg().name << ")"; ret = true; break; case 'P': req = GdbRequest(DREQ_SET_REG); req.target = query_thread; req.reg().name = GdbRegister(strtoul(payload, &payload, 16)); parser_assert('=' == *payload++); read_reg_value(&payload, &req.reg()); parser_assert('\0' == *payload); ret = true; break; case 'q': ret = query(payload); break; case 'Q': ret = set_var(payload); break; case 'T': req = GdbRequest(DREQ_GET_IS_THREAD_ALIVE); req.target = parse_threadid(payload, &payload); parser_assert('\0' == *payload); LOG(debug) << "gdb wants to know if " << req.target << " is alive"; ret = true; break; case 'v': ret = process_vpacket(payload); break; case 'X': { req = GdbRequest(DREQ_SET_MEM); req.target = query_thread; req.mem().addr = strtoul(payload, &payload, 16); parser_assert(',' == *payload++); req.mem().len = strtoul(payload, &payload, 16); parser_assert(':' == *payload++); read_binary_data((const uint8_t*)payload, inbuf.data() + packetend, req.mem().data); parser_assert(req.mem().len == req.mem().data.size()); LOG(debug) << "gdb setting memory (addr=" << HEX(req.mem().addr) << ", len=" << req.mem().len << ", data=" << to_string(req.mem().data, 32) << ")"; ret = true; break; } case 'z': case 'Z': { int type = strtol(payload, &payload, 16); parser_assert(',' == *payload++); if (!(0 <= type && type <= 4)) { LOG(warn) << "Unknown watch type " << type; write_packet(""); ret = false; break; } req = GdbRequest(GdbRequestType( type + (request == 'Z' ? DREQ_SET_SW_BREAK : DREQ_REMOVE_SW_BREAK))); req.watch().addr = strtoul(payload, &payload, 16); parser_assert(',' == *payload); payload++; req.watch().kind = strtoul(payload, &payload, 16); if (';' == *payload) { ++payload; while ('X' == *payload) { ++payload; int len = strtol(payload, &payload, 16); parser_assert(',' == *payload); payload++; vector bytes; for (int i = 0; i < len; ++i) { parser_assert(payload[0] && payload[1]); char tmp = payload[2]; payload[2] = '\0'; bytes.push_back(strtol(payload, &payload, 16)); parser_assert('\0' == *payload); payload[0] = tmp; } req.watch().conditions.push_back(move(bytes)); } } parser_assert('\0' == *payload); LOG(debug) << "gdb requests " << ('Z' == request ? "set" : "remove") << "breakpoint (addr=" << HEX(req.watch().addr) << ", len=" << req.watch().kind << ")"; ret = true; break; } case '!': LOG(debug) << "gdb requests extended mode"; write_packet("OK"); ret = false; break; case '?': LOG(debug) << "gdb requests stop reason"; req = GdbRequest(DREQ_GET_STOP_REASON); req.target = query_thread; ret = true; break; default: UNHANDLED_REQ() << "Unhandled gdb request '" << inbuf[1] << "'"; ret = false; } /* Erase the newly processed packet from the input buffer. The checksum * after the '#' will be skipped later as we look for the next packet start. */ inbuf.erase(inbuf.begin(), inbuf.begin() + packetend + 1); /* If we processed the request internally, consume it. */ if (!ret) { consume_request(); } return ret; } void GdbConnection::notify_no_such_thread(const GdbRequest& req) { DEBUG_ASSERT(req.target == this->req.target && req.type == this->req.type); /* '10' is the errno ECHILD. 
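   * (Error replies are "E" plus two hex digits on the wire, e.g. "$E10#a6";
   * the protocol assigns the digits no fixed meaning, so rr borrows errno
   * codes.)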
   * We use it as a magic code to
   * notify the user that the thread that was the target of this
   * request has died, and either gdb didn't notice that, or rr
   * didn't notify gdb. Either way, the user should restart
   * their debugging session. */
  LOG(error) << "Targeted thread no longer exists; this is the result of "
                "either a gdb or\n"
                "rr bug. Please restart your debugging session and avoid "
                "doing whatever\n"
                "triggered this bug.";
  write_packet("E10");
  consume_request();
}

void GdbConnection::notify_restart() {
  DEBUG_ASSERT(DREQ_RESTART == req.type);

  // These threads may not exist at the first trace-stop after
  // restart. The gdb client should reset this state, but help
  // it out just in case.
  resume_thread = GdbThreadId::ANY;
  query_thread = GdbThreadId::ANY;

  req = GdbRequest();
}

GdbRequest GdbConnection::get_request() {
  if (DREQ_RESTART == req.type) {
    LOG(debug) << "consuming RESTART request";
    notify_restart();
    // gdb wants to be notified with a stop packet when
    // the process "relaunches". In rr's case, the
    // tracee may be very far away from process creation,
    // but that's OK.
    req = GdbRequest(DREQ_GET_STOP_REASON);
    req.target = query_thread;
    return req;
  }

  /* Can't ask for the next request until you've satisfied the
   * current one, for requests that need an immediate
   * response. */
#ifdef DEBUG
  DEBUG_ASSERT(!request_needs_immediate_response(&req));
#endif

  if (!sniff_packet() && req.is_resume_request()) {
    /* There's no new request data available and gdb has
     * already asked us to resume. OK, do that (or keep
     * doing that) now. */
    return req;
  }

  while (true) {
    /* There's either new request data, or we have nothing
     * to do. Either way, block until we read a complete
     * packet from gdb. */
    read_packet();
    if (!connection_alive_) {
      return req = GdbRequest(DREQ_DETACH);
    }

    if (process_packet()) {
      /* We couldn't process the packet internally,
       * so the target has to do something. */
      return req;
    }
    /* The packet we got was "internal", gdb details.
     * Nothing for the target to do yet. Keep waiting. */
  }
}

void GdbConnection::notify_exit_code(int code) {
  char buf[64];

  DEBUG_ASSERT(req.is_resume_request() || req.type == DREQ_INTERRUPT);

  snprintf(buf, sizeof(buf) - 1, "W%02x", code);
  write_packet(buf);

  consume_request();
}

void GdbConnection::notify_exit_signal(int sig) {
  char buf[64];

  DEBUG_ASSERT(req.is_resume_request() || req.type == DREQ_INTERRUPT);

  snprintf(buf, sizeof(buf) - 1, "X%02x", sig);
  write_packet(buf);

  consume_request();
}

/**
 * Translate linux-x86 |sig| to gdb's internal numbering. Translation
 * made according to gdb/include/gdb/signals.def.
 */
static int to_gdb_signum(int sig) {
  switch (sig) {
    case 0:
      return 0;
    case SIGHUP:
      return 1;
    case SIGINT:
      return 2;
    case SIGQUIT:
      return 3;
    case SIGILL:
      return 4;
    case SIGTRAP:
      return 5;
    case SIGABRT /*case SIGIOT*/:
      return 6;
    case SIGBUS:
      return 10;
    case SIGFPE:
      return 8;
    case SIGKILL:
      return 9;
    case SIGUSR1:
      return 30;
    case SIGSEGV:
      return 11;
    case SIGUSR2:
      return 31;
    case SIGPIPE:
      return 13;
    case SIGALRM:
      return 14;
    case SIGTERM:
      return 15;
    /* gdb hasn't heard of SIGSTKFLT, so this is
     * arbitrarily made up.
     * SIGDANGER just sounds cool. */
    case SIGSTKFLT:
      return 38 /*GDB_SIGNAL_DANGER*/;
    /*case SIGCLD*/
    case SIGCHLD:
      return 20;
    case SIGCONT:
      return 19;
    case SIGSTOP:
      return 17;
    case SIGTSTP:
      return 18;
    case SIGTTIN:
      return 21;
    case SIGTTOU:
      return 22;
    case SIGURG:
      return 16;
    case SIGXCPU:
      return 24;
    case SIGXFSZ:
      return 25;
    case SIGVTALRM:
      return 26;
    case SIGPROF:
      return 27;
    case SIGWINCH:
      return 28;
    /*case SIGPOLL*/
    case SIGIO:
      return 23;
    case SIGPWR:
      return 32;
    case SIGSYS:
      return 12;
    case 32:
      return 77;
    default:
      if (33 <= sig && sig <= 63) {
        /* GDB_SIGNAL_REALTIME_33 is numbered 45, hence this offset. */
        return sig + 12;
      }
      if (64 <= sig && sig <= 127) {
        /* GDB_SIGNAL_REALTIME_64 is numbered 78, hence this offset. */
        return sig + 14;
      }
      LOG(warn) << "Unknown signal " << sig;
      return 143; // GDB_SIGNAL_UNKNOWN
  }
}

void GdbConnection::send_stop_reply_packet(GdbThreadId thread, int sig,
                                           const char *reason) {
  if (sig < 0) {
    write_packet("E01");
    return;
  }
  char buf[PATH_MAX];
  if (multiprocess_supported_) {
    snprintf(buf, sizeof(buf) - 1, "T%02xthread:p%02x.%02x;%s",
             to_gdb_signum(sig), thread.pid, thread.tid, reason);
  } else {
    snprintf(buf, sizeof(buf) - 1, "T%02xthread:%02x;%s", to_gdb_signum(sig),
             thread.tid, reason);
  }

  write_packet(buf);
}

void GdbConnection::notify_stop(GdbThreadId thread, int sig,
                                const char *reason) {
  DEBUG_ASSERT(req.is_resume_request() || req.type == DREQ_INTERRUPT);

  if (tgid != thread.pid) {
    LOG(debug) << "ignoring stop of " << thread
               << " because we're debugging tgid " << tgid;
    // Re-use the existing continue request to advance to
    // the next stop we're willing to tell gdb about.
    return;
  }

  if (!reason) {
    reason = "";
  }
  send_stop_reply_packet(thread, sig, reason);

  // This isn't documented in the gdb remote protocol, but if we
  // don't do this, gdb will sometimes continue to send requests
  // for the previously-stopped thread when it obviously intends
  // to be making requests about the stopped thread.
  // To make things even better, gdb expects different behavior
  // for forward continue/interrupt and reverse continue.
  if (req.is_resume_request() && req.cont().run_direction == RUN_BACKWARD) {
    LOG(debug) << "Setting query/resume_thread to ANY after reverse continue";
    query_thread = resume_thread = GdbThreadId::ANY;
  } else {
    LOG(debug) << "Setting query/resume_thread to " << thread
               << " after forward continue or interrupt";
    query_thread = resume_thread = thread;
  }

  consume_request();
}

void GdbConnection::notify_restart_failed() {
  DEBUG_ASSERT(DREQ_RESTART == req.type);

  // TODO: it's not known by this author whether gdb knows how
  // to recover from a failed "run" request.
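// Illustrative stop replies as assembled by send_stop_reply_packet() above
// (pid/tid/reason values are made up; checksums abbreviated as "xx"):
//   $T05thread:p2b.2c;#xx                SIGTRAP, multiprocess syntax
//   $T0bthread:2c;watch:7ffd1000;#xx     SIGSEGV with a watchpoint reason
// T05/T0b come from to_gdb_signum(): SIGTRAP -> 5, SIGSEGV -> 11 (0x0b).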
write_packet("E01"); consume_request(); } void GdbConnection::reply_get_current_thread(GdbThreadId thread) { DEBUG_ASSERT(DREQ_GET_CURRENT_THREAD == req.type); char buf[1024]; if (multiprocess_supported_) { snprintf(buf, sizeof(buf), "QCp%02x.%02x", thread.pid, thread.tid); } else { snprintf(buf, sizeof(buf), "QC%02x", thread.tid); } write_packet(buf); consume_request(); } void GdbConnection::reply_get_auxv(const vector& auxv) { DEBUG_ASSERT(DREQ_GET_AUXV == req.type); if (!auxv.empty()) { write_binary_packet("l", auxv.data(), auxv.size()); } else { write_packet("E01"); } consume_request(); } void GdbConnection::reply_get_exec_file(const string& exec_file) { DEBUG_ASSERT(DREQ_GET_EXEC_FILE == req.type); if (!exec_file.empty()) { write_binary_packet("l", reinterpret_cast(exec_file.c_str()), exec_file.size()); } else { write_packet("E01"); } consume_request(); } void GdbConnection::reply_get_is_thread_alive(bool alive) { DEBUG_ASSERT(DREQ_GET_IS_THREAD_ALIVE == req.type); write_packet(alive ? "OK" : "E01"); consume_request(); } void GdbConnection::reply_get_thread_extra_info(const string& info) { DEBUG_ASSERT(DREQ_GET_THREAD_EXTRA_INFO == req.type); LOG(debug) << "thread extra info: '" << info.c_str() << "'"; write_hex_bytes_packet((const uint8_t*)info.c_str(), 1 + info.length()); consume_request(); } void GdbConnection::reply_select_thread(bool ok) { DEBUG_ASSERT(DREQ_SET_CONTINUE_THREAD == req.type || DREQ_SET_QUERY_THREAD == req.type); if (ok && DREQ_SET_CONTINUE_THREAD == req.type) { resume_thread = req.target; } else if (ok && DREQ_SET_QUERY_THREAD == req.type) { query_thread = req.target; } write_packet(ok ? "OK" : "E01"); consume_request(); } void GdbConnection::reply_get_mem(const vector& mem) { DEBUG_ASSERT(DREQ_GET_MEM == req.type); DEBUG_ASSERT(mem.size() <= req.mem().len); if (req.mem().len > 0 && mem.size() == 0) { write_packet("E01"); } else { write_hex_bytes_packet(mem.data(), mem.size()); } consume_request(); } void GdbConnection::reply_set_mem(bool ok) { DEBUG_ASSERT(DREQ_SET_MEM == req.type); write_packet(ok ? "OK" : "E01"); consume_request(); } void GdbConnection::reply_search_mem(bool found, remote_ptr addr) { DEBUG_ASSERT(DREQ_SEARCH_MEM == req.type); if (found) { char buf[256]; sprintf(buf, "1,%llx", (long long)addr.as_int()); write_packet(buf); } else { write_packet("0"); } consume_request(); } void GdbConnection::reply_get_offsets(/* TODO */) { DEBUG_ASSERT(DREQ_GET_OFFSETS == req.type); /* XXX FIXME TODO */ write_packet(""); consume_request(); } void GdbConnection::reply_get_reg(const GdbRegisterValue& reg) { char buf[2 * GdbRegisterValue::MAX_SIZE + 1]; DEBUG_ASSERT(DREQ_GET_REG == req.type); print_reg_value(reg, buf); write_packet(buf); consume_request(); } void GdbConnection::reply_get_regs(const vector& file) { std::unique_ptr buf( new char[file.size() * 2 * GdbRegisterValue::MAX_SIZE + 1]); DEBUG_ASSERT(DREQ_GET_REGS == req.type); size_t offset = 0; for (auto& reg : file) { offset += print_reg_value(reg, &buf[offset]); } write_packet(buf.get()); consume_request(); } void GdbConnection::reply_set_reg(bool ok) { DEBUG_ASSERT(DREQ_SET_REG == req.type); // TODO: what happens if we're forced to reply to a // set-register request with |ok = false|, leading us to // pretend not to understand the packet? If, later, an // experimental session needs the set-register request will it // not be sent? // // We can't reply with an error packet here because gdb thinks // that failed set-register requests are catastrophic. write_packet(ok ? 
"OK" : ""); consume_request(); } void GdbConnection::reply_get_stop_reason(GdbThreadId which, int sig) { DEBUG_ASSERT(DREQ_GET_STOP_REASON == req.type); send_stop_reply_packet(which, sig, ""); consume_request(); } void GdbConnection::reply_get_thread_list(const vector& threads) { DEBUG_ASSERT(DREQ_GET_THREAD_LIST == req.type); if (threads.empty()) { write_packet("l"); } else { stringstream sstr; sstr << 'm'; for (size_t i = 0; i < threads.size(); ++i) { const GdbThreadId& t = threads[i]; if (tgid != t.pid) { continue; } if (multiprocess_supported_) { sstr << 'p' << setw(2) << setfill('0') << hex << t.pid << dec << '.' << setw(2) << setfill('0') << hex << t.tid << ','; } else { sstr << setw(2) << setfill('0') << hex << t.tid << ','; } } string str = sstr.str(); /* Overwrite the trailing ',' */ str.back() = 0; write_packet(str.c_str()); } consume_request(); } void GdbConnection::reply_watchpoint_request(bool ok) { DEBUG_ASSERT(DREQ_WATCH_FIRST <= req.type && req.type <= DREQ_WATCH_LAST); write_packet(ok ? "OK" : "E01"); consume_request(); } void GdbConnection::reply_detach() { DEBUG_ASSERT(DREQ_DETACH <= req.type); write_packet("OK"); consume_request(); } void GdbConnection::reply_read_siginfo(const vector& si_bytes) { DEBUG_ASSERT(DREQ_READ_SIGINFO == req.type); if (si_bytes.empty()) { write_packet("E01"); } else { write_binary_packet("l", si_bytes.data(), si_bytes.size()); } consume_request(); } void GdbConnection::reply_write_siginfo(/* TODO*/) { DEBUG_ASSERT(DREQ_WRITE_SIGINFO == req.type); write_packet("E01"); consume_request(); } void GdbConnection::reply_rr_cmd(const std::string& text) { DEBUG_ASSERT(DREQ_RR_CMD == req.type); write_packet(text.c_str()); consume_request(); } void GdbConnection::send_qsymbol(const std::string& name) { DEBUG_ASSERT(DREQ_QSYMBOL == req.type); const void* data = static_cast(name.c_str()); write_hex_bytes_packet("qSymbol:", static_cast(data), name.length()); consume_request(); } void GdbConnection::qsymbols_finished() { DEBUG_ASSERT(DREQ_QSYMBOL == req.type); write_packet("OK"); consume_request(); } void GdbConnection::reply_tls_addr(bool ok, remote_ptr address) { DEBUG_ASSERT(DREQ_TLS == req.type); if (ok) { char buf[256]; sprintf(buf, "%llx", (long long)address.as_int()); write_packet(buf); } else { write_packet("E01"); } consume_request(); } void GdbConnection::reply_setfs(int err) { DEBUG_ASSERT(DREQ_FILE_SETFS == req.type); if (err) { send_file_error_reply(err); } else { write_packet("F0"); } consume_request(); } void GdbConnection::reply_open(int fd, int err) { DEBUG_ASSERT(DREQ_FILE_OPEN == req.type); if (err) { send_file_error_reply(err); } else { char buf[32]; sprintf(buf, "F%x", fd); write_packet(buf); } consume_request(); } void GdbConnection::reply_pread(const uint8_t* bytes, ssize_t len, int err) { DEBUG_ASSERT(DREQ_FILE_PREAD == req.type); if (err) { send_file_error_reply(err); } else { char buf[32]; sprintf(buf, "F%llx;", (long long)len); write_binary_packet(buf, bytes, len); } consume_request(); } void GdbConnection::reply_close(int err) { DEBUG_ASSERT(DREQ_FILE_CLOSE == req.type); if (err) { send_file_error_reply(err); } else { write_packet("F0"); } consume_request(); } void GdbConnection::send_file_error_reply(int system_errno) { int gdb_err; switch (system_errno) { case EPERM: gdb_err = 1; break; case ENOENT: gdb_err = 2; break; case EINTR: gdb_err = 4; break; case EBADF: gdb_err = 9; break; case EACCES: gdb_err = 13; break; case EFAULT: gdb_err = 14; break; case EBUSY: gdb_err = 16; break; case EEXIST: gdb_err = 17; break; case ENODEV: 
gdb_err = 19; break; case ENOTDIR: gdb_err = 20; break; case EISDIR: gdb_err = 21; break; case EINVAL: gdb_err = 22; break; case ENFILE: gdb_err = 23; break; case EMFILE: gdb_err = 24; break; case EFBIG: gdb_err = 27; break; case ENOSPC: gdb_err = 28; break; case ESPIPE: gdb_err = 29; break; case EROFS: gdb_err = 30; break; case ENAMETOOLONG: gdb_err = 91; break; default: gdb_err = 9999; break; } char buf[32]; sprintf(buf, "F-01,%x", gdb_err); write_packet(buf); } bool GdbConnection::is_connection_alive() { return connection_alive_; } } // namespace rr rr-5.5.0/src/GdbConnection.h000066400000000000000000000451471412202446200156300ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_CONNECTION_H_ #define RR_GDB_CONNECTION_H_ #include #include #include #include #include #include #include "GdbRegister.h" #include "Registers.h" #include "ReplaySession.h" #include "ReplayTimeline.h" #include "core.h" namespace rr { /** * Descriptor for task. Note: on linux, we can uniquely identify any thread * by its |tid| (in rr's pid namespace). */ struct GdbThreadId { GdbThreadId(pid_t pid = -1, pid_t tid = -1) : pid(pid), tid(tid) {} pid_t pid; pid_t tid; bool operator==(const GdbThreadId& o) const { return pid == o.pid && tid == o.tid; } static const GdbThreadId ANY; static const GdbThreadId ALL; }; inline std::ostream& operator<<(std::ostream& o, const GdbThreadId& t) { o << t.pid << "." << t.tid; return o; } /** * Represents a possibly-undefined register |name|. |size| indicates how * many bytes of |value| are valid, if any. */ struct GdbRegisterValue { enum { MAX_SIZE = Registers::MAX_SIZE }; GdbRegister name; union { uint8_t value[MAX_SIZE]; uint8_t value1; uint16_t value2; uint32_t value4; uint64_t value8; }; size_t size; bool defined; }; enum GdbRequestType { DREQ_NONE = 0, /* None of these requests have parameters. */ DREQ_GET_CURRENT_THREAD, DREQ_GET_OFFSETS, DREQ_GET_REGS, DREQ_GET_STOP_REASON, DREQ_GET_THREAD_LIST, DREQ_INTERRUPT, DREQ_DETACH, /* These use params.target. */ DREQ_GET_AUXV, DREQ_GET_EXEC_FILE, DREQ_GET_IS_THREAD_ALIVE, DREQ_GET_THREAD_EXTRA_INFO, DREQ_SET_CONTINUE_THREAD, DREQ_SET_QUERY_THREAD, // TLS lookup, uses params.target and params.tls. DREQ_TLS, // gdb wants to write back siginfo_t to a tracee. More // importantly, this packet arrives before an experiment // session for a |call foo()| is about to be torn down. // // TODO: actual interface NYI. DREQ_WRITE_SIGINFO, /* These use params.mem. */ DREQ_GET_MEM, DREQ_SET_MEM, // gdb wants to read the current siginfo_t for a stopped // tracee. More importantly, this packet arrives at the very // beginning of a |call foo()| experiment. // // Uses .mem for offset/len. DREQ_READ_SIGINFO, DREQ_SEARCH_MEM, DREQ_MEM_FIRST = DREQ_GET_MEM, DREQ_MEM_LAST = DREQ_SEARCH_MEM, DREQ_REMOVE_SW_BREAK, DREQ_REMOVE_HW_BREAK, DREQ_REMOVE_WR_WATCH, DREQ_REMOVE_RD_WATCH, DREQ_REMOVE_RDWR_WATCH, DREQ_SET_SW_BREAK, DREQ_SET_HW_BREAK, DREQ_SET_WR_WATCH, DREQ_SET_RD_WATCH, DREQ_SET_RDWR_WATCH, DREQ_WATCH_FIRST = DREQ_REMOVE_SW_BREAK, DREQ_WATCH_LAST = DREQ_SET_RDWR_WATCH, /* Use params.reg. */ DREQ_GET_REG, DREQ_SET_REG, DREQ_REG_FIRST = DREQ_GET_REG, DREQ_REG_LAST = DREQ_SET_REG, /* Use params.cont. */ DREQ_CONT, /* gdb host detaching from stub. No parameters. */ /* Uses params.restart. */ DREQ_RESTART, /* Uses params.text. */ DREQ_RR_CMD, // qSymbol packet, uses params.sym. DREQ_QSYMBOL, // vFile:setfs packet, uses params.file_setfs. 
DREQ_FILE_SETFS, // vFile:open packet, uses params.file_open. DREQ_FILE_OPEN, // vFile:pread packet, uses params.file_pread. DREQ_FILE_PREAD, // vFile:close packet, uses params.file_close. DREQ_FILE_CLOSE, }; enum GdbRestartType { RESTART_FROM_PREVIOUS, RESTART_FROM_EVENT, RESTART_FROM_CHECKPOINT, RESTART_FROM_TICKS }; enum GdbActionType { ACTION_CONTINUE, ACTION_STEP }; struct GdbContAction { GdbContAction(GdbActionType type = ACTION_CONTINUE, const GdbThreadId& target = GdbThreadId::ANY, int signal_to_deliver = 0) : type(type), target(target), signal_to_deliver(signal_to_deliver) {} GdbActionType type; GdbThreadId target; int signal_to_deliver; }; /** * These requests are made by the debugger host and honored in proxy * by rr, the target. */ struct GdbRequest { GdbRequest(GdbRequestType type = DREQ_NONE) : type(type), suppress_debugger_stop(false) {} GdbRequest(const GdbRequest& other) : type(other.type), target(other.target), suppress_debugger_stop(other.suppress_debugger_stop), mem_(other.mem_), watch_(other.watch_), reg_(other.reg_), restart_(other.restart_), cont_(other.cont_), text_(other.text_), tls_(other.tls_), sym_(other.sym_), file_setfs_(other.file_setfs_), file_open_(other.file_open_), file_pread_(other.file_pread_), file_close_(other.file_close_) {} GdbRequest& operator=(const GdbRequest& other) { this->~GdbRequest(); new (this) GdbRequest(other); return *this; } const GdbRequestType type; GdbThreadId target; bool suppress_debugger_stop; struct Mem { uintptr_t addr; size_t len; // For SET_MEM requests, the |len| raw bytes that are to be written. // For SEARCH_MEM requests, the bytes to search for. std::vector data; } mem_; struct Watch { uintptr_t addr; int kind; std::vector> conditions; } watch_; GdbRegisterValue reg_; struct Restart { int64_t param; std::string param_str; GdbRestartType type; } restart_; struct Cont { RunDirection run_direction; std::vector actions; } cont_; std::string text_; struct Tls { size_t offset; remote_ptr load_module; } tls_; struct Symbol { bool has_address; remote_ptr address; std::string name; } sym_; struct FileSetfs { pid_t pid; } file_setfs_; struct FileOpen { std::string file_name; // In system format, not gdb's format int flags; int mode; } file_open_; struct FilePread { int fd; size_t size; uint64_t offset; } file_pread_; struct FileClose { int fd; } file_close_; Mem& mem() { DEBUG_ASSERT(type >= DREQ_MEM_FIRST && type <= DREQ_MEM_LAST); return mem_; } const Mem& mem() const { DEBUG_ASSERT(type >= DREQ_MEM_FIRST && type <= DREQ_MEM_LAST); return mem_; } Watch& watch() { DEBUG_ASSERT(type >= DREQ_WATCH_FIRST && type <= DREQ_WATCH_LAST); return watch_; } const Watch& watch() const { DEBUG_ASSERT(type >= DREQ_WATCH_FIRST && type <= DREQ_WATCH_LAST); return watch_; } GdbRegisterValue& reg() { DEBUG_ASSERT(type >= DREQ_REG_FIRST && type <= DREQ_REG_LAST); return reg_; } const GdbRegisterValue& reg() const { DEBUG_ASSERT(type >= DREQ_REG_FIRST && type <= DREQ_REG_LAST); return reg_; } Restart& restart() { DEBUG_ASSERT(type == DREQ_RESTART); return restart_; } const Restart& restart() const { DEBUG_ASSERT(type == DREQ_RESTART); return restart_; } Cont& cont() { DEBUG_ASSERT(type == DREQ_CONT); return cont_; } const Cont& cont() const { DEBUG_ASSERT(type == DREQ_CONT); return cont_; } const std::string& text() const { DEBUG_ASSERT(type == DREQ_RR_CMD); return text_; } Tls& tls() { DEBUG_ASSERT(type == DREQ_TLS); return tls_; } const Tls& tls() const { DEBUG_ASSERT(type == DREQ_TLS); return tls_; } Symbol& sym() { DEBUG_ASSERT(type == 
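// GdbRequest behaves like a tagged union: every accessor asserts the
// request type before handing out its payload. Hypothetical usage sketch:
//   GdbRequest r(DREQ_FILE_OPEN);
//   r.file_open().mode = 0600;   // fine: type matches
//   r.mem();                     // DEBUG_ASSERT failure: wrong payload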
DREQ_QSYMBOL); return sym_; } const Symbol& sym() const { DEBUG_ASSERT(type == DREQ_QSYMBOL); return sym_; } FileSetfs& file_setfs() { DEBUG_ASSERT(type == DREQ_FILE_SETFS); return file_setfs_; } const FileSetfs& file_setfs() const { DEBUG_ASSERT(type == DREQ_FILE_SETFS); return file_setfs_; } FileOpen& file_open() { DEBUG_ASSERT(type == DREQ_FILE_OPEN); return file_open_; } const FileOpen& file_open() const { DEBUG_ASSERT(type == DREQ_FILE_OPEN); return file_open_; } FilePread& file_pread() { DEBUG_ASSERT(type == DREQ_FILE_PREAD); return file_pread_; } const FilePread& file_pread() const { DEBUG_ASSERT(type == DREQ_FILE_PREAD); return file_pread_; } FileClose& file_close() { DEBUG_ASSERT(type == DREQ_FILE_CLOSE); return file_close_; } const FileClose& file_close() const { DEBUG_ASSERT(type == DREQ_FILE_CLOSE); return file_close_; } /** * Return nonzero if this requires that program execution be resumed * in some way. */ bool is_resume_request() const { return type == DREQ_CONT; } }; /** * This struct wraps up the state of the gdb protocol, so that we can * offer a (mostly) stateless interface to clients. */ class GdbConnection { public: struct Features { Features() : reverse_execution(true) {} bool reverse_execution; }; /** * Call this when the target of |req| is needed to fulfill the * request, but the target is dead. This situation is a symptom of a * gdb or rr bug. */ void notify_no_such_thread(const GdbRequest& req); /** * Finish a DREQ_RESTART request. Should be invoked after replay * restarts and prior GdbConnection has been restored. */ void notify_restart(); /** * Return the current request made by the debugger host, that needs to * be satisfied. This function will block until either there's a * debugger host request that needs a response, or until a request is * made to resume execution of the target. In the latter case, * calling this function multiple times will return an appropriate * resume request each time (see above). * * The target should peek at the debugger request in between execution * steps. A new request may need to be serviced. */ GdbRequest get_request(); /** * Notify the host that this process has exited with |code|. */ void notify_exit_code(int code); /** * Notify the host that this process has exited from |sig|. */ void notify_exit_signal(int sig); /** * Notify the host that a resume request has "finished", i.e., the * target has stopped executing for some reason. |sig| is the signal * that stopped execution, or 0 if execution stopped otherwise. */ void notify_stop(GdbThreadId which, int sig, const char *reason=nullptr); /** Notify the debugger that a restart request failed. */ void notify_restart_failed(); /** * Tell the host that |thread| is the current thread. */ void reply_get_current_thread(GdbThreadId thread); /** * Reply with the target thread's |auxv| pairs. |auxv.empty()| * if there was an error reading the auxiliary vector. */ void reply_get_auxv(const std::vector& auxv); /** * Reply with the target thread's executable file name */ void reply_get_exec_file(const std::string& exec_file); /** * |alive| is true if the requested thread is alive, false if dead. */ void reply_get_is_thread_alive(bool alive); /** * |info| is a string containing data about the request target that * might be relevant to the debugger user. */ void reply_get_thread_extra_info(const std::string& info); /** * |ok| is true if req->target can be selected, false otherwise. */ void reply_select_thread(bool ok); /** * The first |mem.size()| bytes of the request were read into |mem|. 
* |mem.size()| must be less than or equal to the length of the request. */ void reply_get_mem(const std::vector& mem); /** * |ok| is true if a SET_MEM request succeeded, false otherwise. This * function *must* be called whenever a SET_MEM request is made, * regardless of success/failure or special interpretation. */ void reply_set_mem(bool ok); /** * Reply to the DREQ_SEARCH_MEM request. * |found| is true if we found the searched-for bytes starting at address * |addr|. */ void reply_search_mem(bool found, remote_ptr addr); /** * Reply to the DREQ_GET_OFFSETS request. */ void reply_get_offsets(/* TODO */); /** * Send |value| back to the debugger host. |value| may be undefined. */ void reply_get_reg(const GdbRegisterValue& value); /** * Send |file| back to the debugger host. |file| may contain * undefined register values. */ void reply_get_regs(const std::vector& file); /** * Pass |ok = true| iff the requested register was successfully set. */ void reply_set_reg(bool ok); /** * Reply to the DREQ_GET_STOP_REASON request. */ void reply_get_stop_reason(GdbThreadId which, int sig); /** * |threads| contains the list of live threads, of which there are * |len|. */ void reply_get_thread_list(const std::vector& threads); /** * |ok| is true if the request was successfully applied, false if * not. */ void reply_watchpoint_request(bool ok); /** * DREQ_DETACH was processed. * * There's no functional reason to reply to the detach request. * However, some versions of gdb expect a response and time out * awaiting it, wasting developer time. */ void reply_detach(); /** * Pass the siginfo_t and its size (as requested by the debugger) in * |si_bytes| and |num_bytes| if successfully read. Otherwise pass * |si_bytes = nullptr|. */ void reply_read_siginfo(const std::vector& si_bytes); /** * Not yet implemented, but call this after a WRITE_SIGINFO request * anyway. */ void reply_write_siginfo(/* TODO*/); /** * Send a manual text response to a rr cmd (maintenance) packet. */ void reply_rr_cmd(const std::string& text); /** * Send a qSymbol response to gdb, requesting the address of the * symbol |name|. */ void send_qsymbol(const std::string& name); /** * The "all done" response to a qSymbol packet from gdb. */ void qsymbols_finished(); /** * Respond to a qGetTLSAddr packet. If |ok| is true, then respond * with |address|. If |ok| is false, respond with an error. */ void reply_tls_addr(bool ok, remote_ptr address); /** * Respond to a vFile:setfs */ void reply_setfs(int err); /** * Respond to a vFile:open */ void reply_open(int fd, int err); /** * Respond to a vFile:pread */ void reply_pread(const uint8_t* bytes, ssize_t len, int err); /** * Respond to a vFile:close */ void reply_close(int err); /** * Create a checkpoint of the given Session with the given id. Delete the * existing checkpoint with that id if there is one. */ void created_checkpoint(ReplaySession::shr_ptr& checkpoint, int checkpoint_id); /** * Delete the checkpoint with the given id. Silently fail if the checkpoint * does not exist. */ void delete_checkpoint(int checkpoint_id); /** * Get the checkpoint with the given id. Return null if not found. */ ReplaySession::shr_ptr get_checkpoint(int checkpoint_id); /** * Return true if there's a new packet to be read/process (whether * incomplete or not), and false if there isn't one. 
*/ bool sniff_packet(); const Features& features() { return features_; } enum { CPU_X86_64 = 0x1, CPU_AVX = 0x2, CPU_AARCH64 = 0x4 }; void set_cpu_features(uint32_t features) { cpu_features_ = features; } uint32_t cpu_features() const { return cpu_features_; } GdbConnection(pid_t tgid, const Features& features); /** * Wait for a debugger client to connect to |dbg|'s socket. Blocks * indefinitely. */ void await_debugger(ScopedFd& listen_fd); /** * Returns false if the connection has been closed */ bool is_connection_alive(); bool hwbreak_supported() { return hwbreak_supported_; } bool swbreak_supported() { return swbreak_supported_; } private: /** * read() incoming data exactly one time, successfully. May block. */ void read_data_once(); /** * Send all pending output to gdb. May block. */ void write_flush(); void write_data_raw(const uint8_t* data, ssize_t len); void write_hex(unsigned long hex); void write_packet_bytes(const uint8_t* data, size_t num_bytes); void write_packet(const char* data); void write_binary_packet(const char* pfx, const uint8_t* data, ssize_t num_bytes); void write_hex_bytes_packet(const char* prefix, const uint8_t* bytes, size_t len); void write_hex_bytes_packet(const uint8_t* bytes, size_t len); void write_xfer_response(const void* data, size_t size, uint64_t offset, uint64_t len); /** * Consume bytes in the input buffer until start-of-packet ('$') or * the interrupt character is seen. Does not block. Return true if * seen, false if not. */ bool skip_to_packet_start(); /** * Block until the sequence of bytes * * "[^$]*\$[^#]*#.*" * * has been read from the client fd. This is one (or more) gdb * packet(s). */ void read_packet(); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool xfer(const char* name, char* args); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool query(char* payload); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool set_var(char* payload); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool process_vpacket(char* payload); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool process_bpacket(char* payload); /** * Return true if we need to do something in a debugger request, * false if we already handled the packet internally. */ bool process_packet(); void consume_request(); void send_stop_reply_packet(GdbThreadId thread, int sig, const char *reason); void send_file_error_reply(int system_errno); // Current request to be processed. GdbRequest req; // Thread to be resumed. GdbThreadId resume_thread; // Thread for get/set requests. GdbThreadId query_thread; // gdb and rr don't work well together in multi-process and // multi-exe-image debugging scenarios, so we pretend only // this thread group exists when interfacing with gdb pid_t tgid; uint32_t cpu_features_; // true when "no-ack mode" enabled, in which we don't have // to send ack packets back to gdb. This is a huge perf win. 
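  // Framing recap for the fields below (standard remote-protocol behavior):
  // each packet travels as $<payload>#<2-hex-digit checksum>, the checksum
  // being the byte sum of <payload> mod 256. A minimal sender-side sketch,
  // assuming a write(2)-style transport fd:
  //   uint8_t sum = 0;
  //   for (char c : payload) sum += static_cast<uint8_t>(c);
  //   dprintf(fd, "$%s#%02x", payload.c_str(), sum);
  // Until QStartNoAckMode is negotiated (no_ack below), every packet must
  // additionally be answered with '+' (ok) or '-' (resend).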
bool no_ack; ScopedFd sock_fd; std::vector inbuf; /* buffered input from gdb */ size_t packetend; /* index of '#' character */ std::vector outbuf; /* buffered output for gdb */ Features features_; bool connection_alive_; bool multiprocess_supported_; // client supports multiprocess extension bool hwbreak_supported_; // client supports hwbreak extension bool swbreak_supported_; // client supports swbreak extension }; } // namespace rr #endif /* RR_GDB_CONNECTION_H_ */ rr-5.5.0/src/GdbExpression.cc000066400000000000000000000262521412202446200160220ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "GdbExpression.h" #include "GdbServer.h" #include "Task.h" #include "core.h" using namespace std; namespace rr { #define WORKAROUND_GDB_BUGS // Extracted from // https://sourceware.org/gdb/current/onlinedocs/gdb/Bytecode-Descriptions.html enum Opcode { OP_float = 0x01, OP_add = 0x02, OP_sub = 0x03, OP_mul = 0x04, OP_div_signed = 0x05, OP_div_unsigned = 0x06, OP_rem_signed = 0x07, OP_rem_unsigned = 0x08, OP_lsh = 0x09, OP_rsh_signed = 0x0a, OP_rsh_unsigned = 0x0b, OP_trace = 0x0c, OP_trace_quick = 0x0d, OP_log_not = 0x0e, OP_bit_and = 0x0f, OP_bit_or = 0x10, OP_bit_xor = 0x11, OP_bit_not = 0x12, OP_equal = 0x13, OP_less_signed = 0x14, OP_less_unsigned = 0x15, OP_ext = 0x16, OP_ref8 = 0x17, OP_ref16 = 0x18, OP_ref32 = 0x19, OP_ref64 = 0x1a, OP_ref_float = 0x1b, OP_ref_double = 0x1c, OP_ref_long_double = 0x1d, OP_l_to_d = 0x1e, OP_d_to_l = 0x1f, OP_if_goto = 0x20, OP_goto = 0x21, OP_const8 = 0x22, OP_const16 = 0x23, OP_const32 = 0x24, OP_const64 = 0x25, OP_reg = 0x26, OP_end = 0x27, OP_dup = 0x28, OP_pop = 0x29, OP_zero_ext = 0x2a, OP_swap = 0x2b, OP_getv = 0x2c, OP_setv = 0x2d, OP_tracev = 0x2e, OP_tracenz = 0x2f, OP_trace16 = 0x30, OP_pick = 0x32, OP_rot = 0x33, OP_printf = 0x34, }; struct ExpressionState { typedef GdbExpression::Value Value; ExpressionState(const vector& bytecode) : bytecode(bytecode), pc(0), error(false), end(false) {} void set_error() { error = true; } // Methods set error to true if there's an error and return some sentinel // Value. Value pop() { if (stack.empty()) { set_error(); return Value(-1); } Value v = stack.back(); stack.pop_back(); return v; } struct BinaryOperands { BinaryOperands(int64_t a = 0, int64_t b = 0) : a(a), b(b) {} int64_t a; int64_t b; }; BinaryOperands pop_a_b() { int64_t b = pop().i; return BinaryOperands(pop().i, b); } int64_t nonzero(int64_t v) { if (!v) { set_error(); return 1; } return v; } int64_t pop_a() { return pop().i; } void push(int64_t i) { stack.push_back(Value(i)); } template T fetch() { if (pc + sizeof(T) > bytecode.size()) { set_error(); return T(-1); } T v = 0; for (size_t i = 0; i < sizeof(T); ++i) { v = (v << 8) | bytecode[pc + i]; } pc += sizeof(T); return v; } template void load(Task* t) { uint64_t addr = pop().i; if (error) { // Don't do unnecessary syscalls if we're already in an error state. 
return; } bool ok = true; T v = t->read_mem(remote_ptr(addr), &ok); if (!ok) { set_error(); return; } push(v); } void pick(size_t offset) { if (offset >= stack.size()) { set_error(); return; } push(stack[stack.size() - 1 - offset].i); } void step(Task* t) { DEBUG_ASSERT(!error); BinaryOperands operands; switch (fetch()) { case OP_add: operands = pop_a_b(); return push(operands.a + operands.b); case OP_sub: operands = pop_a_b(); return push(operands.a - operands.b); case OP_mul: operands = pop_a_b(); return push(operands.a * operands.b); case OP_div_signed: operands = pop_a_b(); return push(operands.a / nonzero(operands.b)); case OP_div_unsigned: operands = pop_a_b(); return push(uint64_t(operands.a) / uint64_t(nonzero(operands.b))); case OP_rem_signed: operands = pop_a_b(); return push(operands.a % nonzero(operands.b)); case OP_rem_unsigned: operands = pop_a_b(); return push(uint64_t(operands.a) % uint64_t(nonzero(operands.b))); case OP_lsh: operands = pop_a_b(); return push(operands.a << operands.b); case OP_rsh_signed: operands = pop_a_b(); return push(operands.a >> operands.b); case OP_rsh_unsigned: operands = pop_a_b(); return push(uint64_t(operands.a) >> operands.b); case OP_log_not: return push(!pop_a()); case OP_bit_and: operands = pop_a_b(); return push(operands.a & operands.b); case OP_bit_or: operands = pop_a_b(); return push(operands.a | operands.b); case OP_bit_xor: operands = pop_a_b(); return push(operands.a ^ operands.b); case OP_bit_not: return push(~pop_a()); case OP_equal: operands = pop_a_b(); return push(operands.a == operands.b); case OP_less_signed: operands = pop_a_b(); return push(operands.a < operands.b); case OP_less_unsigned: operands = pop_a_b(); return push(uint64_t(operands.a) < uint64_t(operands.b)); case OP_ext: { int64_t n = nonzero(fetch()); if (n >= 64) { return; } int64_t a = pop_a(); int64_t n_mask = (int64_t(1) << n) - 1; int sign_bit = (a >> (n - 1)) & 1; return push((sign_bit * ~n_mask) | (a & n_mask)); } case OP_zero_ext: { int64_t n = fetch(); if (n >= 64) { return; } int64_t a = pop_a(); int64_t n_mask = (int64_t(1) << n) - 1; return push(a & n_mask); } case OP_ref8: return load(t); case OP_ref16: return load(t); case OP_ref32: return load(t); case OP_ref64: return load(t); case OP_dup: return pick(0); case OP_swap: operands = pop_a_b(); push(operands.b); return push(operands.a); case OP_pop: pop_a(); return; case OP_pick: return pick(fetch()); case OP_rot: { int64_t c = pop_a(); int64_t b = pop_a(); int64_t a = pop_a(); push(c); push(b); return push(a); } case OP_if_goto: { uint16_t offset = fetch(); if (pop_a()) { pc = offset; } return; } case OP_goto: pc = fetch(); return; case OP_const8: return push(fetch()); case OP_const16: return push(fetch()); case OP_const32: return push(fetch()); case OP_const64: return push(fetch()); case OP_reg: { GdbRegisterValue v = GdbServer::get_reg(t->regs(), t->extra_regs(), GdbRegister(fetch())); if (!v.defined) { set_error(); return; } switch (v.size) { case 1: return push(v.value1); case 2: return push(v.value2); case 4: return push(v.value4); case 8: return push(v.value8); } set_error(); return; } case OP_end: end = true; return; default: set_error(); return; } } const vector& bytecode; vector stack; size_t pc; bool error; bool end; }; #ifdef WORKAROUND_GDB_BUGS /* https://sourceware.org/bugzilla/show_bug.cgi?id=18617 means that * gdb generates incorrect operands for OP_ext and OP_zero_ext. 
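 * (For instance, for a 16-bit subexpression gdb may emit "ext 64" where
 * "ext 16" was intended, changing the sign-extension result.)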
* We work around this bug by generating all the alternative programs that gdb * maybe should have generated, and evaluating all of them. If they agree on * the result, we return that as the correct result, otherwise we return * failure. */ static int count_variants(int bits) { int result = 1; if (bits > 8) { ++result; } if (bits > 16) { ++result; } if (bits > 32) { ++result; } return result; } template static T fetch(const uint8_t* data, size_t size, size_t pc) { if (pc + sizeof(T) > size) { return T(-1); } T v = 0; for (size_t i = 0; i < sizeof(T); ++i) { v = (v << 8) | data[pc + i]; } return v; } GdbExpression::GdbExpression(const uint8_t* data, size_t size) { vector instruction_starts; instruction_starts.resize(size); fill(instruction_starts.begin(), instruction_starts.end(), false); int64_t num_variants = 1; vector unvisited; unvisited.push_back(0); while (!unvisited.empty()) { size_t pc = unvisited.back(); unvisited.pop_back(); if (pc >= instruction_starts.size() || instruction_starts[pc]) { continue; } instruction_starts[pc] = true; switch (data[pc]) { case OP_ext: case OP_zero_ext: if (pc + 1 < size) { num_variants *= count_variants(data[pc + 1]); if (num_variants > 64) { // Too many variants, giving up on this expression return; } } unvisited.push_back(pc + 2); break; case OP_pick: case OP_const8: unvisited.push_back(pc + 2); break; case OP_if_goto: unvisited.push_back(fetch(data, size, pc + 1)); unvisited.push_back(pc + 3); break; case OP_goto: unvisited.push_back(fetch(data, size, pc + 1)); break; case OP_const16: case OP_reg: unvisited.push_back(pc + 3); break; case OP_const32: unvisited.push_back(pc + 5); break; case OP_const64: unvisited.push_back(pc + 9); break; case OP_end: break; default: unvisited.push_back(pc + 1); break; } } bytecode_variants.push_back(vector(data, data + size)); for (size_t i = 0; i < size; ++i) { if (!instruction_starts[i]) { continue; } if ((data[i] == OP_ext || data[i] == OP_zero_ext) && i + 1 < size) { uint8_t bits = data[i + 1]; vector> variants; for (auto& b : bytecode_variants) { // gdb perhaps should have used a smaller type width here --- 8, 16 or // 32 bits. if (bits > 8) { vector v = b; v[i + 1] = 8; variants.push_back(move(v)); } if (bits > 16) { vector v = b; v[i + 1] = 16; variants.push_back(move(v)); } if (bits > 32) { vector v = b; v[i + 1] = 32; variants.push_back(move(v)); } variants.push_back(move(b)); } bytecode_variants = move(variants); } } } #else GdbExpression::GdbExpression(const uint8_t* data, size_t size) { bytecode_variants.push_back(vector(data, data + size)); } #endif bool GdbExpression::evaluate(Task* t, Value* result) const { if (bytecode_variants.empty()) { return false; } bool first = true; for (auto& b : bytecode_variants) { ExpressionState state(b); for (int steps = 0; !state.end; ++steps) { if (steps >= 10000 || state.error) { return false; } state.step(t); } Value v = state.pop(); if (state.error) { return false; } if (first) { *result = v; first = false; } else if (*result != v) { return false; } } return true; } } // namespace rr rr-5.5.0/src/GdbExpression.h000066400000000000000000000022071412202446200156560ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_EXPRESSION_H_ #define RR_GDB_EXPRESSION_H_ #include #include #include namespace rr { class Task; /** * gdb has a simple bytecode language for writing expressions to be evaluated * in a remote target. This class implements evaluation of such expressions. 
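 * For example, the five bytes "22 2a 16 08 27" decode (per the Opcode enum
 * in GdbExpression.cc) to: const8 42; ext 8; end, an expression that
 * evaluates to 42.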
* See https://sourceware.org/gdb/current/onlinedocs/gdb/Agent-Expressions.html */ class GdbExpression { public: GdbExpression(const uint8_t* data, size_t size); struct Value { Value(int64_t i = 0) : i(i) {} bool operator==(const Value& v) { return i == v.i; } bool operator!=(const Value& v) { return !(*this == v); } int64_t i; }; /** * If evaluation succeeds, store the final result in *result and return true. * Otherwise return false. */ bool evaluate(Task* t, Value* result) const; private: /** * To work around gdb bugs, we may generate and evaluate multiple versions of * the same expression program. */ std::vector> bytecode_variants; }; } // namespace rr #endif // RR_GDB_EXPRESSION_H_ rr-5.5.0/src/GdbInitCommand.cc000066400000000000000000000012211412202446200160520ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Command.h" #include "GdbServer.h" #include "main.h" using namespace std; namespace rr { class GdbInitCommand : public Command { public: virtual int run(vector& args) override; protected: GdbInitCommand(const char* name, const char* help) : Command(name, help) {} static GdbInitCommand singleton; }; GdbInitCommand GdbInitCommand::singleton("gdbinit", " rr gdbinit\n"); int GdbInitCommand::run(vector& args) { while (parse_global_option(args)) { } fputs(GdbServer::init_script().c_str(), stdout); return 0; } } // namespace rr rr-5.5.0/src/GdbRegister.h000066400000000000000000000074641412202446200153150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_REGISTER_H_ #define RR_GDB_REGISTER_H_ namespace rr { /** * This is the register numbering used by GDB. */ enum GdbRegister { DREG_EAX, DREG_ECX, DREG_EDX, DREG_EBX, DREG_ESP, DREG_EBP, DREG_ESI, DREG_EDI, DREG_EIP, DREG_EFLAGS, DREG_CS, DREG_SS, DREG_DS, DREG_ES, DREG_FS, DREG_GS, DREG_FIRST_FXSAVE_REG, DREG_ST0 = DREG_FIRST_FXSAVE_REG, DREG_ST1, DREG_ST2, DREG_ST3, DREG_ST4, DREG_ST5, DREG_ST6, DREG_ST7, // These are the names GDB gives the registers. DREG_FCTRL, DREG_FSTAT, DREG_FTAG, DREG_FISEG, DREG_FIOFF, DREG_FOSEG, DREG_FOOFF, DREG_FOP, DREG_XMM0, DREG_XMM1, DREG_XMM2, DREG_XMM3, DREG_XMM4, DREG_XMM5, DREG_XMM6, DREG_XMM7, DREG_MXCSR, // XXX the last fxsave reg on *x86* DREG_LAST_FXSAVE_REG = DREG_MXCSR, DREG_ORIG_EAX, DREG_YMM0H, DREG_YMM1H, DREG_YMM2H, DREG_YMM3H, DREG_YMM4H, DREG_YMM5H, DREG_YMM6H, DREG_YMM7H, DREG_NUM_LINUX_I386, // Last register we can find in user_regs_struct // (except for orig_eax). DREG_NUM_USER_REGS = DREG_GS + 1, // x86-64 register numbers DREG_RAX = 0, DREG_RBX, DREG_RCX, DREG_RDX, DREG_RSI, DREG_RDI, DREG_RBP, DREG_RSP, DREG_R8, DREG_R9, DREG_R10, DREG_R11, DREG_R12, DREG_R13, DREG_R14, DREG_R15, DREG_RIP, // Things get a little tricky here, because x86-64 has some registers // named identically to its x86 counterpart, but we've used the names // in the x86 register definitions above, and the numbers they need // to represent are different. Hence the unique names here. DREG_64_EFLAGS, DREG_64_CS, DREG_64_SS, DREG_64_DS, DREG_64_ES, DREG_64_FS, DREG_64_GS, DREG_64_FIRST_FXSAVE_REG, DREG_64_ST0 = DREG_64_FIRST_FXSAVE_REG, DREG_64_ST1, DREG_64_ST2, DREG_64_ST3, DREG_64_ST4, DREG_64_ST5, DREG_64_ST6, DREG_64_ST7, // These are the names GDB gives the registers. 
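  // (Wire-format aside, illustrative: 'p'/'P' packets name registers by
  // these indices in hex, so on x86-64 "$p10#d1" requests register
  // 0x10 = DREG_RIP.)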
DREG_64_FCTRL, DREG_64_FSTAT, DREG_64_FTAG, DREG_64_FISEG, DREG_64_FIOFF, DREG_64_FOSEG, DREG_64_FOOFF, DREG_64_FOP, DREG_64_XMM0, DREG_64_XMM1, DREG_64_XMM2, DREG_64_XMM3, DREG_64_XMM4, DREG_64_XMM5, DREG_64_XMM6, DREG_64_XMM7, DREG_64_XMM8, DREG_64_XMM9, DREG_64_XMM10, DREG_64_XMM11, DREG_64_XMM12, DREG_64_XMM13, DREG_64_XMM14, DREG_64_XMM15, DREG_64_MXCSR, DREG_64_LAST_FXSAVE_REG = DREG_64_MXCSR, DREG_ORIG_RAX, DREG_FS_BASE, DREG_GS_BASE, DREG_64_YMM0H, DREG_64_YMM1H, DREG_64_YMM2H, DREG_64_YMM3H, DREG_64_YMM4H, DREG_64_YMM5H, DREG_64_YMM6H, DREG_64_YMM7H, DREG_64_YMM8H, DREG_64_YMM9H, DREG_64_YMM10H, DREG_64_YMM11H, DREG_64_YMM12H, DREG_64_YMM13H, DREG_64_YMM14H, DREG_64_YMM15H, DREG_NUM_LINUX_X86_64, // Last register we can find in user_regs_struct (except for orig_rax). DREG_64_NUM_USER_REGS = DREG_64_GS + 1, // aarch64-core.xml DREG_X0 = 0, DREG_X1, DREG_X2, DREG_X3, DREG_X4, DREG_X5, DREG_X6, DREG_X7, DREG_X8, DREG_X9, DREG_X10, DREG_X11, DREG_X12, DREG_X13, DREG_X14, DREG_X15, DREG_X16, DREG_X17, DREG_X18, DREG_X19, DREG_X20, DREG_X21, DREG_X22, DREG_X23, DREG_X24, DREG_X25, DREG_X26, DREG_X27, DREG_X28, DREG_X29, DREG_X30, DREG_SP, DREG_PC, DREG_CPSR, // aarch64-fpu.xml DREG_V0 = 34, DREG_V1, DREG_V2, DREG_V3, DREG_V4, DREG_V5, DREG_V6, DREG_V7, DREG_V8, DREG_V9, DREG_V10, DREG_V11, DREG_V12, DREG_V13, DREG_V14, DREG_V15, DREG_V16, DREG_V17, DREG_V18, DREG_V19, DREG_V20, DREG_V21, DREG_V22, DREG_V23, DREG_V24, DREG_V25, DREG_V26, DREG_V27, DREG_V28, DREG_V29, DREG_V30, DREG_V31, DREG_FPSR, DREG_FPCR, DREG_NUM_LINUX_AARCH64 = DREG_FPCR + 1, }; } // namespace rr #endif /* RR_GDB_REGISTER_H_ */ rr-5.5.0/src/GdbServer.cc000066400000000000000000002223041412202446200151250ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "GdbServer.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "BreakpointCondition.h" #include "ElfReader.h" #include "GdbCommandHandler.h" #include "GdbExpression.h" #include "ReplaySession.h" #include "ScopedFd.h" #include "StringVectorToCharArray.h" #include "Task.h" #include "ThreadGroup.h" #include "core.h" #include "kernel_metadata.h" #include "log.h" #include "util.h" using namespace std; namespace rr { GdbServer::GdbServer(std::unique_ptr& dbg, Task* t) : dbg(std::move(dbg)), debuggee_tguid(t->thread_group()->tguid()), last_continue_tuid(t->tuid()), last_query_tuid(t->tuid()), final_event(UINT32_MAX), stop_replaying_to_target(false), interrupt_pending(false), emergency_debug_session(&t->session()), file_scope_pid(0) { memset(&stop_siginfo, 0, sizeof(stop_siginfo)); } // Special-sauce macros defined by rr when launching the gdb client, // which implement functionality outside of the gdb remote protocol. // (Don't stare at them too long or you'll go blind ;).) static const string& gdb_rr_macros() { static string s; if (s.empty()) { stringstream ss; ss << GdbCommandHandler::gdb_macros() << "define restart\n" << " run c$arg0\n" << "end\n" << "document restart\n" << "restart at checkpoint N\n" << "checkpoints are created with the 'checkpoint' command\n" << "end\n" << "define seek-ticks\n" << " run t$arg0\n" << "end\n" << "document seek-ticks\n" << "restart at given ticks value\n" << "end\n" // In gdb version "Fedora 7.8.1-30.fc21", a raw "run" command // issued before any user-generated resume-execution command // results in gdb hanging just after the inferior hits an internal // gdb breakpoint. 
This happens outside of rr, with gdb // controlling gdbserver, as well. We work around that by // ensuring *some* resume-execution command has been issued before // restarting the session. But, only if the inferior hasn't // already finished execution ($_thread != 0). If it has and we // issue the "stepi" command, then gdb refuses to restart // execution. << "define hook-run\n" << " rr-hook-run\n" << "end\n" << "define hookpost-continue\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-step\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-stepi\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-next\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-nexti\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-finish\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-reverse-continue\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-reverse-step\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-reverse-stepi\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-reverse-finish\n" << " rr-set-suppress-run-hook 1\n" << "end\n" << "define hookpost-run\n" << " rr-set-suppress-run-hook 0\n" << "end\n" << "set unwindonsignal on\n" << "handle SIGURG stop\n" << "set prompt (rr) \n" // Try both "set target-async" and "maint set target-async" since // that changed recently. << "python\n" << "import re\n" << "m = re.compile(" << "'[^0-9]*([0-9]+)\\.([0-9]+)(\\.([0-9]+))?'" << ").match(gdb.VERSION)\n" << "ver = int(m.group(1))*10000 + int(m.group(2))*100\n" << "if m.group(4):\n" << " ver = ver + int(m.group(4))\n" << "\n" << "if ver == 71100:\n" << " gdb.write(" << "'This version of gdb (7.11.0) has known bugs that break rr. " << "Install 7.11.1 or later.\\n', gdb.STDERR)\n" << "\n" << "if ver < 71101:\n" << " gdb.execute('set target-async 0')\n" << " gdb.execute('maint set target-async 0')\n" << "end\n"; s = ss.str(); } return s; } /** * Attempt to find the value of |regname| (a DebuggerRegister * name), and if so (i) write it to |buf|; (ii) * set |*defined = true|; (iii) return the size of written * data. If |*defined == false|, the value of |buf| is * meaningless. * * This helper can fetch the values of both general-purpose * and "extra" registers. * * NB: |buf| must be large enough to hold the largest register * value that can be named by |regname|. */ static size_t get_reg(const Registers& regs, const ExtraRegisters& extra_regs, uint8_t* buf, GdbRegister regname, bool* defined) { size_t num_bytes = regs.read_register(buf, regname, defined); if (!*defined) { num_bytes = extra_regs.read_register(buf, regname, defined); } return num_bytes; } /** * Return the register |which|, which may not have a defined value. */ GdbRegisterValue GdbServer::get_reg(const Registers& regs, const ExtraRegisters& extra_regs, GdbRegister which) { GdbRegisterValue reg; memset(®, 0, sizeof(reg)); reg.name = which; reg.size = rr::get_reg(regs, extra_regs, ®.value[0], which, ®.defined); return reg; } static GdbThreadId get_threadid(const Session& session, const TaskUid& tuid) { Task* t = session.find_task(tuid); pid_t pid = t ? 
t->tgid() : GdbThreadId::ANY.pid; return GdbThreadId(pid, tuid.tid()); } static GdbThreadId get_threadid(Task* t) { return GdbThreadId(t->tgid(), t->rec_tid); } static bool matches_threadid(const GdbThreadId& tid, const GdbThreadId& target) { return (target.pid <= 0 || target.pid == tid.pid) && (target.tid <= 0 || target.tid == tid.tid); } static bool matches_threadid(Task* t, const GdbThreadId& target) { GdbThreadId tid = get_threadid(t); return matches_threadid(tid, target); } static WatchType watchpoint_type(GdbRequestType req) { switch (req) { case DREQ_SET_HW_BREAK: case DREQ_REMOVE_HW_BREAK: return WATCH_EXEC; case DREQ_SET_WR_WATCH: case DREQ_REMOVE_WR_WATCH: return WATCH_WRITE; case DREQ_REMOVE_RDWR_WATCH: case DREQ_SET_RDWR_WATCH: // NB: x86 doesn't support read-only watchpoints (who would // ever want to use one?) so we treat them as readwrite // watchpoints and hope that gdb can figure out what's going // on. That is, if a user ever tries to set a read // watchpoint. case DREQ_REMOVE_RD_WATCH: case DREQ_SET_RD_WATCH: return WATCH_READWRITE; default: FATAL() << "Unknown dbg request " << req; return WatchType(-1); // not reached } } static void maybe_singlestep_for_event(Task* t, GdbRequest* req) { if (!t->session().is_replaying()) { return; } auto rt = static_cast(t); if (trace_instructions_up_to_event( rt->session().current_trace_frame().time())) { fputs("Stepping: ", stderr); t->regs().print_register_file_compact(stderr); fprintf(stderr, " ticks:%" PRId64 "\n", t->tick_count()); *req = GdbRequest(DREQ_CONT); req->suppress_debugger_stop = true; req->cont().actions.push_back( GdbContAction(ACTION_STEP, get_threadid(t->session(), t->tuid()))); } } void GdbServer::dispatch_regs_request(const Registers& regs, const ExtraRegisters& extra_regs) { GdbRegister end; // Send values for all the registers we sent XML register descriptions for. // Those descriptions are controlled by GdbConnection::cpu_features(). bool have_AVX = dbg->cpu_features() & GdbConnection::CPU_AVX; switch (regs.arch()) { case x86: end = have_AVX ? DREG_YMM7H : DREG_ORIG_EAX; break; case x86_64: end = have_AVX ? DREG_64_YMM15H : DREG_ORIG_RAX; break; case aarch64: end = DREG_FPCR; break; default: FATAL() << "Unknown architecture"; return; } vector rs; for (GdbRegister r = GdbRegister(0); r <= end; r = GdbRegister(r + 1)) { rs.push_back(get_reg(regs, extra_regs, r)); } dbg->reply_get_regs(rs); } class GdbBreakpointCondition : public BreakpointCondition { public: GdbBreakpointCondition(const vector>& bytecodes) { for (auto& b : bytecodes) { expressions.push_back(GdbExpression(b.data(), b.size())); } } virtual bool evaluate(Task* t) const override { for (auto& e : expressions) { GdbExpression::Value v; // Break if evaluation fails or the result is nonzero if (!e.evaluate(t, &v) || v.i != 0) { return true; } } return false; } private: vector expressions; }; static unique_ptr breakpoint_condition( const GdbRequest& request) { if (request.watch().conditions.empty()) { return nullptr; } return unique_ptr( new GdbBreakpointCondition(request.watch().conditions)); } static bool search_memory(Task* t, const MemoryRange& where, const vector& find, remote_ptr* result) { vector buf; buf.resize(page_size() + find.size() - 1); for (const auto& m : t->vm()->maps()) { MemoryRange r = MemoryRange(m.map.start(), m.map.end() + find.size() - 1) .intersect(where); // We basically read page by page here, but we read past the end of the // page to handle the case where a found string crosses page boundaries. 
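// As a self-contained illustration of the overlapping-window technique the
// comment above describes (a sketch, not rr code): scan a buffer chunk by
// chunk, letting each chunk read needle.size()-1 bytes past its end so a
// match straddling a chunk boundary is still found. The chunk size stands
// in for the page size here.
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <vector>

static ssize_t search_overlapping_chunks(const std::vector<uint8_t>& hay,
                                         const std::vector<uint8_t>& needle,
                                         size_t chunk_size) {
  if (needle.empty() || hay.size() < needle.size()) {
    return -1;
  }
  for (size_t start = 0; start + needle.size() <= hay.size();
       start += chunk_size) {
    // One chunk plus needle.size()-1 bytes of lookahead.
    size_t len = std::min(chunk_size + needle.size() - 1, hay.size() - start);
    const void* hit =
        memmem(hay.data() + start, len, needle.data(), needle.size());
    if (hit) {
      return static_cast<const uint8_t*>(hit) - hay.data();
    }
  }
  return -1;
}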
// This approach isn't great for handling long search strings but gdb's find // command isn't really suited to that. // Reading page by page lets us avoid problems where some pages in a // mapping aren't readable (e.g. reading beyond end of file). while (r.size() >= find.size()) { ssize_t nread = t->read_bytes_fallible( r.start(), std::min(buf.size(), r.size()), buf.data()); if (nread >= ssize_t(find.size())) { void* found = memmem(buf.data(), nread, find.data(), find.size()); if (found) { *result = r.start() + (static_cast(found) - buf.data()); return true; } } r = MemoryRange( std::min(r.end(), floor_page_size(r.start()) + page_size()), r.end()); } } return false; } static bool is_in_patch_stubs(Task* t, remote_code_ptr ip) { auto p = ip.to_data_ptr(); return t->vm()->has_mapping(p) && (t->vm()->mapping_flags_of(p) & AddressSpace::Mapping::IS_PATCH_STUBS); } void GdbServer::maybe_intercept_mem_request(Task* target, const GdbRequest& req, vector* result) { DEBUG_ASSERT(req.mem_.len >= result->size()); /* Crazy hack! * When gdb tries to read the word at the top of the stack, and we're in our * dynamically-generated stub code, tell it the value is zero, so that gdb's * stack-walking code doesn't find a bogus value that it treats as a return * address and sets a breakpoint there, potentially corrupting program data. * gdb sometimes reads a whole block of memory around the stack pointer so * handle cases where the top-of-stack word is contained in a larger range. */ size_t size = word_size(target->arch()); if (target->regs().sp().as_int() >= req.mem_.addr && target->regs().sp().as_int() + size <= req.mem_.addr + result->size() && is_in_patch_stubs(target, target->ip())) { memset(result->data() + target->regs().sp().as_int() - req.mem_.addr, 0, size); } } void GdbServer::dispatch_debugger_request(Session& session, const GdbRequest& req, ReportState state) { DEBUG_ASSERT(!req.is_resume_request()); // These requests don't require a target task. switch (req.type) { case DREQ_RESTART: DEBUG_ASSERT(false); return; // unreached case DREQ_GET_CURRENT_THREAD: dbg->reply_get_current_thread(get_threadid(session, last_continue_tuid)); return; case DREQ_GET_OFFSETS: /* TODO */ dbg->reply_get_offsets(); return; case DREQ_GET_THREAD_LIST: { vector tids; if (state != REPORT_THREADS_DEAD) { for (auto& kv : session.tasks()) { tids.push_back(get_threadid(session, kv.second->tuid())); } } dbg->reply_get_thread_list(tids); return; } case DREQ_INTERRUPT: { Task* t = session.find_task(last_continue_tuid); ASSERT(t, session.is_diversion()) << "Replay interrupts should be handled at a higher level"; DEBUG_ASSERT(!t || t->thread_group()->tguid() == debuggee_tguid); dbg->notify_stop(t ? get_threadid(t) : GdbThreadId(), 0); memset(&stop_siginfo, 0, sizeof(stop_siginfo)); if (t) { last_query_tuid = last_continue_tuid = t->tuid(); } return; } case DREQ_GET_EXEC_FILE: { // We shouldn't normally receive this since we try to pass the exe file // name on gdb's command line, but the user might start gdb manually // and this is easy to support in case some other debugger or // configuration needs it. 
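// Minimal sketch (illustrative, with plain integers standing in for rr's
// remote pointers) of the rule maybe_intercept_mem_request() implements
// above: when execution is inside the generated stubs and the top-of-stack
// word falls inside the range gdb asked to read, report zeros for it.
#include <stdint.h>
#include <string.h>
#include <vector>

static void zero_top_of_stack_word(std::vector<uint8_t>* result,
                                   uintptr_t req_addr, uintptr_t sp,
                                   size_t word_size, bool in_patch_stubs) {
  if (in_patch_stubs && sp >= req_addr &&
      sp + word_size <= req_addr + result->size()) {
    memset(result->data() + (sp - req_addr), 0, word_size);
  }
}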
Task* t = nullptr; if (req.target.tid) { ThreadGroup* tg = session.find_thread_group(req.target.tid); if (tg) { t = *tg->task_set().begin(); } } else { t = session.find_task(last_continue_tuid); } if (t) { dbg->reply_get_exec_file(t->vm()->exe_image()); } else { dbg->reply_get_exec_file(string()); } return; } case DREQ_FILE_SETFS: // Only the filesystem as seen by the remote stub is supported currently file_scope_pid = req.file_setfs().pid; dbg->reply_setfs(0); return; case DREQ_FILE_OPEN: // We only support reading files if (req.file_open().flags == O_RDONLY) { Task* t = session.find_task(last_continue_tuid); int fd = open_file(session, t, req.file_open().file_name); dbg->reply_open(fd, fd >= 0 ? 0 : ENOENT); } else { dbg->reply_open(-1, EACCES); } return; case DREQ_FILE_PREAD: { GdbRequest::FilePread read_req = req.file_pread(); { auto it = files.find(read_req.fd); if (it != files.end()) { size_t size = min(read_req.size, 1024 * 1024); vector data; data.resize(size); ssize_t bytes = read_to_end(it->second, read_req.offset, data.data(), size); dbg->reply_pread(data.data(), bytes, bytes >= 0 ? 0 : -errno); return; } } { auto it = memory_files.find(read_req.fd); if (it != memory_files.end() && timeline.is_running()) { // Search our mmap stream for a record that can satisfy this request TraceReader tmp_reader(timeline.current_session().trace_reader()); tmp_reader.rewind(); while (true) { TraceReader::MappedData data; bool found; KernelMapping km = tmp_reader.read_mapped_region( &data, &found, TraceReader::DONT_VALIDATE, TraceReader::ANY_TIME); if (!found) break; if (it->second == FileId(km)) { if (data.source != TraceReader::SOURCE_FILE) { LOG(warn) << "Not serving file because it is not a file source"; break; } ScopedFd fd(data.file_name.c_str(), O_RDONLY); vector data; data.resize(read_req.size); LOG(debug) << "Reading " << read_req.size << " bytes at offset " << read_req.offset; ssize_t bytes = read_to_end(fd, read_req.offset, data.data(), read_req.size); if (bytes < (ssize_t)read_req.size) { LOG(warn) << "Requested " << read_req.size << " bytes but only got " << bytes; } dbg->reply_pread(data.data(), bytes, bytes >= 0 ? 0 : -errno); return; } } LOG(warn) << "No mapping found"; } } LOG(warn) << "Unknown file descriptor requested"; dbg->reply_pread(nullptr, 0, EIO); return; } case DREQ_FILE_CLOSE: { { auto it = files.find(req.file_close().fd); if (it != files.end()) { files.erase(it); dbg->reply_close(0); return; } } { auto it = memory_files.find(req.file_close().fd); if (it != memory_files.end()) { memory_files.erase(it); dbg->reply_close(0); return; } } LOG(warn) << "Unable to find file descriptor for close"; dbg->reply_close(EBADF); } default: /* fall through to next switch stmt */ break; } bool is_query = req.type != DREQ_SET_CONTINUE_THREAD; Task* target = req.target.tid > 0 ? session.find_task(req.target.tid) : session.find_task(is_query ? last_query_tuid : last_continue_tuid); if (target) { if (is_query) { last_query_tuid = target->tuid(); } else { last_continue_tuid = target->tuid(); } } // These requests query or manipulate which task is the // target, so it's OK if the task doesn't exist. 
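// For reference, a sketch of the read-to-end pattern the DREQ_FILE_PREAD
// handler above relies on (rr's read_to_end() helper is assumed to behave
// roughly like this): retry pread() until the request is satisfied or EOF,
// since short reads are legal.
#include <sys/types.h>
#include <unistd.h>

static ssize_t pread_all(int fd, off_t offset, void* buf, size_t size) {
  size_t done = 0;
  while (done < size) {
    ssize_t n = pread(fd, static_cast<char*>(buf) + done, size - done,
                      offset + done);
    if (n < 0) {
      return -1; // the caller maps this to an errno for the gdb reply
    }
    if (n == 0) {
      break; // EOF
    }
    done += n;
  }
  return done;
}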
switch (req.type) {
    case DREQ_GET_IS_THREAD_ALIVE:
      dbg->reply_get_is_thread_alive(target != nullptr);
      return;
    case DREQ_GET_THREAD_EXTRA_INFO:
      dbg->reply_get_thread_extra_info(target->name());
      return;
    case DREQ_SET_CONTINUE_THREAD:
      dbg->reply_select_thread(target != nullptr);
      return;
    case DREQ_SET_QUERY_THREAD:
      dbg->reply_select_thread(target != nullptr);
      return;
    default:
      // fall through to next switch stmt
      break;
  }

  // These requests require a valid target task. We don't trust
  // the debugger to use the information provided above to only
  // query valid tasks.
  if (!target) {
    dbg->notify_no_such_thread(req);
    return;
  }
  switch (req.type) {
    case DREQ_GET_AUXV: {
      dbg->reply_get_auxv(target->vm()->saved_auxv());
      return;
    }
    case DREQ_GET_MEM: {
      vector<uint8_t> mem;
      mem.resize(req.mem().len);
      ssize_t nread = target->read_bytes_fallible(req.mem().addr,
                                                  req.mem().len, mem.data());
      mem.resize(max(ssize_t(0), nread));
      target->vm()->replace_breakpoints_with_original_values(
          mem.data(), mem.size(), req.mem().addr);
      maybe_intercept_mem_request(target, req, &mem);
      dbg->reply_get_mem(mem);
      return;
    }
    case DREQ_SET_MEM: {
      // gdb has been observed to send requests of length 0 at odd times
      // (e.g. before sending the magic write to create a checkpoint)
      if (req.mem().len == 0) {
        dbg->reply_set_mem(true);
        return;
      }
      // If an address is recognised as belonging to a SystemTap semaphore
      // it's because it was detected by the audit library during recording
      // and pre-incremented.
      if (target->vm()->is_stap_semaphore(req.mem().addr)) {
        LOG(info) << "Suppressing write to SystemTap semaphore";
        dbg->reply_set_mem(true);
        return;
      }
      // We only allow the debugger to write memory if the
      // memory will be written to a diversion session.
      // Arbitrary writes to replay sessions cause
      // divergence.
      if (!session.is_diversion()) {
        LOG(error) << "Attempt to write memory outside diversion session";
        dbg->reply_set_mem(false);
        return;
      }
      LOG(debug) << "Writing " << req.mem().len << " bytes to "
                 << HEX(req.mem().addr);
      // TODO fallible
      target->write_bytes_helper(req.mem().addr, req.mem().len,
                                 req.mem().data.data());
      dbg->reply_set_mem(true);
      return;
    }
    case DREQ_SEARCH_MEM: {
      remote_ptr<void> addr;
      bool found =
          search_memory(target, MemoryRange(req.mem().addr, req.mem().len),
                        req.mem().data, &addr);
      dbg->reply_search_mem(found, addr);
      return;
    }
    case DREQ_GET_REG: {
      GdbRegisterValue reg =
          get_reg(target->regs(), target->extra_regs(), req.reg().name);
      dbg->reply_get_reg(reg);
      return;
    }
    case DREQ_GET_REGS: {
      dispatch_regs_request(target->regs(), target->extra_regs());
      return;
    }
    case DREQ_SET_REG: {
      if (!session.is_diversion()) {
        // gdb sets orig_eax to -1 during a restart. For a
        // replay session this is not correct (we might be
        // restarting from an rr checkpoint inside a system
        // call, and we must not tamper with replay state), so
        // just ignore it.
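// Sketch of what replace_breakpoints_with_original_values() has to do for
// DREQ_GET_MEM above: any software-breakpoint byte (0xCC on x86) that falls
// inside the returned buffer is patched back to the instruction byte it
// displaced, so the debugger never sees rr's own traps. The saved-byte
// table below is an illustrative stand-in.
#include <stdint.h>
#include <map>

static void hide_breakpoint_bytes(uint8_t* buf, size_t buf_len,
                                  uintptr_t buf_addr,
                                  const std::map<uintptr_t, uint8_t>& saved) {
  for (const auto& bp : saved) {
    if (bp.first >= buf_addr && bp.first < buf_addr + buf_len) {
      buf[bp.first - buf_addr] = bp.second; // restore the original byte
    }
  }
}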
if ((target->arch() == x86 && req.reg().name == DREG_ORIG_EAX) || (target->arch() == x86_64 && req.reg().name == DREG_ORIG_RAX)) { dbg->reply_set_reg(true); return; } LOG(error) << "Attempt to write register outside diversion session"; dbg->reply_set_reg(false); return; } if (req.reg().defined) { Registers regs = target->regs(); regs.write_register(req.reg().name, req.reg().value, req.reg().size); target->set_regs(regs); } dbg->reply_set_reg(true /*currently infallible*/); return; } case DREQ_GET_STOP_REASON: { dbg->reply_get_stop_reason(get_threadid(session, last_continue_tuid), stop_siginfo.si_signo); return; } case DREQ_SET_SW_BREAK: { ASSERT(target, req.watch().kind == bkpt_instruction_length(target->arch())) << "Debugger setting bad breakpoint insn"; // Mirror all breakpoint/watchpoint sets/unsets to the target process // if it's not part of the timeline (i.e. it's a diversion). ReplayTask* replay_task = timeline.current_session().find_task(target->tuid()); bool ok = timeline.add_breakpoint(replay_task, req.watch().addr, breakpoint_condition(req)); if (ok && &session != &timeline.current_session()) { bool diversion_ok = target->vm()->add_breakpoint(req.watch().addr, BKPT_USER); ASSERT(target, diversion_ok); } dbg->reply_watchpoint_request(ok); return; } case DREQ_SET_HW_BREAK: case DREQ_SET_RD_WATCH: case DREQ_SET_WR_WATCH: case DREQ_SET_RDWR_WATCH: { ReplayTask* replay_task = timeline.current_session().find_task(target->tuid()); bool ok = timeline.add_watchpoint( replay_task, req.watch().addr, req.watch().kind, watchpoint_type(req.type), breakpoint_condition(req)); if (ok && &session != &timeline.current_session()) { bool diversion_ok = target->vm()->add_watchpoint( req.watch().addr, req.watch().kind, watchpoint_type(req.type)); ASSERT(target, diversion_ok); } dbg->reply_watchpoint_request(ok); return; } case DREQ_REMOVE_SW_BREAK: { ReplayTask* replay_task = timeline.current_session().find_task(target->tuid()); timeline.remove_breakpoint(replay_task, req.watch().addr); if (&session != &timeline.current_session()) { target->vm()->remove_breakpoint(req.watch().addr, BKPT_USER); } dbg->reply_watchpoint_request(true); return; } case DREQ_REMOVE_HW_BREAK: case DREQ_REMOVE_RD_WATCH: case DREQ_REMOVE_WR_WATCH: case DREQ_REMOVE_RDWR_WATCH: { ReplayTask* replay_task = timeline.current_session().find_task(target->tuid()); timeline.remove_watchpoint(replay_task, req.watch().addr, req.watch().kind, watchpoint_type(req.type)); if (&session != &timeline.current_session()) { target->vm()->remove_watchpoint(req.watch().addr, req.watch().kind, watchpoint_type(req.type)); } dbg->reply_watchpoint_request(true); return; } case DREQ_READ_SIGINFO: { vector si_bytes; si_bytes.resize(req.mem().len); memset(si_bytes.data(), 0, si_bytes.size()); memcpy(si_bytes.data(), &stop_siginfo, min(si_bytes.size(), sizeof(stop_siginfo))); dbg->reply_read_siginfo(si_bytes); return; } case DREQ_WRITE_SIGINFO: LOG(warn) << "WRITE_SIGINFO request outside of diversion session"; dbg->reply_write_siginfo(); return; case DREQ_RR_CMD: dbg->reply_rr_cmd( GdbCommandHandler::process_command(*this, target, req.text())); return; case DREQ_QSYMBOL: { // When gdb sends "qSymbol::", it means that gdb is ready to // respond to symbol requests. This can be sent multiple times // during the course of a session -- gdb sends it whenever // something in the inferior has changed, making it possible // that previous failed symbol lookups could now succeed. 
In // response to a qSymbol request from gdb, we either send back a // qSymbol response, requesting the address of a symbol; or we // send back OK. We have to do this as an ordinary response and // maintain our own state explicitly, as opposed to simply // reading another packet from gdb, because when gdb looks up a // symbol it might send other requests that must be served. So, // we keep a copy of the symbol names, and an iterator into this // copy. When gdb sends a plain "qSymbol::" packet, because gdb // has detected some change in the inferior state that might // enable more symbol lookups, we restart the iterator. if (!thread_db) { thread_db = std::unique_ptr(new ThreadDb(debuggee_tguid.tid())); } const string& name = req.sym().name; if (req.sym().has_address) { // Got a response holding a previously-requested symbol's name // and address. thread_db->register_symbol(name, req.sym().address); } else if (name == "") { // Plain "qSymbol::" request. symbols = thread_db->get_symbols_and_clear_map(target->thread_group().get()); symbols_iter = symbols.begin(); } if (symbols_iter == symbols.end()) { dbg->qsymbols_finished(); } else { string symbol = *symbols_iter++; dbg->send_qsymbol(symbol); } return; } case DREQ_TLS: { if (!thread_db) { thread_db = std::unique_ptr(new ThreadDb(debuggee_tguid.tid())); } remote_ptr address; bool ok = thread_db->get_tls_address(target->thread_group().get(), target->rec_tid, req.tls().offset, req.tls().load_module, &address); dbg->reply_tls_addr(ok, address); return; } default: FATAL() << "Unknown debugger request " << req.type; } } static bool any_action_targets_match(const Session& session, const TaskUid& tuid, const vector& actions) { GdbThreadId tid = get_threadid(session, tuid); return any_of(actions.begin(), actions.end(), [tid](GdbContAction action) { return matches_threadid(tid, action.target); }); } static Task* find_first_task_matching_target( const Session& session, const vector& actions) { const Session::TaskMap& tasks = session.tasks(); auto it = find_first_of( tasks.begin(), tasks.end(), actions.begin(), actions.end(), [](Session::TaskMap::value_type task_pair, GdbContAction action) { return matches_threadid(task_pair.second, action.target); }); return it != tasks.end() ? it->second : nullptr; } bool GdbServer::diverter_process_debugger_requests( DiversionSession& diversion_session, uint32_t& diversion_refcount, GdbRequest* req) { while (true) { *req = dbg->get_request(); if (req->is_resume_request()) { const vector& actions = req->cont().actions; DEBUG_ASSERT(actions.size() > 0); // GDB may ask us to resume more than one task, so we have to // choose one. We give priority to the task last resumed, as // this is likely to be the context in which GDB is executing // code; selecting any other task runs the risk of resuming // replay, denying the diverted code an opportunity to complete // and end the diversion session. if (!any_action_targets_match(diversion_session, last_continue_tuid, actions)) { // If none of the resumption targets match the task last // resumed, we simply choose any matching task. This ensures // that GDB (and the user) can choose an arbitrary thread to // serve as the context of the code being evaluated. // TODO: maybe it makes sense to try and select the matching // task that was most recently resumed, or possibly the // matching task with an event in the replay trace nearest to // 'now'. 
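// The qSymbol bookkeeping described above boils down to the following
// self-contained sketch: keep a snapshot of the names still wanting
// resolution plus a cursor, restart the iteration on a bare "qSymbol::",
// and otherwise hand gdb one name at a time until none remain (reply "OK").
#include <string>
#include <vector>

struct SymbolIteration {
  std::vector<std::string> pending;
  size_t next = 0;

  // A plain "qSymbol::" packet: gdb may be able to resolve more now.
  void restart(std::vector<std::string> names) {
    pending = std::move(names);
    next = 0;
  }
  // Returns false when there is nothing left to request.
  bool next_request(std::string* name_out) {
    if (next == pending.size()) {
      return false;
    }
    *name_out = pending[next++];
    return true;
  }
};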
Task* task = find_first_task_matching_target(diversion_session, actions); DEBUG_ASSERT(task != nullptr); last_continue_tuid = task->tuid(); } return diversion_refcount > 0; } switch (req->type) { case DREQ_RESTART: case DREQ_DETACH: diversion_refcount = 0; return false; case DREQ_READ_SIGINFO: { LOG(debug) << "Adding ref to diversion session"; ++diversion_refcount; // TODO: maybe share with replayer.cc? vector si_bytes; si_bytes.resize(req->mem().len); memset(si_bytes.data(), 0, si_bytes.size()); dbg->reply_read_siginfo(si_bytes); continue; } case DREQ_SET_QUERY_THREAD: { if (req->target.tid) { Task* next = diversion_session.find_task(req->target.tid); if (next) { last_query_tuid = next->tuid(); } } break; } case DREQ_WRITE_SIGINFO: LOG(debug) << "Removing reference to diversion session ..."; DEBUG_ASSERT(diversion_refcount > 0); --diversion_refcount; if (diversion_refcount == 0) { LOG(debug) << " ... dying at next continue request"; } dbg->reply_write_siginfo(); continue; case DREQ_RR_CMD: { DEBUG_ASSERT(req->type == DREQ_RR_CMD); Task* task = diversion_session.find_task(last_continue_tuid); if (task) { std::string reply = GdbCommandHandler::process_command(*this, task, req->text()); // Certain commands cause the diversion to end immediately // while other commands must work within a diversion. if (reply == GdbCommandHandler::cmd_end_diversion()) { diversion_refcount = 0; return false; } dbg->reply_rr_cmd(reply); continue; } else { diversion_refcount = 0; return false; } break; } default: break; } dispatch_debugger_request(diversion_session, *req, REPORT_NORMAL); } } static bool is_last_thread_exit(const BreakStatus& break_status) { // The task set may be empty if the task has already exited. return break_status.task_exit && break_status.task_context.thread_group->task_set().size() <= 1; } static Task* is_in_exec(ReplayTimeline& timeline) { Task* t = timeline.current_session().current_task(); if (!t) { return nullptr; } return timeline.current_session().next_step_is_successful_syscall_exit( syscall_number_for_execve(t->arch())) ? 
t : nullptr; } void GdbServer::maybe_notify_stop(const GdbRequest& req, const BreakStatus& break_status) { bool do_stop = false; remote_ptr watch_addr; char watch[1024]; watch[0] = '\0'; if (!break_status.watchpoints_hit.empty()) { do_stop = true; memset(&stop_siginfo, 0, sizeof(stop_siginfo)); stop_siginfo.si_signo = SIGTRAP; watch_addr = break_status.watchpoints_hit[0].addr; bool any_hw_break = false; for (const auto& w : break_status.watchpoints_hit) { if (w.type == WATCH_EXEC) { any_hw_break = true; } } if (dbg->hwbreak_supported() && any_hw_break) { snprintf(watch, sizeof(watch) - 1, "hwbreak:;"); } else if (watch_addr) { snprintf(watch, sizeof(watch) - 1, "watch:%" PRIxPTR ";", watch_addr.as_int()); } LOG(debug) << "Stopping for watchpoint at " << watch_addr; } if (break_status.breakpoint_hit || break_status.singlestep_complete) { do_stop = true; memset(&stop_siginfo, 0, sizeof(stop_siginfo)); stop_siginfo.si_signo = SIGTRAP; if (break_status.breakpoint_hit) { if (dbg->swbreak_supported()) { snprintf(watch, sizeof(watch) - 1, "swbreak:;"); } LOG(debug) << "Stopping for breakpoint"; } else { LOG(debug) << "Stopping for singlestep"; } } if (break_status.signal) { do_stop = true; stop_siginfo = *break_status.signal; LOG(debug) << "Stopping for signal " << stop_siginfo; } if (is_last_thread_exit(break_status)) { if (break_status.task_context.session->is_diversion()) { // If the last task of a diversion session has exited, we need // to make sure GDB knows it's unrecoverable. There's no good // way to do this: a stop is insufficient, but an inferior exit // typically signals the end of a debugging session. Using the // latter approach appears to work, but stepping through GDB's // processing of the event seems to indicate it isn't really // supposed to. FIXME. LOG(debug) << "Last task of diversion exiting. " << "Notifying exit with synthetic SIGKILL"; dbg->notify_exit_signal(SIGKILL); return; } else if (dbg->features().reverse_execution) { do_stop = true; memset(&stop_siginfo, 0, sizeof(stop_siginfo)); if (req.cont().run_direction == RUN_FORWARD) { // The exit of the last task in a thread group generates a fake SIGKILL, // when reverse-execution is enabled, because users often want to run // backwards from the end of the task. stop_siginfo.si_signo = SIGKILL; LOG(debug) << "Stopping for synthetic SIGKILL"; } else { // The start of the debuggee task-group should trigger a silent stop. stop_siginfo.si_signo = 0; LOG(debug) << "Stopping at start of execution while running backwards"; } } } Task* t = break_status.task(); Task* in_exec_task = is_in_exec(timeline); if (in_exec_task) { do_stop = true; memset(&stop_siginfo, 0, sizeof(stop_siginfo)); t = in_exec_task; LOG(debug) << "Stopping at exec"; } if (do_stop && t->thread_group()->tguid() == debuggee_tguid) { /* Notify the debugger and process any new requests * that might have triggered before resuming. */ dbg->notify_stop(get_threadid(t), stop_siginfo.si_signo, watch); last_query_tuid = last_continue_tuid = t->tuid(); } } static RunCommand compute_run_command_from_actions(Task* t, const GdbRequest& req, int* signal_to_deliver) { for (auto& action : req.cont().actions) { if (matches_threadid(t, action.target)) { // We can only run task |t|; neither diversion nor replay sessions // support running multiple threads. So even if gdb tells us to continue // multiple threads, we don't do that. *signal_to_deliver = action.signal_to_deliver; return action.type == ACTION_STEP ? 
RUN_SINGLESTEP : RUN_CONTINUE; } } // gdb told us to run (or step) some thread that's not |t|, without resuming // |t|. It sometimes does this even though its target thread is entering a // blocking syscall and |t| must run before gdb's target thread can make // progress. So, allow |t| to run anyway. *signal_to_deliver = 0; return RUN_CONTINUE; } struct AllowedTasks { TaskUid task; // tid 0 means 'any member of debuggee_tguid' RunCommand command; }; static RunCommand compute_run_command_for_reverse_exec( Session& session, const ThreadGroupUid& debuggee_tguid, const GdbRequest& req, vector& allowed_tasks) { // Singlestep if any of the actions request singlestepping. RunCommand result = RUN_CONTINUE; for (auto& action : req.cont().actions) { if (action.target.pid > 0 && action.target.pid != debuggee_tguid.tid()) { continue; } AllowedTasks allowed; allowed.command = RUN_CONTINUE; if (action.type == ACTION_STEP) { allowed.command = result = RUN_SINGLESTEP; } if (action.target.tid > 0) { Task* t = session.find_task(action.target.tid); if (t) { allowed.task = t->tuid(); } } allowed_tasks.push_back(allowed); } return result; } /** * Create a new diversion session using |replay| session as the * template. The |replay| session isn't mutated. * * Execution begins in the new diversion session under the control of * |dbg| starting with initial thread target |task|. The diversion * session ends at the request of |dbg|, and |req| returns the first * request made that wasn't handled by the diversion session. That * is, the first request that should be handled by |replay| upon * resuming execution in that session. */ GdbRequest GdbServer::divert(ReplaySession& replay) { GdbRequest req; LOG(debug) << "Starting debugging diversion for " << &replay; if (timeline.is_running()) { // Ensure breakpoints and watchpoints are applied before we fork the // diversion, to ensure the diversion is consistent with the timeline // breakpoint/watchpoint state. timeline.apply_breakpoints_and_watchpoints(); } DiversionSession::shr_ptr diversion_session = replay.clone_diversion(); uint32_t diversion_refcount = 1; TaskUid saved_query_tuid = last_query_tuid; TaskUid saved_continue_tuid = last_continue_tuid; while (diverter_process_debugger_requests(*diversion_session, diversion_refcount, &req)) { DEBUG_ASSERT(req.is_resume_request()); if (req.cont().run_direction == RUN_BACKWARD) { // We don't support reverse execution in a diversion. Just issue // an immediate stop. dbg->notify_stop(get_threadid(*diversion_session, last_continue_tuid), 0); memset(&stop_siginfo, 0, sizeof(stop_siginfo)); last_query_tuid = last_continue_tuid; continue; } Task* t = diversion_session->find_task(last_continue_tuid); DEBUG_ASSERT(t != nullptr); int signal_to_deliver; RunCommand command = compute_run_command_from_actions(t, req, &signal_to_deliver); auto result = diversion_session->diversion_step(t, command, signal_to_deliver); if (result.status == DiversionSession::DIVERSION_EXITED) { diversion_refcount = 0; maybe_notify_stop(req, result.break_status); req = GdbRequest(DREQ_NONE); break; } DEBUG_ASSERT(result.status == DiversionSession::DIVERSION_CONTINUE); maybe_notify_stop(req, result.break_status); } LOG(debug) << "... ending debugging diversion"; DEBUG_ASSERT(diversion_refcount == 0); diversion_session->kill_all_tasks(); last_query_tuid = saved_query_tuid; last_continue_tuid = saved_continue_tuid; return req; } /** * Reply to debugger requests until the debugger asks us to resume * execution, detach, restart, or interrupt. 
*/ GdbRequest GdbServer::process_debugger_requests(ReportState state) { while (true) { GdbRequest req = dbg->get_request(); req.suppress_debugger_stop = false; try_lazy_reverse_singlesteps(req); if (req.type == DREQ_READ_SIGINFO) { vector si_bytes; si_bytes.resize(req.mem().len); memset(si_bytes.data(), 0, si_bytes.size()); memcpy(si_bytes.data(), &stop_siginfo, min(si_bytes.size(), sizeof(stop_siginfo))); dbg->reply_read_siginfo(si_bytes); // READ_SIGINFO is usually the start of a diversion. It can also be // triggered by "print $_siginfo" but that is rare so we just assume it's // a diversion start; if "print $_siginfo" happens we'll print the correct // siginfo and then incorrectly start a diversion and go haywire :-(. // Ideally we'd come up with a better way to detect diversions so that // "print $_siginfo" works. req = divert(timeline.current_session()); if (req.type == DREQ_NONE) { continue; } // Carry on to process the request that was rejected by // the diversion session } if (req.is_resume_request()) { Task* t = current_session().find_task(last_continue_tuid); if (t) { maybe_singlestep_for_event(t, &req); } return req; } if (req.type == DREQ_INTERRUPT) { LOG(debug) << " request to interrupt"; return req; } if (req.type == DREQ_RESTART) { // Debugger client requested that we restart execution // from the beginning. Restart our debug session. LOG(debug) << " request to restart at event " << req.restart().param; return req; } if (req.type == DREQ_DETACH) { LOG(debug) << " debugger detached"; dbg->reply_detach(); return req; } dispatch_debugger_request(current_session(), req, state); } } void GdbServer::try_lazy_reverse_singlesteps(GdbRequest& req) { if (!timeline.is_running()) { return; } ReplayTimeline::Mark now; bool need_seek = false; ReplayTask* t = timeline.current_session().current_task(); while (t && req.type == DREQ_CONT && req.cont().run_direction == RUN_BACKWARD && req.cont().actions.size() == 1 && req.cont().actions[0].type == ACTION_STEP && req.cont().actions[0].signal_to_deliver == 0 && matches_threadid(t, req.cont().actions[0].target) && !req.suppress_debugger_stop) { if (!now) { now = timeline.mark(); } ReplayTimeline::Mark previous = timeline.lazy_reverse_singlestep(now, t); if (!previous) { break; } now = previous; need_seek = true; BreakStatus break_status; break_status.task_context = TaskContext(t); break_status.singlestep_complete = true; LOG(debug) << " using lazy reverse-singlestep"; maybe_notify_stop(req, break_status); while (true) { req = dbg->get_request(); req.suppress_debugger_stop = false; if (req.type != DREQ_GET_REGS) { break; } LOG(debug) << " using lazy reverse-singlestep registers"; dispatch_regs_request(now.regs(), now.extra_regs()); } } if (need_seek) { timeline.seek_to_mark(now); } } bool GdbServer::detach_or_restart(const GdbRequest& req, ContinueOrStop* s) { if (DREQ_RESTART == req.type) { restart_session(req); *s = CONTINUE_DEBUGGING; return true; } if (DREQ_DETACH == req.type) { *s = STOP_DEBUGGING; return true; } return false; } GdbServer::ContinueOrStop GdbServer::handle_exited_state( GdbRequest& last_resume_request) { // TODO return real exit code, if it's useful. 
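// The diversion lifetime protocol referenced above (READ_SIGINFO starts an
// inferior call, WRITE_SIGINFO ends one) reduces to this refcount sketch:
#include <stdint.h>

struct DiversionRef {
  uint32_t refcount = 1; // the session itself holds the initial reference

  void on_read_siginfo() { ++refcount; } // gdb begins an inferior call
  void on_write_siginfo() {              // gdb finished the inferior call
    if (refcount > 0) {
      --refcount;
    }
  }
  // At each resume request: keep diverting only while references remain.
  bool should_continue() const { return refcount > 0; }
};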
dbg->notify_exit_code(0); final_event = timeline.current_session().trace_reader().time(); GdbRequest req = process_debugger_requests(REPORT_THREADS_DEAD); ContinueOrStop s; if (detach_or_restart(req, &s)) { last_resume_request = GdbRequest(); return s; } FATAL() << "Received continue/interrupt request after end-of-trace."; return STOP_DEBUGGING; } GdbServer::ContinueOrStop GdbServer::debug_one_step( GdbRequest& last_resume_request) { ReplayResult result; GdbRequest req; if (in_debuggee_end_state) { // Treat the state where the last thread is about to exit like // termination. req = process_debugger_requests(); // If it's a forward execution request, fake the exited state. if (req.is_resume_request() && req.cont().run_direction == RUN_FORWARD) { if (interrupt_pending) { // Just process this. We're getting it after a restart. } else { return handle_exited_state(last_resume_request); } } else { if (req.type != DREQ_DETACH) { in_debuggee_end_state = false; } } // Otherwise (e.g. detach, restart, interrupt or reverse-exec) process // the request as normal. } else if (!interrupt_pending || last_resume_request.type == DREQ_NONE) { req = process_debugger_requests(); } else { req = last_resume_request; } ContinueOrStop s; if (detach_or_restart(req, &s)) { last_resume_request = GdbRequest(); return s; } if (req.is_resume_request()) { last_resume_request = req; } else { DEBUG_ASSERT(req.type == DREQ_INTERRUPT); interrupt_pending = true; req = last_resume_request; DEBUG_ASSERT(req.is_resume_request()); } if (interrupt_pending) { Task* t = timeline.current_session().current_task(); if (t->thread_group()->tguid() == debuggee_tguid) { interrupt_pending = false; dbg->notify_stop(get_threadid(t), in_debuggee_end_state ? SIGKILL : 0); memset(&stop_siginfo, 0, sizeof(stop_siginfo)); return CONTINUE_DEBUGGING; } } if (req.cont().run_direction == RUN_FORWARD) { if (is_in_exec(timeline) && timeline.current_session().current_task()->thread_group()->tguid() == debuggee_tguid) { // Don't go any further forward. maybe_notify_stop will generate a // stop. result = ReplayResult(); } else { int signal_to_deliver; RunCommand command = compute_run_command_from_actions( timeline.current_session().current_task(), req, &signal_to_deliver); // Ignore gdb's |signal_to_deliver|; we just have to follow the replay. result = timeline.replay_step_forward(command, target.event); } if (result.status == REPLAY_EXITED) { return handle_exited_state(last_resume_request); } } else { vector allowed_tasks; // Convert the tids in GdbContActions into TaskUids to avoid issues // if tids get reused. RunCommand command = compute_run_command_for_reverse_exec( timeline.current_session(), debuggee_tguid, req, allowed_tasks); auto stop_filter = [&](Task* t) -> bool { if (t->thread_group()->tguid() != debuggee_tguid) { return false; } // If gdb's requested actions don't allow the task to run, we still // let it run (we can't do anything else, since we're replaying), but // we won't report stops in that task. 
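// Condensed sketch of the stop filter built above: during reverse execution
// every task must be allowed to run, but stops are only *reported* for
// tasks the debugger actually resumed (tid 0 acting as a wildcard), and
// only within the debuggee's thread group.
#include <vector>

struct AllowedTask { int tid; }; // 0 means "any task in the debuggee"

static bool should_report_stop(int task_tid, bool in_debuggee_tgroup,
                               const std::vector<AllowedTask>& allowed) {
  if (!in_debuggee_tgroup) {
    return false;
  }
  for (const AllowedTask& a : allowed) {
    if (a.tid == 0 || a.tid == task_tid) {
      return true;
    }
  }
  return false;
}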
for (auto& a : allowed_tasks) { if (a.task.tid() == 0 || a.task == t->tuid()) { return true; } } return false; }; auto interrupt_check = [&]() { return dbg->sniff_packet(); }; switch (command) { case RUN_CONTINUE: result = timeline.reverse_continue(stop_filter, interrupt_check); break; case RUN_SINGLESTEP: { Task* t = timeline.current_session().find_task(last_continue_tuid); DEBUG_ASSERT(t); result = timeline.reverse_singlestep( last_continue_tuid, t->tick_count(), stop_filter, interrupt_check); break; } default: DEBUG_ASSERT(0 && "Unknown RunCommand"); } if (result.status == REPLAY_EXITED) { return handle_exited_state(last_resume_request); } } if (!req.suppress_debugger_stop) { maybe_notify_stop(req, result.break_status); } if (req.cont().run_direction == RUN_FORWARD && is_last_thread_exit(result.break_status) && result.break_status.task_context.thread_group->tguid() == debuggee_tguid) { in_debuggee_end_state = true; } return CONTINUE_DEBUGGING; } bool GdbServer::at_target() { // Don't launch the debugger for the initial rr fork child. // No one ever wants that to happen. if (!timeline.current_session().done_initial_exec()) { return false; } Task* t = timeline.current_session().current_task(); if (!t) { return false; } if (!timeline.can_add_checkpoint()) { return false; } if (stop_replaying_to_target) { return true; } // When we decide to create the debugger, we may end up // creating a checkpoint. In that case, we want the // checkpoint to retain the state it had *before* we started // replaying the next frame. Otherwise, the TraceIfstream // will be one frame ahead of its tracee tree. // // So we make the decision to create the debugger based on the // frame we're *about to* replay, without modifying the // TraceIfstream. // NB: we'll happily attach to whichever task within the // group happens to be scheduled here. We don't take // "attach to process" to mean "attach to thread-group // leader". return timeline.current_session().current_trace_frame().time() > target.event && (!target.pid || t->tgid() == target.pid) && (!target.require_exec || t->execed()) && // Ensure we're at the start of processing an event. We don't // want to attach while we're finishing an exec() since that's a // slightly confusing state for ReplayTimeline's reverse execution. !timeline.current_session().current_step_key().in_execution(); } /** * The trace has reached the event at which the user wanted to start debugging. * Set up the appropriate state. */ void GdbServer::activate_debugger() { TraceFrame next_frame = timeline.current_session().current_trace_frame(); FrameTime event_now = next_frame.time(); Task* t = timeline.current_session().current_task(); if (target.event > 0 || target.pid) { if (stop_replaying_to_target) { fprintf(stderr, "\a\n" "--------------------------------------------------\n" " ---> Interrupted; attached to NON-TARGET process %d at event %llu.\n" "--------------------------------------------------\n", t->tgid(), (long long)event_now); } else { fprintf(stderr, "\a\n" "--------------------------------------------------\n" " ---> Reached target process %d at event %llu.\n" "--------------------------------------------------\n", t->tgid(), (long long)event_now); } } // Store the current tgid and event as the "execution target" // for the next replay session, if we end up restarting. This // allows us to determine if a later session has reached this // target without necessarily replaying up to this point. 
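// The attach gate at_target() implements above reduces to a conjunction
// like this sketch (all arguments are snapshots of replay state; the names
// are illustrative):
#include <stdint.h>

static bool ready_to_attach(uint64_t next_frame_time, uint64_t target_event,
                            int tgid, int target_pid, bool has_execed,
                            bool require_exec, bool mid_event) {
  return next_frame_time > target_event &&      // reached the requested event
         (!target_pid || tgid == target_pid) && // and the requested process
         (!require_exec || has_execed) &&       // which has exec'd, if required
         !mid_event; // and we're at the start of processing an event
}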
target.pid = t->tgid(); target.require_exec = false; target.event = event_now; last_query_tuid = last_continue_tuid = t->tuid(); // Have the "checkpoint" be the original replay // session, and then switch over to using the cloned // session. The cloned tasks will look like children // of the clonees, so this scheme prevents |pstree| // output from getting /too/ far out of whack. const char* where = "???"; if (timeline.can_add_checkpoint()) { debugger_restart_checkpoint = Checkpoint(timeline, last_continue_tuid, Checkpoint::EXPLICIT, where); } else { debugger_restart_checkpoint = Checkpoint(timeline, last_continue_tuid, Checkpoint::NOT_EXPLICIT, where); } } void GdbServer::restart_session(const GdbRequest& req) { DEBUG_ASSERT(req.type == DREQ_RESTART); DEBUG_ASSERT(dbg); in_debuggee_end_state = false; timeline.remove_breakpoints_and_watchpoints(); Checkpoint checkpoint_to_restore; if (req.restart().type == RESTART_FROM_CHECKPOINT) { auto it = checkpoints.find(req.restart().param); if (it == checkpoints.end()) { cout << "Checkpoint " << req.restart().param_str << " not found.\n"; cout << "Valid checkpoints:"; for (auto& c : checkpoints) { cout << " " << c.first; } cout << "\n"; dbg->notify_restart_failed(); return; } checkpoint_to_restore = it->second; } else if (req.restart().type == RESTART_FROM_PREVIOUS) { checkpoint_to_restore = debugger_restart_checkpoint; } else if (req.restart().type == RESTART_FROM_TICKS) { Ticks target = req.restart().param; ReplaySession &session = timeline.current_session(); Task* task = session.current_task(); FrameTime current_time = session.current_frame_time(); TraceReader tmp_reader(session.trace_reader()); FrameTime last_time = current_time; if (session.ticks_at_start_of_current_event() > target) { tmp_reader.rewind(); FrameTime task_time; // EXEC and CLONE reset the ticks counter. Find the first event // where the tuid matches our current task. // We'll always hit at least one CLONE/EXEC event for a task // (we can't debug the time before the initial exec) // but set this to 0 anyway to silence compiler warnings. 
FrameTime ticks_start_time = 0; while (true) { TraceTaskEvent r = tmp_reader.read_task_event(&task_time); if (task_time >= current_time) { break; } if (r.type() == TraceTaskEvent::CLONE || r.type() == TraceTaskEvent::EXEC) { if (r.tid() == task->tuid().tid()) { ticks_start_time = task_time; } } } // Forward the frame reader to the current event last_time = ticks_start_time; while (true) { TraceFrame frame = tmp_reader.read_frame(); if (frame.time() >= ticks_start_time) { break; } } } while (true) { if (tmp_reader.at_end()) { cout << "No event found matching specified ticks target."; dbg->notify_restart_failed(); return; } TraceFrame frame = tmp_reader.read_frame(); if (frame.tid() == task->tuid().tid() && frame.ticks() > target) { break; } last_time = frame.time(); } timeline.seek_to_ticks(last_time, target); } interrupt_pending = true; if (checkpoint_to_restore.mark) { timeline.seek_to_mark(checkpoint_to_restore.mark); last_query_tuid = last_continue_tuid = checkpoint_to_restore.last_continue_tuid; if (debugger_restart_checkpoint.is_explicit == Checkpoint::EXPLICIT) { timeline.remove_explicit_checkpoint(debugger_restart_checkpoint.mark); } debugger_restart_checkpoint = checkpoint_to_restore; if (timeline.can_add_checkpoint()) { timeline.add_explicit_checkpoint(); } return; } stop_replaying_to_target = false; if (req.restart().type == RESTART_FROM_EVENT) { // Note that we don't reset the target pid; we intentionally keep targeting // the same process no matter what is running when we hit the event. target.event = req.restart().param; target.event = min(final_event - 1, target.event); timeline.seek_to_before_event(target.event); do { ReplayResult result = timeline.replay_step_forward(RUN_CONTINUE, target.event); // We should never reach the end of the trace without hitting the stop // condition below. DEBUG_ASSERT(result.status != REPLAY_EXITED); if (is_last_thread_exit(result.break_status) && result.break_status.task_context.thread_group->tgid == target.pid) { // Debuggee task is about to exit. Stop here. in_debuggee_end_state = true; break; } } while (!at_target()); } activate_debugger(); } static uint32_t get_cpu_features(SupportedArch arch) { uint32_t cpu_features; switch (arch) { case x86: case x86_64: { cpu_features = arch == x86_64 ? GdbConnection::CPU_X86_64 : 0; unsigned int AVX_cpuid_flags = AVX_FEATURE_FLAG | OSXSAVE_FEATURE_FLAG; auto cpuid_data = cpuid(CPUID_GETFEATURES, 0); // We're assuming here that AVX support on the system making the recording // is the same as the AVX support during replay. But if that's not true, // rr is totally broken anyway. if ((cpuid_data.ecx & AVX_cpuid_flags) == AVX_cpuid_flags) { cpu_features |= GdbConnection::CPU_AVX; } break; } case aarch64: cpu_features = GdbConnection::CPU_AARCH64; break; default: FATAL() << "Unknown architecture"; return 0; } return cpu_features; } struct DebuggerParams { char exe_image[PATH_MAX]; char host[16]; // INET_ADDRSTRLEN, omitted for header churn short port; }; static void push_default_gdb_options(vector& vec, bool serve_files = false) { // The gdb protocol uses the "vRun" packet to reload // remote targets. The packet is specified to be like // "vCont", in which gdb waits infinitely long for a // stop reply packet. But in practice, gdb client // expects the vRun to complete within the remote-reply // timeout, after which it issues vCont. The timeout // causes gdb<-->rr communication to go haywire. 
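// For the x86 case of get_cpu_features() above, the AVX probe amounts to
// this self-contained sketch: AVX is only usable when both the AVX and
// OSXSAVE feature bits of CPUID leaf 1 are set (ECX bits 28 and 27).
#include <cpuid.h>
#include <stdint.h>

static bool cpu_has_usable_avx() {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return false;
  }
  const uint32_t AVX = 1u << 28, OSXSAVE = 1u << 27;
  return (ecx & (AVX | OSXSAVE)) == (AVX | OSXSAVE);
}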
// // rr can take a very long time indeed to send the // stop-reply to gdb after restarting replay; the time // to reach a specified execution target is // theoretically unbounded. Timing out on vRun is // technically a gdb bug, but because the rr replay and // the gdb reload models don't quite match up, we'll // work around it on the rr side by disabling the // remote-reply timeout. vec.push_back("-l"); vec.push_back("10000"); if (!serve_files) { // For now, avoid requesting binary files through vFile. That is slow and // hard to make work correctly, because gdb requests files based on the // names it sees in memory and in ELF, and those names may be symlinks to // the filenames in the trace, so it's hard to match those names to files in // the trace. vec.push_back("-ex"); vec.push_back("set sysroot /"); } } static void push_target_remote_cmd(vector& vec, const string& host, unsigned short port) { vec.push_back("-ex"); stringstream ss; // If we omit the address, then gdb can try to resolve "localhost" which // in some broken environments may not actually resolve to the local host ss << "target extended-remote " << host << ":" << port; vec.push_back(ss.str()); } /** * Wait for exactly one gdb host to connect to this remote target on * the specified IP address |host|, port |port|. If |probe| is nonzero, * a unique port based on |start_port| will be searched for. Otherwise, * if |port| is already bound, this function will fail. * * Pass the |tgid| of the task on which this debug-connection request * is being made. The remaining debugging session will be limited to * traffic regarding |tgid|, but clients don't need to and shouldn't * need to assume that. * * If we're opening this connection on behalf of a known client, pass * an fd in |client_params_fd|; we'll write the allocated port and |exe_image| * through the fd before waiting for a connection. |exe_image| is the * process that will be debugged by client, or null ptr if there isn't * a client. * * This function is infallible: either it will return a valid * debugging context, or it won't return. */ static unique_ptr await_connection( Task* t, ScopedFd& listen_fd, const GdbConnection::Features& features) { auto dbg = unique_ptr(new GdbConnection(t->tgid(), features)); dbg->set_cpu_features(get_cpu_features(t->arch())); dbg->await_debugger(listen_fd); return dbg; } static void print_debugger_launch_command(Task* t, const string& host, unsigned short port, const char* debugger_name, FILE* out) { vector options; push_default_gdb_options(options); push_target_remote_cmd(options, host, port); fprintf(out, "%s ", debugger_name); for (auto& opt : options) { fprintf(out, "'%s' ", opt.c_str()); } fprintf(out, "%s\n", t->vm()->exe_image().c_str()); } void GdbServer::serve_replay(const ConnectionFlags& flags) { do { ReplayResult result = timeline.replay_step_forward(RUN_CONTINUE, target.event); if (result.status == REPLAY_EXITED) { LOG(info) << "Debugger was not launched before end of trace"; return; } } while (!at_target()); unsigned short port = flags.dbg_port > 0 ? flags.dbg_port : getpid(); // Don't probe if the user specified a port. Explicitly // selecting a port is usually done by scripts, which would // presumably break if a different port were to be selected by // rr (otherwise why would they specify a port in the first // place). So fail with a clearer error message. auto probe = flags.dbg_port > 0 ? 
DONT_PROBE : PROBE_PORT; Task* t = timeline.current_session().current_task(); ScopedFd listen_fd = open_socket(flags.dbg_host.c_str(), &port, probe); if (flags.debugger_params_write_pipe) { DebuggerParams params; memset(¶ms, 0, sizeof(params)); strncpy(params.exe_image, t->vm()->exe_image().c_str(), sizeof(params.exe_image) - 1); strncpy(params.host, flags.dbg_host.c_str(), sizeof(params.host) - 1); params.port = port; ssize_t nwritten = write(*flags.debugger_params_write_pipe, ¶ms, sizeof(params)); DEBUG_ASSERT(nwritten == sizeof(params)); } else { fputs("Launch gdb with\n ", stderr); print_debugger_launch_command(t, flags.dbg_host, port, flags.debugger_name.c_str(), stderr); } if (flags.debugger_params_write_pipe) { flags.debugger_params_write_pipe->close(); } debuggee_tguid = t->thread_group()->tguid(); FrameTime first_run_event = std::max(t->vm()->first_run_event(), t->thread_group()->first_run_event()); if (first_run_event) { timeline.set_reverse_execution_barrier_event(first_run_event); } do { LOG(debug) << "initializing debugger connection"; dbg = await_connection(t, listen_fd, GdbConnection::Features()); activate_debugger(); GdbRequest last_resume_request; while (debug_one_step(last_resume_request) == CONTINUE_DEBUGGING) { } timeline.remove_breakpoints_and_watchpoints(); } while (flags.keep_listening); LOG(debug) << "debugger server exiting ..."; } static string create_gdb_command_file(const string& macros) { TempFile file = create_temporary_file("rr-gdb-commands-XXXXXX"); // This fd is just leaked. That's fine since we only call this once // per rr invocation at the moment. int fd = file.fd.extract(); unlink(file.name.c_str()); ssize_t len = macros.size(); int written = write(fd, macros.c_str(), len); if (written != len) { FATAL() << "Failed to write gdb command file"; } stringstream procfile; procfile << "/proc/" << getpid() << "/fd/" << fd; return procfile.str(); } static string to_string(const vector& args) { stringstream ss; for (auto& a : args) { ss << "'" << a << "' "; } return ss.str(); } static bool needs_target(const string& option) { return !strncmp(option.c_str(), "continue", option.size()); } /** * Exec gdb using the params that were written to * |params_pipe_fd|. Optionally, pre-define in the gdb client the set * of macros defined in |macros| if nonnull. */ void GdbServer::launch_gdb(ScopedFd& params_pipe_fd, const string& gdb_binary_file_path, const vector& gdb_options, bool serve_files) { auto macros = gdb_rr_macros(); string gdb_command_file = create_gdb_command_file(macros); DebuggerParams params; ssize_t nread; while (true) { nread = read(params_pipe_fd, ¶ms, sizeof(params)); if (nread == 0) { // pipe was closed. Probably rr failed/died. 
return;
    }
    if (nread != -1 || errno != EINTR) {
      break;
    }
  }
  DEBUG_ASSERT(nread == sizeof(params));

  vector<string> args;
  args.push_back(gdb_binary_file_path);
  push_default_gdb_options(args, serve_files);
  args.push_back("-x");
  args.push_back(gdb_command_file);
  bool did_set_remote = false;
  for (size_t i = 0; i < gdb_options.size(); ++i) {
    if (!did_set_remote && gdb_options[i] == "-ex" &&
        i + 1 < gdb_options.size() && needs_target(gdb_options[i + 1])) {
      push_target_remote_cmd(args, string(params.host), params.port);
      did_set_remote = true;
    }
    args.push_back(gdb_options[i]);
  }
  if (!did_set_remote) {
    push_target_remote_cmd(args, string(params.host), params.port);
  }
  args.push_back(params.exe_image);

  vector<string> env = current_env();
  env.push_back("GDB_UNDER_RR=1");

  LOG(debug) << "launching " << to_string(args);

  StringVectorToCharArray c_args(args);
  StringVectorToCharArray c_env(env);
  execvpe(gdb_binary_file_path.c_str(), c_args.get(), c_env.get());
  CLEAN_FATAL() << "Failed to exec " << gdb_binary_file_path << ".";
}

void GdbServer::emergency_debug(Task* t) {
  // See the comment in |guard_overshoot()| explaining why we do
  // this. Unlike in that context though, we don't know if |t|
  // overshot an internal breakpoint. If it did, cover that
  // breakpoint up.
  if (t->vm()) {
    t->vm()->remove_all_breakpoints();
  }

  // Don't launch a debugger on fatal errors; the user is most
  // likely already in a debugger, and wouldn't be able to
  // control another session. Instead, launch a new GdbServer and wait for
  // the user to connect from another window.
  GdbConnection::Features features;
  // Don't advertise reverse_execution to gdb because a) it won't work and
  // b) some gdb versions will fail if the user doesn't turn off async
  // mode (and we don't want to require users to do that).
  features.reverse_execution = false;
  unsigned short port = t->tid;
  ScopedFd listen_fd = open_socket(localhost_addr.c_str(), &port, PROBE_PORT);

  char* test_monitor_pid = getenv("RUNNING_UNDER_TEST_MONITOR");
  if (test_monitor_pid) {
    pid_t pid = atoi(test_monitor_pid);
    // Tell test-monitor to wake up and take a snapshot. It will also
    // connect the emergency debugger so let that happen.
    FILE* gdb_cmd = fopen("gdb_cmd", "w");
    if (gdb_cmd) {
      print_debugger_launch_command(t, localhost_addr, port, "gdb", gdb_cmd);
      fclose(gdb_cmd);
    }
    kill(pid, SIGURG);
  } else {
    dump_rr_stack();
    fputs("Launch gdb with\n ", stderr);
    print_debugger_launch_command(t, localhost_addr, port, "gdb", stderr);
  }
  unique_ptr<GdbConnection> dbg = await_connection(t, listen_fd, features);
  GdbServer(dbg, t).process_debugger_requests();
}

string GdbServer::init_script() { return gdb_rr_macros(); }

static ScopedFd generate_fake_proc_maps(Task* t) {
  TempFile file = create_temporary_file("rr-fake-proc-maps-XXXXXX");
  unlink(file.name.c_str());
  int fd = dup(file.fd);
  if (fd < 0) {
    FATAL() << "Cannot dup";
  }
  FILE* f = fdopen(fd, "w");

  int addr_min_width = word_size(t->arch()) == 8 ? 10 : 8;
  for (AddressSpace::Maps::iterator it = t->vm()->maps().begin();
       it != t->vm()->maps().end(); ++it) {
    // If this is the mapping just before the rr page and it's still
    // librrpage, merge this mapping with the subsequent one. We'd like gdb
    // to treat librrpage as the vdso, but it'll only do so if the entire
    // vdso is one mapping.
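// Standalone sketch of the unlink-then-/proc trick create_gdb_command_file()
// uses above: delete the temp file immediately, keep the data alive through
// the open fd, and hand gdb the /proc/<pid>/fd/<fd> path. The helper below
// is illustrative, not rr code.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sstream>
#include <string>

static std::string write_ephemeral_file(const std::string& contents) {
  char name[] = "/tmp/ephemeral-XXXXXX";
  int fd = mkstemp(name);
  if (fd < 0) {
    return std::string();
  }
  unlink(name); // no directory entry remains; the fd still pins the data
  if (write(fd, contents.data(), contents.size()) !=
      (ssize_t)contents.size()) {
    close(fd);
    return std::string();
  }
  std::ostringstream path; // the fd is deliberately leaked, as above
  path << "/proc/" << getpid() << "/fd/" << fd;
  return path.str();
}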
auto m = *it; uintptr_t map_end = (long long)m.recorded_map.end().as_int(); if (m.recorded_map.end() == t->vm()->rr_page_start()) { auto it2 = it; if (++it2 != t->vm()->maps().end()) { auto m2 = *it2; if (m2.flags & AddressSpace::Mapping::IS_RR_PAGE) { // Extend this mapping map_end += t->vm()->rr_page_size(); // Skip the rr page ++it; } } } int len = fprintf(f, "%0*llx-%0*llx %s%s%s%s %08llx %02x:%02x %lld", addr_min_width, (long long)m.recorded_map.start().as_int(), addr_min_width, (long long)map_end, (m.recorded_map.prot() & PROT_READ) ? "r" : "-", (m.recorded_map.prot() & PROT_WRITE) ? "w" : "-", (m.recorded_map.prot() & PROT_EXEC) ? "x" : "-", (m.recorded_map.flags() & MAP_SHARED) ? "s" : "p", (long long)m.recorded_map.file_offset_bytes(), major(m.recorded_map.device()), minor(m.recorded_map.device()), (long long)m.recorded_map.inode()); while (len < 72) { fputc(' ', f); ++len; } fputc(' ', f); string name; const string& fsname = m.recorded_map.fsname(); for (size_t i = 0; i < fsname.size(); ++i) { if (fsname[i] == '\n') { name.append("\\012"); } else { name.push_back(fsname[i]); } } fputs(name.c_str(), f); fputc('\n', f); } if (ferror(f) || fclose(f)) { FATAL() << "Can't write"; } return move(file.fd); } static bool is_ld_mapping(string map_name) { char ld_start[] = "ld-"; size_t matchpos = map_name.find_last_of('/'); string fname = map_name.substr(matchpos == string::npos ? 0 : matchpos + 1); return memcmp(fname.c_str(), ld_start, sizeof(ld_start)-1) == 0; } static bool is_likely_interp(string fsname) { return fsname == "/lib64/ld-linux-x86-64.so.2" || fsname == "/lib/ld-linux.so.2"; } static remote_ptr base_addr_from_rendezvous(Task* t, string fname) { remote_ptr interpreter_base = t->vm()->saved_interpreter_base(); if (!interpreter_base || !t->vm()->has_mapping(interpreter_base)) { return nullptr; } string ld_path = t->vm()->saved_ld_path(); if (ld_path.length() == 0) { FATAL() << "Failed to retrieve interpreter name with interpreter_base=" << interpreter_base; } ScopedFd ld(ld_path.c_str(), O_RDONLY); if (ld < 0) { FATAL() << "Open failed: " << ld_path; } ElfFileReader reader(ld); auto syms = reader.read_symbols(".dynsym", ".dynstr"); static const char r_debug[] = "_r_debug"; bool found = false; uintptr_t r_debug_offset = 0; for (size_t i = 0; i < syms.size(); ++i) { if (!syms.is_name(i, r_debug)) { continue; } r_debug_offset = syms.addr(i); found = true; } if (!found) { return nullptr; } bool ok = true; remote_ptr r_debug_remote = interpreter_base.as_int()+r_debug_offset; remote_ptr link_map = t->read_mem(REMOTE_PTR_FIELD(r_debug_remote, r_map), &ok); while (ok && link_map != nullptr) { if (fname == t->read_c_str(t->read_mem(REMOTE_PTR_FIELD(link_map, l_name), &ok), &ok)) { remote_ptr result = t->read_mem(REMOTE_PTR_FIELD(link_map, l_addr), &ok); return ok ? result : nullptr; } link_map = t->read_mem(REMOTE_PTR_FIELD(link_map, l_next), &ok); } return nullptr; } int GdbServer::open_file(Session& session, Task* continue_task, const std::string& file_name) { // XXX should we require file_scope_pid == 0 here? 
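// Minimal in-process sketch of the rendezvous walk base_addr_from_rendezvous()
// performs above, using glibc's <link.h> structures directly (rr instead has
// to chase the same r_debug/link_map chain through tracee memory with
// read_mem): follow l_next, comparing l_name, until the object of interest.
#include <string.h>
#include <link.h>

static const struct link_map* find_loaded_object(const char* name) {
  for (const struct link_map* m = _r_debug.r_map; m; m = m->l_next) {
    if (m->l_name && strcmp(m->l_name, name) == 0) {
      return m; // m->l_addr is the load base
    }
  }
  return nullptr;
}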
ScopedFd contents;
  LOG(debug) << "Trying to open " << file_name;
  if (file_name.substr(0, 6) == "/proc/") {
    char* tid_end;
    long tid = strtol(file_name.c_str() + 6, &tid_end, 10);
    if (*tid_end != '/') {
      return -1;
    }
    if (!strncmp(tid_end, "/task/", 6)) {
      tid = strtol(tid_end + 6, &tid_end, 10);
      if (*tid_end != '/') {
        return -1;
      }
    }
    if (tid != (pid_t)tid) {
      return -1;
    }
    Task* t = session.find_task(tid);
    if (!t) {
      return -1;
    }
    if (!strcmp(tid_end, "/maps")) {
      contents = generate_fake_proc_maps(t);
    } else {
      return -1;
    }
  } else {
    // See if we can find the file by serving one of our mappings
    std::string normalized_file_name = file_name;
    normalize_file_name(normalized_file_name);
    for (const auto& m : continue_task->vm()->maps()) {
      // The dynamic linker is generally a symlink that is resolved by the
      // kernel when the process image gets loaded. We add a special case to
      // substitute the correct mapping, so gdb can find the dynamic linker
      // rendezvous structures.
      // XXX: These don't tend to vary across systems, so hardcoding them
      // here works ok, but it'd be better to just read INTERP from the main
      // executable and record which is the corresponding file.
      if (m.recorded_map.fsname().compare(0, normalized_file_name.length(),
                                          normalized_file_name) == 0 ||
          (is_ld_mapping(m.recorded_map.fsname()) &&
           is_likely_interp(normalized_file_name))) {
        int ret_fd = 0;
        while (files.find(ret_fd) != files.end() ||
               memory_files.find(ret_fd) != memory_files.end()) {
          ++ret_fd;
        }
        LOG(debug) << "Found as memory mapping " << m.recorded_map;
        memory_files.insert(make_pair(ret_fd, FileId(m.recorded_map)));
        return ret_fd;
      }
    }
    // Last-ditch attempt: dig through the tracee's libc rendezvous struct
    // to see if we can find this file by a different name (e.g. if it was
    // opened via symlink).
    remote_ptr<void> base = base_addr_from_rendezvous(continue_task, file_name);
    if (base != nullptr && continue_task->vm()->has_mapping(base)) {
      int ret_fd = 0;
      while (files.find(ret_fd) != files.end() ||
             memory_files.find(ret_fd) != memory_files.end()) {
        ++ret_fd;
      }
      memory_files.insert(make_pair(
          ret_fd, FileId(continue_task->vm()->mapping_of(base).recorded_map)));
      return ret_fd;
    }
    LOG(debug) << "...
not found"; return -1; } int ret_fd = 0; while (files.find(ret_fd) != files.end()) { ++ret_fd; } files.insert(make_pair(ret_fd, move(contents))); return ret_fd; } } // namespace rr rr-5.5.0/src/GdbServer.h000066400000000000000000000245201412202446200147670ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_GDB_SERVER_H_ #define RR_GDB_SERVER_H_ #include #include #include #include "DiversionSession.h" #include "GdbConnection.h" #include "ReplaySession.h" #include "ReplayTimeline.h" #include "ScopedFd.h" #include "ThreadDb.h" #include "TraceFrame.h" namespace rr { static std::string localhost_addr = "127.0.0.1"; class GdbServer { // Not ideal but we can't inherit friend from GdbCommand friend std::string invoke_checkpoint(GdbServer&, Task*, const std::vector&); friend std::string invoke_delete_checkpoint(GdbServer&, Task*, const std::vector&); friend std::string invoke_info_checkpoints(GdbServer&, Task*, const std::vector&); public: struct Target { Target() : pid(0), require_exec(false), event(0) {} // Target process to debug, or 0 to just debug the first process pid_t pid; // If true, wait for the target process to exec() before attaching debugger bool require_exec; // Wait until at least 'event' has elapsed before attaching FrameTime event; }; struct ConnectionFlags { // -1 to let GdbServer choose the port, a positive integer to select a // specific port to listen on. If keep_listening is on, wait for another // debugger connection after the first one is terminated. int dbg_port; std::string dbg_host; bool keep_listening; // If non-null, then when the gdbserver is set up, we write its connection // parameters through this pipe. GdbServer::launch_gdb is passed the // other end of this pipe to exec gdb with the parameters. ScopedFd* debugger_params_write_pipe; // Name of the debugger to suggest. Only used if debugger_params_write_pipe // is null. std::string debugger_name; ConnectionFlags() : dbg_port(-1), dbg_host(localhost_addr), keep_listening(false), debugger_params_write_pipe(nullptr) {} }; /** * Create a gdbserver serving the replay of 'session'. */ GdbServer(std::shared_ptr session, const Target& target) : target(target), final_event(UINT32_MAX), in_debuggee_end_state(false), stop_replaying_to_target(false), interrupt_pending(false), timeline(std::move(session)), emergency_debug_session(nullptr) { memset(&stop_siginfo, 0, sizeof(stop_siginfo)); } /** * Actually run the server. Returns only when the debugger disconnects. */ void serve_replay(const ConnectionFlags& flags); /** * exec()'s gdb using parameters read from params_pipe_fd (and sent through * the pipe passed to serve_replay_with_debugger). */ static void launch_gdb(ScopedFd& params_pipe_fd, const std::string& gdb_binary_file_path, const std::vector& gdb_options, bool serve_files); /** * Start a debugging connection for |t| and return when there are no * more requests to process (usually because the debugger detaches). * * This helper doesn't attempt to determine whether blocking rr on a * debugger connection might be a bad idea. It will always open the debug * socket and block awaiting a connection. */ static void emergency_debug(Task* t); /** * A string containing the default gdbinit script that we load into gdb. */ static std::string init_script(); /** * Called from a signal handler (or other thread) during serve_replay, * this will cause the replay-to-target phase to be interrupted and * debugging started wherever the replay happens to be. 
*/ void interrupt_replay_to_target() { stop_replaying_to_target = true; } /** * Return the register |which|, which may not have a defined value. */ static GdbRegisterValue get_reg(const Registers& regs, const ExtraRegisters& extra_regs, GdbRegister which); ReplayTimeline& get_timeline() { return timeline; } private: GdbServer(std::unique_ptr& dbg, Task* t); Session& current_session() { return timeline.is_running() ? timeline.current_session() : *emergency_debug_session; } void dispatch_regs_request(const Registers& regs, const ExtraRegisters& extra_regs); enum ReportState { REPORT_NORMAL, REPORT_THREADS_DEAD }; void maybe_intercept_mem_request(Task* target, const GdbRequest& req, std::vector* result); /** * Process the single debugger request |req| inside the session |session|. * * Callers should implement any special semantics they want for * particular debugger requests before calling this helper, to do * generic processing. */ void dispatch_debugger_request(Session& session, const GdbRequest& req, ReportState state); bool at_target(); void activate_debugger(); void restart_session(const GdbRequest& req); GdbRequest process_debugger_requests(ReportState state = REPORT_NORMAL); enum ContinueOrStop { CONTINUE_DEBUGGING, STOP_DEBUGGING }; bool detach_or_restart(const GdbRequest& req, ContinueOrStop* s); ContinueOrStop handle_exited_state(GdbRequest& last_resume_request); ContinueOrStop debug_one_step(GdbRequest& last_resume_request); /** * If 'req' is a reverse-singlestep, try to obtain the resulting state * directly from ReplayTimeline's mark database. If that succeeds, * report the singlestep break status to gdb and process any get-registers * requests. Repeat until we get a request that isn't reverse-singlestep * or get-registers, returning that request in 'req'. * During reverse-next commands, gdb tends to issue a series of * reverse-singlestep/get-registers pairs, and this makes those much * more efficient by avoiding having to actually reverse-singlestep the * session. */ void try_lazy_reverse_singlesteps(GdbRequest& req); /** * Process debugger requests made in |diversion_session| until action needs * to be taken by the caller (a resume-execution request is received). * The received request is returned through |req|. * Returns true if diversion should continue, false if it should end. */ bool diverter_process_debugger_requests(DiversionSession& diversion_session, uint32_t& diversion_refcount, GdbRequest* req); /** * Create a new diversion session using |replay| session as the * template. The |replay| session isn't mutated. * * Execution begins in the new diversion session under the control of * |dbg| starting with initial thread target |task|. The diversion * session ends at the request of |dbg|, and |divert| returns the first * request made that wasn't handled by the diversion session. That * is, the first request that should be handled by |replay| upon * resuming execution in that session. */ GdbRequest divert(ReplaySession& replay); /** * If |break_status| indicates a stop that we should report to gdb, * report it. |req| is the resume request that generated the stop. */ void maybe_notify_stop(const GdbRequest& req, const BreakStatus& break_status); /** * Return the checkpoint stored as |checkpoint_id| or nullptr if there * isn't one. */ ReplaySession::shr_ptr get_checkpoint(int checkpoint_id); /** * Delete the checkpoint stored as |checkpoint_id| if it exists, or do * nothing if it doesn't exist. */ void delete_checkpoint(int checkpoint_id); /** * Handle GDB file open requests. 
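   * These requests arrive over gdb's vFile remote-protocol packets. A hedged
   * sketch of a typical exchange (filenames are hex-encoded; replies are
   * "F <result>"):
   *
   *   vFile:setfs:0                  -> F 0
   *   vFile:open:2f70726f632f...,0,0 -> F 3        (our internal fd)
   *   vFile:pread:3,4096,0           -> F <n>;<data>
   *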
If we can serve this read request, add * an entry to `files` with the file contents and return our internal * file descriptor. */ int open_file(Session& session, Task *continue_task, const std::string& file_name); Target target; // dbg is initially null. Once the debugger connection is established, it // never changes. std::unique_ptr dbg; // When dbg is non-null, the ThreadGroupUid of the task being debugged. Never // changes once the connection is established --- we don't currently // support switching gdb between debuggee processes. ThreadGroupUid debuggee_tguid; // ThreadDb for debuggee ThreadGroup std::unique_ptr thread_db; // The TaskUid of the last continued task. TaskUid last_continue_tuid; // The TaskUid of the last queried task. TaskUid last_query_tuid; FrameTime final_event; // siginfo for last notified stop. siginfo_t stop_siginfo; bool in_debuggee_end_state; // True when the user has interrupted replaying to a target event. volatile bool stop_replaying_to_target; // True when a DREQ_INTERRUPT has been received but not handled, or when // we've restarted and want the first continue to be interrupted immediately. bool interrupt_pending; ReplayTimeline timeline; Session* emergency_debug_session; struct Checkpoint { enum Explicit { EXPLICIT, NOT_EXPLICIT }; Checkpoint(ReplayTimeline& timeline, TaskUid last_continue_tuid, Explicit e, const std::string& where) : last_continue_tuid(last_continue_tuid), is_explicit(e), where(where) { if (e == EXPLICIT) { mark = timeline.add_explicit_checkpoint(); } else { mark = timeline.mark(); } } Checkpoint() : is_explicit(NOT_EXPLICIT) {} ReplayTimeline::Mark mark; TaskUid last_continue_tuid; Explicit is_explicit; std::string where; }; // |debugger_restart_mark| is the point where we will restart from with // a no-op debugger "run" command. Checkpoint debugger_restart_checkpoint; // gdb checkpoints, indexed by ID std::map checkpoints; // Set of symbols to look up, for qSymbol. std::set symbols; // Iterator into |symbols|. std::set::iterator symbols_iter; // Contents of opened files. Maps our internal file descriptor to a real // file descriptor. Exposing our real file descriptor values is probably a // bad idea. std::map files; std::map memory_files; // The pid for gdb's last vFile:setfs pid_t file_scope_pid; }; } // namespace rr #endif /* RR_GDB_SERVER_H_ */ rr-5.5.0/src/HasTaskSet.cc000066400000000000000000000011351412202446200152510ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "HasTaskSet.h" #include "Task.h" #include "log.h" namespace rr { void HasTaskSet::insert_task(Task* t) { LOG(debug) << "adding " << t->tid << " to task set " << this; tasks.insert(t); } void HasTaskSet::erase_task(Task* t) { LOG(debug) << "removing " << t->tid << " from task set " << this; tasks.erase(t); } Task* HasTaskSet::first_running_task() const { for (auto t : task_set()) { if (!t->already_exited() && !t->is_dying()) { return t; } } return nullptr; } } // namespace rr rr-5.5.0/src/HasTaskSet.h000066400000000000000000000011331412202446200151110ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_HASTASKSET_H_ #define RR_HASTASKSET_H_ #include namespace rr { class Task; /** * Base class for classes that manage a set of Tasks. 
*/ class HasTaskSet { public: typedef std::set<Task*> TaskSet; const TaskSet& task_set() const { return tasks; } void insert_task(Task* t); void erase_task(Task* t); bool has_task(Task* t) const { return tasks.find(t) != tasks.end(); } Task* first_running_task() const; protected: TaskSet tasks; }; } // namespace rr #endif /* RR_HASTASKSET_H_ */ rr-5.5.0/src/HelpCommand.cc000066400000000000000000000016171412202446200154330ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Command.h" #include "main.h" using namespace std; namespace rr { class HelpCommand : public Command { public: virtual int run(std::vector<std::string>& args) override; protected: HelpCommand(const char* name, const char* help) : Command(name, help) {} static HelpCommand help1; static HelpCommand help2; static HelpCommand help3; }; HelpCommand HelpCommand::help1("help", " rr help [command]\n"); HelpCommand HelpCommand::help2("-h", nullptr); HelpCommand HelpCommand::help3("--help", nullptr); int HelpCommand::run(std::vector<std::string>& args) { if (args.size() == 0) { print_usage(stdout); return 0; } Command* command = Command::command_for_name(args[0]); if (!command) { print_usage(stderr); return 1; } command->print_help(stdout); return 0; } } // namespace rr rr-5.5.0/src/LsCommand.cc000066400000000000000000000141711412202446200151200ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include <algorithm> #include <dirent.h> #include <iomanip> #include <iostream> #include <numeric> #include <sstream> #include <sys/stat.h> #include <sys/types.h> #include "Command.h" #include "main.h" #include "TraceStream.h" #include "util.h" using namespace std; namespace rr { class LsCommand : public Command { public: virtual int run(vector<string>& args); protected: LsCommand(const char* name, const char* help) : Command(name, help) {} static LsCommand singleton; }; LsCommand LsCommand::singleton( "ls", " rr ls [OPTION]...\n" " -l, --long-listing use a long listing format\n" " (trace name | start time | size | command line)\n" " -t, --sort-by-age, sort from newest to oldest\n" " -r, --reverse, reverse the sort order\n"); struct LsFlags { bool reverse; bool full_listing; bool sort_by_time; LsFlags() : reverse(false), full_listing(false), sort_by_time(false) {} }; static bool parse_ls_arg(vector<string>& args, LsFlags& flags) { if (parse_global_option(args)) { return true; } static const OptionSpec options[] = { { 'r', "reverse", NO_PARAMETER }, { 'l', "long-listing", NO_PARAMETER }, { 't', "sort-by-age", NO_PARAMETER } }; ParsedOption opt; if (!Command::parse_option(args, options, &opt)) { return false; } switch (opt.short_name) { case 'r': flags.reverse = true; break; case 'l': flags.full_listing = true; break; case 't': flags.sort_by_time = true; break; default: assert(0 && "Unknown option"); } return true; } struct TraceInfo { string name; struct timespec ctime; TraceInfo(string in_name) : name(in_name) {} }; static bool compare_by_name(const TraceInfo& at, const TraceInfo& bt) { auto a = at.name; auto b = bt.name; return lexicographical_compare(begin(a), end(a), begin(b), end(b)); } static bool get_folder_size(string dir_name, string& size_str) { DIR* dir = opendir(dir_name.c_str()); if (!dir) { cerr << "Cannot open " << dir_name << endl; return false; } size_t bytes = 0; while (struct dirent* ent = readdir(dir)) { string path = dir_name + "/" + ent->d_name; struct stat st; if (stat(path.c_str(), &st) == -1) { cerr << "stat " << path << " failed\n"; return false; } bytes += st.st_size; } closedir(dir); static const char suffixes[] = " KMGT"; double size = 
bytes; size_t suffix_idx = 0; while (size >= 1000.0) { size /= 1024.0; suffix_idx++; } char suffix = suffixes[suffix_idx]; ostringstream cvt; if (suffix == ' ') { cvt << bytes; } else if (size >= 10) { cvt << int(size) << suffix; } else { cvt << fixed << setprecision(1) << size << suffix; } size_str = cvt.str(); return true; } static bool is_valid_trace(const string& entry) { if (entry[0] == '.' || entry[0] == '#') { return false; } if (entry[entry.length() - 1] == '~') { return false; } return true; } static string get_exec_path(TraceReader& reader) { while (true) { TraceTaskEvent r = reader.read_task_event(); if (r.type() == TraceTaskEvent::NONE) { break; } if (r.type() == TraceTaskEvent::EXEC) { return r.cmd_line()[0]; } } return string(); } static int ls(const string& traces_dir, const LsFlags& flags, FILE* out) { DIR* dir = opendir(traces_dir.c_str()); if (!dir) { fprintf(stderr, "Cannot open %s", traces_dir.c_str()); return 1; } vector traces; while (struct dirent* trace_dir = readdir(dir)) { if (!is_valid_trace(trace_dir->d_name)) { continue; } traces.emplace_back(TraceInfo(string(trace_dir->d_name))); if (flags.sort_by_time || flags.full_listing) { struct stat st; stat((traces_dir + "/" + trace_dir->d_name + "/data").c_str(), &st); traces.back().ctime = st.st_ctim; } } closedir(dir); if (flags.sort_by_time) { auto compare_by_time = [&](const TraceInfo& at, const TraceInfo& bt) -> bool { if (at.ctime.tv_sec == bt.ctime.tv_sec) { return at.ctime.tv_nsec < bt.ctime.tv_nsec; } return at.ctime.tv_sec < bt.ctime.tv_sec; }; sort(traces.begin(), traces.end(), compare_by_time); } else { sort(traces.begin(), traces.end(), compare_by_name); } if (flags.reverse) { reverse(begin(traces), end(traces)); }; if (!flags.full_listing) { for (TraceInfo& t : traces) { cout << t.name << "\n"; } return 0; } int max_name_size = accumulate(traces.begin(), traces.end(), 0, [](int m, TraceInfo& t) { return max(m, static_cast(t.name.length())); }); fprintf(out, "%-*s %-19s %5s %s\n", max_name_size, "NAME", "WHEN", "SIZE", "CMD"); for (TraceInfo& t : traces) { // Record date & runtime estimates string data_file = traces_dir + "/" + t.name + "/data"; char outstr[200]; struct tm ctime_tm; if (localtime_r(&t.ctime.tv_sec, &ctime_tm)) { strftime(outstr, sizeof(outstr), "%F %T", &ctime_tm); } else { strcpy(outstr, ""); } string folder_size = "????"; string exe = "(incomplete)"; string version_file = traces_dir + "/" + t.name + "/version"; struct stat st; if (stat(version_file.c_str(), &st) != -1) { TraceReader reader(traces_dir + "/" + t.name); get_folder_size(reader.dir(), folder_size); exe = get_exec_path(reader); } fprintf(out, "%-*s %s %5s %s\n", max_name_size, t.name.c_str(), outstr, folder_size.c_str(), exe.c_str()); } return 0; } int LsCommand::run(vector& args) { bool found_dir = false; string trace_dir; LsFlags flags; while (!args.empty()) { if (parse_ls_arg(args, flags)) { continue; } if (!found_dir && parse_optional_trace_dir(args, &trace_dir)) { found_dir = true; continue; } print_help(stderr); return 1; }; if (!found_dir) { trace_dir = trace_save_dir(); } return ls(trace_dir, flags, stdout); }; } // namespace rr rr-5.5.0/src/MagicSaveDataMonitor.cc000066400000000000000000000052641412202446200172470ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "MagicSaveDataMonitor.h" #include #include #include "RecordTask.h" #include "ReplayTask.h" #include "Session.h" #include "log.h" #include "util.h" namespace rr { static void 
dump_path_data(Task* t, FrameTime global_time, const char* tag, char* filename, size_t filename_size, const void* buf, size_t buf_len, remote_ptr addr) { format_dump_filename(t, global_time, tag, filename, filename_size); dump_binary_data(filename, tag, (const uint32_t*)buf, buf_len / 4, addr); } static void notify_save_data_error(ReplayTask* t, remote_ptr addr, const void* rec_buf, size_t rec_buf_len, const void* rep_buf, size_t rep_buf_len) { char rec_dump[PATH_MAX]; char rep_dump[PATH_MAX]; FrameTime global_time = t->current_trace_frame().time(); dump_path_data(t, global_time, "rec_save_data", rec_dump, sizeof(rec_dump), rec_buf, rec_buf_len, addr); dump_path_data(t, global_time, "rep_save_data", rep_dump, sizeof(rep_dump), rep_buf, rep_buf_len, addr); ASSERT(t, (rec_buf_len == rep_buf_len && !memcmp(rec_buf, rep_buf, rec_buf_len))) << "Divergence in contents of 'tracee-save buffer'. Recording executed\n" "\n" " write(" << RR_MAGIC_SAVE_DATA_FD << ", " << addr << ", " << rec_buf_len << ")\n" "\n" "and replay executed\n" "\n" " write(" << RR_MAGIC_SAVE_DATA_FD << ", " << addr << ", " << rep_buf_len << ")\n" "\n" "The contents of the tracee-save buffers have been dumped to disk.\n" "Compare them by using the following command\n" "\n" "$ diff -u " << rec_dump << " " << rep_dump << " >save-data-diverge.diff\n"; } void MagicSaveDataMonitor::did_write(Task* t, const std::vector& ranges, LazyOffset&) { for (auto& r : ranges) { if (t->session().is_recording()) { static_cast(t)->record_remote(r.data.cast(), r.length); } else if (t->session().is_replaying()) { auto rt = static_cast(t); auto bytes = rt->read_mem(r.data.cast(), r.length); auto rec = rt->trace_reader().read_raw_data(); if (rec.data != bytes) { notify_save_data_error(rt, rec.addr, rec.data.data(), rec.data.size(), bytes.data(), bytes.size()); } } } } } // namespace rr rr-5.5.0/src/MagicSaveDataMonitor.h000066400000000000000000000011321412202446200170770ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_MAGIC_SAVE_DATA_MONITOR_H_ #define RR_MAGIC_SAVE_DATA_MONITOR_H_ #include "FileMonitor.h" namespace rr { /** * A FileMonitor to track writes to RR_MAGIC_SAVE_DATA_FD. */ class MagicSaveDataMonitor : public FileMonitor { public: MagicSaveDataMonitor() {} virtual Type type() override { return MagicSaveData; } virtual void did_write(Task* t, const std::vector& ranges, LazyOffset& offset) override; }; } // namespace rr #endif /* RR_MAGIC_SAVE_DATA_MONITOR_H_ */ rr-5.5.0/src/MemoryRange.h000066400000000000000000000040441412202446200153300ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_MEMORY_RANGE_H_ #define RR_MEMORY_RANGE_H_ #include "core.h" #include "remote_ptr.h" namespace rr { /** * Range of memory addresses that can be used as a std::map key. */ class MemoryRange { public: MemoryRange() {} MemoryRange(remote_ptr addr, size_t num_bytes) : start_(addr), end_(addr + num_bytes) { DEBUG_ASSERT(start_ <= end_); } MemoryRange(remote_ptr addr, remote_ptr end) : start_(addr), end_(end) { DEBUG_ASSERT(start_ <= end); } MemoryRange(const MemoryRange&) = default; MemoryRange& operator=(const MemoryRange&) = default; bool operator==(const MemoryRange& o) const { return start_ == o.start_ && end_ == o.end_; } bool operator<(const MemoryRange& o) const { return start_ != o.start_ ? start_ < o.start_ : end_ < o.end_; } /** * Return true iff |o| is an address range fully contained by * this. 
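   * For example, [0x1000, 0x2000) contains [0x1800, 0x1f00) and itself,
   * but not [0x1800, 0x2800).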
*/ bool contains(const MemoryRange& o) const { return start_ <= o.start_ && o.end_ <= end_; } bool contains(remote_ptr p) const { return start_ <= p && p < end_; } bool intersects(const MemoryRange& other) const { remote_ptr s = std::max(start_, other.start_); remote_ptr e = std::min(end_, other.end_); return s < e; } MemoryRange intersect(const MemoryRange& other) const { remote_ptr s = std::max(start_, other.start_); remote_ptr e = std::min(end_, other.end_); return MemoryRange(s, std::max(s, e)); } remote_ptr start() const { return start_; } remote_ptr end() const { return end_; } size_t size() const { return end_ - start_; } // XXX DO NOT USE void update_start(remote_ptr s) const { const_cast(this)->start_ = s; } private: remote_ptr start_; remote_ptr end_; }; inline std::ostream& operator<<(std::ostream& o, const MemoryRange& m) { o << m.start() << "-" << m.end(); return o; } } // namespace rr #endif /* RR_MEMORY_RANGE_H_ */ rr-5.5.0/src/MmappedFileMonitor.cc000066400000000000000000000066141412202446200170010ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "MmappedFileMonitor.h" #include "RecordSession.h" #include "RecordTask.h" #include "ReplayTask.h" #include "log.h" using namespace std; namespace rr { MmappedFileMonitor::MmappedFileMonitor(Task* t, int fd) { ASSERT(t, !t->session().is_replaying()); dead_ = false; auto stat = t->stat_fd(fd); device_ = stat.st_dev; inode_ = stat.st_ino; } MmappedFileMonitor::MmappedFileMonitor(Task* t, EmuFile::shr_ptr f) { ASSERT(t, t->session().is_replaying()); dead_ = false; device_ = f->device(); inode_ = f->inode(); } void MmappedFileMonitor::did_write(Task* t, const std::vector& ranges, LazyOffset& offset) { // If there are no remaining mappings that we care about, those can't reappear // without going through mmap again, at which point this will be reset to // false. if (dead_) { return; } if (ranges.empty()) { return; } // Dead until proven otherwise dead_ = true; int64_t realized_offset = 0; bool is_replay = t->session().is_replaying(); for (auto v : t->session().vms()) { for (const auto& m : v->maps()) { auto km = m.map; if (is_replay) { if (!m.emu_file || m.emu_file->device() != device_ || m.emu_file->inode() != inode_) { continue; } } else { if (km.device() != device_ || km.inode() != inode_) { continue; } // If the mapping is MAP_PRIVATE then this write is dangerous // because it's unpredictable what will be seen in the mapping. // However, it could be OK if the application doesn't read from // this part of the mapping. Just optimistically assume this mapping // is not affected. if (!(km.flags() & MAP_SHARED)) { LOG(warn) << "MAP_PRIVATE mapping affected by write"; continue; } } // We're discovering a mapping we care about if (dead_) { dead_ = false; realized_offset = offset.retrieve(true); } // stat matches. uint64_t mapping_offset = km.file_offset_bytes(); int64_t local_offset = realized_offset; for (auto r : ranges) { remote_ptr start = km.start() + local_offset - mapping_offset; MemoryRange mr(start, r.length); if (km.intersects(mr)) { if (is_replay) { // If we're writing beyond the EmuFile's end, resize it. m.emu_file->ensure_size(local_offset + r.length); } else { ASSERT(t, !v->task_set().empty()); // We will record multiple writes if the file is mapped multiple // times. This is inefficient --- one is sufficient --- but not // wrong. // Make sure we use a task for this address space. `t` might have // a different address space. 
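          // (Worked example of the range computation earlier in this loop: a
          // 2-byte write at file offset local_offset == 0x3000 into a segment
          // with file_offset_bytes() == 0x2000 mapped at 0x7f0000001000 gives
          //   start = 0x7f0000001000 + 0x3000 - 0x2000 = 0x7f0000002000,
          // and MemoryRange(start, 2) is what we intersect with km.)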
for (auto tt : v->task_set()) { // If the task here has execed, we may not be able to record its // memory any longer, so loop through all tasks in this address // space in turn in case any *didn't* exec. if (!tt->already_exited() && static_cast(tt)->record_remote_fallible(km.intersect(mr)) > 0) { break; } } } } local_offset += r.length; } } } } } // namespace rr rr-5.5.0/src/MmappedFileMonitor.h000066400000000000000000000022651412202446200166410ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_MMAPPED_FILE_MONITOR_H_ #define RR_MMAPPED_FILE_MONITOR_H_ #include "EmuFs.h" #include "FileMonitor.h" #include namespace rr { /** * A FileMonitor to track writes to files that are mmapped in so they can be * replayed. */ class MmappedFileMonitor : public FileMonitor { public: MmappedFileMonitor(Task* t, int fd); MmappedFileMonitor(Task* t, EmuFile::shr_ptr f); virtual Type type() override { return Mmapped; } void revive() { dead_ = false; } // If this write could potentially affect memory we need to PREVENT_SWITCH, // since the timing of the write is otherwise unpredictable from our // perspective. virtual Switchable will_write(Task*) override { return dead_ ? ALLOW_SWITCH : PREVENT_SWITCH; } /** * During recording, note writes to mapped segments. */ virtual void did_write(Task* t, const std::vector& ranges, LazyOffset& offset) override; private: // Whether this monitor is still actively monitoring bool dead_; dev_t device_; ino_t inode_; }; } // namespace rr #endif /* RR_MMAPPED_FILE_MONITOR_H_ */ rr-5.5.0/src/MonitoredSharedMemory.cc000066400000000000000000000050651412202446200175250ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "MonitoredSharedMemory.h" #include #include "AddressSpace.h" #include "AutoRemoteSyscalls.h" #include "RecordTask.h" #include "Session.h" #include "core.h" #include "log.h" using namespace std; namespace rr { MonitoredSharedMemory::~MonitoredSharedMemory() { munmap(real_mem, size); } static const char dconf_suffix[] = "/dconf/user"; void MonitoredSharedMemory::maybe_monitor(RecordTask* t, const string& file_name, const AddressSpace::Mapping& m, int tracee_fd, uint64_t offset) { size_t dconf_suffix_len = sizeof(dconf_suffix) - 1; if (file_name.size() < dconf_suffix_len || file_name.substr(file_name.size() - dconf_suffix_len) != dconf_suffix) { return; } AutoRemoteSyscalls remote(t); ScopedFd fd = remote.retrieve_fd(tracee_fd); uint8_t* real_mem = static_cast( mmap(NULL, m.map.size(), PROT_READ, MAP_SHARED, fd, offset)); ASSERT(t, real_mem != MAP_FAILED); auto result = shared_ptr( new MonitoredSharedMemory(real_mem, m.map.size())); const AddressSpace::Mapping& shared = Session::steal_mapping(remote, m, move(result)); // m may be invalid now memcpy(shared.local_addr, real_mem, shared.map.size()); } MonitoredSharedMemory::shr_ptr MonitoredSharedMemory::subrange(uintptr_t, uintptr_t) { DEBUG_ASSERT(false && "Subranges not supported yet!"); return nullptr; } void MonitoredSharedMemory::check_all(RecordTask* t) { vector> addrs; for (auto a : t->vm()->monitored_addrs()) { addrs.push_back(a); } for (auto a : addrs) { auto m = t->vm()->mapping_of(a); if (m.monitored_shared_memory) { m.monitored_shared_memory->check_for_changes(t, m); } } } void MonitoredSharedMemory::check_for_changes(RecordTask* t, AddressSpace::Mapping& m) { ASSERT(t, m.map.size() == size); if (!m.local_addr) { // reestablish local mapping after a fork or whatever 
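    // (After a fork the tracee-side shared buffer still exists in the child,
    // but rr's local mapping of it was not inherited, so recreate the shared
    // mapping here; the comparison/copy below then repopulates its contents.)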
AutoRemoteSyscalls remote(t); auto msm = m.monitored_shared_memory; m = Session::recreate_shared_mmap(remote, m, Session::DISCARD_CONTENTS, move(msm)); } if (!memcmp(m.local_addr, real_mem, size)) { return; } memcpy(m.local_addr, real_mem, size); t->record_local(m.map.start(), size, real_mem); } } rr-5.5.0/src/MonitoredSharedMemory.h000066400000000000000000000034561412202446200173710ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_MONITORED_SHARED_MEMORY_H_ #define RR_MONITORED_SHARED_MEMORY_H_ #include #include "AddressSpace.h" namespace rr { class RecordTask; /** * Support tracees that share memory read-only with a non-tracee that * writes to the memory. Currently this just supports limited cases that * suffice for dconf: no remapping, coalescing or splitting of the memory is * allowed (|subrange| below just asserts). It doesn't handle mappings where * the mapping has more pages than the file. * * After such memory is mapped in the tracee, we also map it in rr at |real_mem| * and replace the tracee's mapping with a "shadow buffer" that's only shared * with rr. Then periodically rr reads the real memory, and if it doesn't match * the shadow buffer, we update the shadow buffer with the new values and * record that we did so. * * Currently we check the real memory after each syscall exit. This ensures * that if the tracee is woken up by some IPC mechanism (or after sched_yield), * it will get a chance to see updated memory values. */ class MonitoredSharedMemory { public: ~MonitoredSharedMemory(); typedef std::shared_ptr shr_ptr; static void maybe_monitor(RecordTask* t, const std::string& file_name, const AddressSpace::Mapping& m, int tracee_fd, uint64_t offset); static void check_all(RecordTask* t); shr_ptr subrange(uintptr_t start, uintptr_t size); private: void check_for_changes(RecordTask* t, AddressSpace::Mapping& m); MonitoredSharedMemory(uint8_t* real_mem, size_t size) : real_mem(real_mem), size(size) {} uint8_t* real_mem; size_t size; }; } // namespace rr #endif /* RR_MONITORED_SHARED_MEMORY_H_ */ rr-5.5.0/src/Monkeypatcher.cc000066400000000000000000001166151412202446200160620ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Monkeypatcher.h" #include #include #include #include "AddressSpace.h" #include "AutoRemoteSyscalls.h" #include "ElfReader.h" #include "Flags.h" #include "RecordSession.h" #include "RecordTask.h" #include "ReplaySession.h" #include "ScopedFd.h" #include "core.h" #include "kernel_abi.h" #include "kernel_metadata.h" #include "log.h" using namespace std; namespace rr { #include "AssemblyTemplates.generated" static void write_and_record_bytes(RecordTask* t, remote_ptr child_addr, size_t size, const void* buf) { t->write_bytes_helper(child_addr, size, buf); t->record_local(child_addr, size, buf); } template static void write_and_record_bytes(RecordTask* t, remote_ptr child_addr, const uint8_t (&buf)[N]) { write_and_record_bytes(t, child_addr, N, buf); } template static void write_and_record_mem(RecordTask* t, remote_ptr child_addr, const T* val, int count) { t->write_bytes_helper(child_addr, sizeof(*val) * count, static_cast(val)); t->record_local(child_addr, sizeof(T) * count, val); } /** * RecordSession sets up an LD_PRELOAD environment variable with an entry * SYSCALLBUF_LIB_FILENAME_PADDED (and, if enabled, an LD_AUDIT environment * variable with an entry RTLDAUDIT_LIB_FILENAME_PADDED) which is big enough to * hold either the 
32-bit or 64-bit preload/audit library file names. * Immediately after exec we enter this function, which patches the environment * variable value with the correct library name for the task's architecture. * * It's possible for this to fail if a tracee alters the LD_PRELOAD value * and then does an exec. That's just too bad. If we ever have to handle that, * we should modify the environment passed to the exec call. This function * failing isn't necessarily fatal; a tracee might not rely on the functions * overridden by the preload library, or might override them itself (e.g. * because we're recording an rr replay). */ #define setup_library_path(arch, env_var, soname, task) \ setup_library_path_arch(task, env_var, soname ## _BASE, \ soname ## _PADDED, soname ## _32) template static void setup_library_path_arch(RecordTask* t, const char* env_var, const char* soname_base, const char* soname_padded, const char* soname_32) { const char* lib_name = sizeof(typename Arch::unsigned_word) < sizeof(uintptr_t) ? soname_32 : soname_padded; auto env_assignment = string(env_var) + "="; auto p = t->regs().sp().cast(); auto argc = t->read_mem(p); p += 1 + argc + 1; // skip argc, argc parameters, and trailing NULL while (true) { auto envp = t->read_mem(p); if (!envp) { LOG(debug) << env_var << " not found"; return; } string env = t->read_c_str(envp); if (env.find(env_assignment) != 0) { ++p; continue; } size_t lib_pos = env.find(soname_base); if (lib_pos == string::npos) { LOG(debug) << soname_base << " not found in " << env_var; return; } size_t next_colon = env.find(':', lib_pos); if (next_colon != string::npos) { while ((next_colon + 1 < env.length()) && (env[next_colon + 1] == ':' || env[next_colon + 1] == 0)) { ++next_colon; } if (next_colon + 1 < lib_pos + sizeof(soname_padded) - 1) { LOG(debug) << "Insufficient space for " << lib_name << " in " << env_var << " before next ':'"; return; } } if (env.length() < lib_pos + sizeof(soname_padded) - 1) { LOG(debug) << "Insufficient space for " << lib_name << " in " << env_var << " before end of string"; return; } remote_ptr dest = envp + lib_pos; write_and_record_mem(t, dest.cast(), lib_name, strlen(soname_padded)); return; } } template static void setup_preload_library_path(RecordTask* t) { static_assert(sizeof(SYSCALLBUF_LIB_FILENAME_PADDED) == sizeof(SYSCALLBUF_LIB_FILENAME_32), "filename length mismatch"); setup_library_path(Arch, "LD_PRELOAD", SYSCALLBUF_LIB_FILENAME, t); } template static void setup_audit_library_path(RecordTask* t) { static_assert(sizeof(RTLDAUDIT_LIB_FILENAME_PADDED) == sizeof(RTLDAUDIT_LIB_FILENAME_32), "filename length mismatch"); if (t->session().use_audit()) { setup_library_path(Arch, "LD_AUDIT", RTLDAUDIT_LIB_FILENAME, t); } } void Monkeypatcher::init_dynamic_syscall_patching( RecordTask* t, int syscall_patch_hook_count, remote_ptr syscall_patch_hooks) { if (syscall_patch_hook_count && syscall_hooks.empty()) { syscall_hooks = t->read_mem(syscall_patch_hooks, syscall_patch_hook_count); } } template static bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, RecordTask* t, const syscall_patch_hook& hook); template static void substitute(uint8_t* buffer, uint64_t return_addr, uint32_t trampoline_relative_addr); template static void substitute_extended_jump(uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, uint64_t target_addr); template <> void substitute_extended_jump( uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, uint64_t target_addr) { int64_t offset = target_addr - (patch_addr + 
X86SyscallStubExtendedJump::trampoline_relative_addr_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around. X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, (uint32_t)offset); } template <> void substitute_extended_jump( uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr) { X64SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, (uint32_t)(return_addr >> 32), target_addr); } /** * Allocate an extended jump in an extended jump page and return its address. * The resulting address must be within 2G of from_end, and the instruction * there must jump to to_start. */ template static remote_ptr allocate_extended_jump( RecordTask* t, vector& pages, remote_ptr from_end) { Monkeypatcher::ExtendedJumpPage* page = nullptr; for (auto& p : pages) { remote_ptr page_jump_start = p.addr + p.allocated; int64_t offset = page_jump_start - from_end; if ((int32_t)offset == offset && p.allocated + ExtendedJumpPatch::size <= page_size()) { page = &p; break; } } if (!page) { // We're looking for a gap of three pages --- one page to allocate and // a page on each side as a guard page. uint32_t required_space = 3 * page_size(); remote_ptr free_mem = t->vm()->find_free_memory(required_space, // Find free space after the patch site. t->vm()->mapping_of(from_end).map.start()); remote_ptr addr = (free_mem + page_size()).cast(); int64_t offset = addr - from_end; if ((int32_t)offset != offset) { LOG(debug) << "Can't find space close enough for the jump"; return nullptr; } { AutoRemoteSyscalls remote(t); int prot = PROT_READ | PROT_EXEC; int flags = MAP_ANONYMOUS | MAP_FIXED | MAP_PRIVATE; remote.infallible_mmap_syscall(addr, page_size(), prot, flags, -1, 0); KernelMapping recorded(addr, addr + page_size(), string(), KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, prot, flags); t->vm()->map(t, addr, page_size(), prot, flags, 0, string(), KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, nullptr, &recorded); t->vm()->mapping_flags_of(addr) |= AddressSpace::Mapping::IS_PATCH_STUBS; t->trace_writer().write_mapped_region(t, recorded, recorded.fake_stat(), recorded.fsname(), vector(), TraceWriter::PATCH_MAPPING); } pages.push_back(Monkeypatcher::ExtendedJumpPage(addr)); page = &pages.back(); } remote_ptr jump_addr = page->addr + page->allocated; page->allocated += ExtendedJumpPatch::size; return jump_addr; } bool Monkeypatcher::is_jump_stub_instruction(remote_code_ptr ip) { remote_ptr pp = ip.to_data_ptr(); auto it = syscallbuf_stubs.upper_bound(pp); if (it == syscallbuf_stubs.begin()) { return false; } --it; return it->first <= pp && pp < it->first + it->second.size; } /** * Some functions make system calls while storing local variables in memory * below the stack pointer. We need to decrement the stack pointer by * some "safety zone" amount to get clear of those variables before we make * a call instruction. So, we allocate a stub per patched callsite, and jump * from the callsite to the stub. The stub decrements the stack pointer, * calls the appropriate syscall hook function, reincrements the stack pointer, * and jumps back to immediately after the patched callsite. * * It's important that gdb stack traces work while a thread is stopped in the * syscallbuf code. To ensure that the above manipulations don't foil gdb's * stack walking code, we add CFI data to all the stubs. To ease that, the * stubs are written in assembly and linked into the preload library. 
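 *
 * Schematically (a hedged sketch; the exact byte sequences come from the
 * generated AssemblyTemplates, not from this comment):
 *
 *   before:  0f 05              syscall
 *            ..                 following bytes matched against a hook's
 *                               patch_region_bytes
 *   after:   e9 xx xx xx xx     jmp extended_jump    ; rel32
 *            90 90 ...          nop padding up to the next instruction
 *
 *   extended_jump: push the return address, jump to hook.hook_address,
 *                  which eventually returns just past the patched region.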
* * On x86-64 with ASLR, we need to be able to patch a call to a stub from * sites more than 2^31 bytes away. We only have space for a 5-byte jump * instruction. So, we allocate "extender pages" --- pages of memory within * 2GB of the patch site, that contain the stub code. We don't really need this * on x86, but we do it there too for consistency. * */ template static bool patch_syscall_with_hook_x86ish(Monkeypatcher& patcher, RecordTask* t, const syscall_patch_hook& hook) { uint8_t jump_patch[JumpPatch::size]; // We're patching in a relative jump, so we need to compute the offset from // the end of the jump to our actual destination. auto jump_patch_start = t->regs().ip().to_data_ptr(); auto jump_patch_end = jump_patch_start + sizeof(jump_patch); auto return_addr = t->regs().ip().to_data_ptr().as_int() + syscall_instruction_length(x86_64) + hook.patch_region_length; if ((hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { auto adjust = hook.patch_region_length + syscall_instruction_length(x86_64); jump_patch_start -= adjust; jump_patch_end -= adjust; return_addr -= adjust; } remote_ptr extended_jump_start = allocate_extended_jump( t, patcher.extended_jump_pages, jump_patch_end); if (extended_jump_start.is_null()) { return false; } uint8_t stub_patch[ExtendedJumpPatch::size]; substitute_extended_jump(stub_patch, extended_jump_start.as_int(), return_addr, hook.hook_address); write_and_record_bytes(t, extended_jump_start, stub_patch); patcher.syscallbuf_stubs[extended_jump_start] = { &hook, ExtendedJumpPatch::size }; intptr_t jump_offset = extended_jump_start - jump_patch_end; int32_t jump_offset32 = (int32_t)jump_offset; ASSERT(t, jump_offset32 == jump_offset) << "allocate_extended_jump didn't work"; JumpPatch::substitute(jump_patch, jump_offset32); write_and_record_bytes(t, jump_patch_start, jump_patch); // pad with NOPs to the next instruction static const uint8_t NOP = 0x90; DEBUG_ASSERT(syscall_instruction_length(x86_64) == syscall_instruction_length(x86)); size_t nops_bufsize = syscall_instruction_length(x86_64) + hook.patch_region_length - sizeof(jump_patch); std::unique_ptr nops(new uint8_t[nops_bufsize]); memset(nops.get(), NOP, nops_bufsize); write_and_record_mem(t, jump_patch_start + sizeof(jump_patch), nops.get(), nops_bufsize); return true; } template <> bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, RecordTask* t, const syscall_patch_hook& hook) { return patch_syscall_with_hook_x86ish(patcher, t, hook); } template <> bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, RecordTask* t, const syscall_patch_hook& hook) { return patch_syscall_with_hook_x86ish(patcher, t, hook); } template <> bool patch_syscall_with_hook_arch(Monkeypatcher&, RecordTask*, const syscall_patch_hook&) { FATAL() << "Unimplemented"; return false; } static bool patch_syscall_with_hook(Monkeypatcher& patcher, RecordTask* t, const syscall_patch_hook& hook) { RR_ARCH_FUNCTION(patch_syscall_with_hook_arch, t->arch(), patcher, t, hook); } template static bool match_extended_jump_patch(uint8_t patch[], uint64_t *return_addr); template <> bool match_extended_jump_patch( uint8_t patch[], uint64_t *return_addr) { uint32_t return_addr_lo, return_addr_hi; uint64_t jmp_target; if (!X64SyscallStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, &jmp_target)) { return false; } *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); return true; } template <> bool match_extended_jump_patch( uint8_t patch[], uint64_t *return_addr) { uint32_t return_addr_32, jmp_target_relative; if 
(!X86SyscallStubExtendedJump::match(patch, &return_addr_32, &jmp_target_relative)) { return false; } *return_addr = return_addr_32; return true; } template static void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, uint64_t jmp_target); template <> void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, uint64_t jmp_target) { (void)patch_addr; X64SyscallStubRestore::substitute(buffer, jmp_target); } template <> void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, uint64_t jmp_target) { int64_t offset = jmp_target - (patch_addr + X86SyscallStubRestore::trampoline_relative_addr_end); // An offset that appears to be > 2GB is OK here, since EIP will just // wrap around. X86SyscallStubRestore::substitute(buffer, (uint32_t)offset); } template static void unpatch_extended_jumps(Monkeypatcher& patcher, Task* t) { for (auto patch : patcher.syscallbuf_stubs) { const syscall_patch_hook &hook = *patch.second.hook; ASSERT(t, patch.second.size == ExtendedJumpPatch::size); uint8_t bytes[ExtendedJumpPatch::size]; t->read_bytes_helper(patch.first, sizeof(bytes), bytes); uint64_t return_addr; if (!match_extended_jump_patch(bytes, &return_addr)) { ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; return; } std::vector syscall = rr::syscall_instruction(t->arch()); // Replace with // extended_jump: // (unless PATCH_SYSCALL_INSTRUCTION_IS_LAST) // // (if PATCH_SYSCALL_INSTRUCTION_IS_LAST) // jmp *(return_addr) // As long as there are not relative branches or anything, this should // always be correct. ASSERT(t, hook.patch_region_length + ReplacementPatch::size + syscall.size() < ExtendedJumpPatch::size); uint8_t *ptr = bytes; if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { memcpy(ptr, syscall.data(), syscall.size()); ptr += syscall.size(); } memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); ptr += hook.patch_region_length; if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { memcpy(ptr, syscall.data(), syscall.size()); ptr += syscall.size(); } substitute_replacement_patch(ptr, patch.first.as_int()+(ptr-bytes), return_addr); t->write_bytes_helper(patch.first, sizeof(bytes), bytes); } } template static void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t); template <> void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { return unpatch_extended_jumps(patcher, t); } template <> void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { return unpatch_extended_jumps(patcher, t); } template <> void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t) { (void)patcher; (void)t; FATAL() << "Unimplemented"; } void Monkeypatcher::unpatch_syscalls_in(Task *t) { RR_ARCH_FUNCTION(unpatch_syscalls_arch, t->arch(), *this, t); } static string bytes_to_string(uint8_t* bytes, size_t size) { stringstream ss; for (size_t i = 0; i < size; ++i) { if (i > 0) { ss << ' '; } ss << HEX(bytes[i]); } return ss.str(); } static bool task_safe_for_syscall_patching(RecordTask* t, remote_code_ptr start, remote_code_ptr end) { if (!t->is_running()) { remote_code_ptr ip = t->ip(); if (start <= ip && ip < end) { return false; } } for (auto& e : t->pending_events) { if (e.is_syscall_event()) { remote_code_ptr ip = e.Syscall().regs.ip(); if (start <= ip && ip < end) { return false; } } } return true; } static bool safe_for_syscall_patching(remote_code_ptr start, remote_code_ptr end, RecordTask* exclude) { for (auto& p : exclude->session().tasks()) { RecordTask* rt = static_cast(p.second); if (rt != exclude && 
!task_safe_for_syscall_patching(rt, start, end)) { return false; } } return true; } bool Monkeypatcher::try_patch_vsyscall_caller(RecordTask* t, remote_code_ptr ret_addr) { uint8_t bytes[X64VSyscallEntry::size]; remote_ptr patch_start = ret_addr.to_data_ptr() - sizeof(bytes); size_t bytes_count = t->read_bytes_fallible(patch_start, sizeof(bytes), bytes); if (bytes_count < sizeof(bytes)) { return false; } uint32_t target_addr = 0; if (!X64VSyscallEntry::match(bytes, &target_addr)) { return false; } uint64_t target_addr_sext = (uint64_t)(int32_t)target_addr; int syscallno = 0; switch (target_addr_sext) { case 0xffffffffff600000: syscallno = X64Arch::gettimeofday; break; case 0xffffffffff600400: syscallno = X64Arch::time; break; case 0xffffffffff600800: syscallno = X64Arch::getcpu; break; default: return false; } X64VSyscallReplacement::substitute(bytes, syscallno); write_and_record_bytes(t, patch_start, bytes); LOG(debug) << "monkeypatched vsyscall caller at " << patch_start; return true; } // Syscalls can be patched either on entry or exit. For most syscall // instruction code patterns we can steal bytes after the syscall instruction // and thus we patch on entry, but some patterns require using bytes from // before the syscall instruction itself and thus can only be patched on exit. // The `entering_syscall` flag tells us whether or not we're at syscall entry. // If we are, and we find a pattern that can only be patched at exit, we'll // set a flag on the RecordTask telling it to try again after syscall exit. bool Monkeypatcher::try_patch_syscall(RecordTask* t, bool entering_syscall) { if (syscall_hooks.empty()) { // Syscall hooks not set up yet. Don't spew warnings, and don't // fill tried_to_patch_syscall_addresses with addresses that we might be // able to patch later. return false; } if (t->emulated_ptracer) { // Syscall patching can confuse ptracers, which may be surprised to see // a syscall instruction at the current IP but then when running // forwards, that the syscall occurs deep in the preload library instead. return false; } if (t->is_in_traced_syscall()) { // Never try to patch the traced-syscall in our preload library! return false; } Registers r = t->regs(); remote_code_ptr ip = r.ip(); if (tried_to_patch_syscall_addresses.count(ip)) { return false; } // We could examine the current syscall number and if it's not one that // we support syscall buffering for, refuse to patch the syscall instruction. // This would, on the face of it, reduce overhead since patching the // instruction just means a useless trip through the syscall buffering logic. // However, it actually wouldn't help much since we'd still do a switch // on the syscall number in this function instead, and due to context // switching costs any overhead saved would be insignificant. // Also, implementing that would require keeping a buffered-syscalls // list in sync with the preload code, which is unnecessary complexity. 
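  // In outline, the code below: (1) verifies the syscall instruction's
  // architecture matches the task's, (2) reads the bytes around the
  // instruction, (3) looks for a hook whose patch_region_bytes match,
  // (4) rejects the patch if a short branch targets the patched region or
  // another task's ip is inside it, and (5) only then rewrites the region
  // with a jump to an extended-jump stub.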
SupportedArch arch; if (!get_syscall_instruction_arch( t, ip.decrement_by_syscall_insn_length(t->arch()), &arch) || arch != t->arch()) { LOG(debug) << "Declining to patch cross-architecture syscall at " << ip; tried_to_patch_syscall_addresses.insert(ip); return false; } static const intptr_t MAXIMUM_LOOKBACK = 6; uint8_t bytes[256 + MAXIMUM_LOOKBACK]; size_t bytes_count = t->read_bytes_fallible( ip.to_data_ptr() - MAXIMUM_LOOKBACK, sizeof(bytes), bytes); if (bytes_count < MAXIMUM_LOOKBACK) { LOG(debug) << "Declining to patch syscall at " << ip << " for lack of lookback"; tried_to_patch_syscall_addresses.insert(ip); return false; } size_t following_bytes_count = bytes_count - MAXIMUM_LOOKBACK; uint8_t* following_bytes = &bytes[MAXIMUM_LOOKBACK]; intptr_t syscallno = r.original_syscallno(); bool success = false; for (auto& hook : syscall_hooks) { bool matches_hook = false; if ((!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) && following_bytes_count >= hook.patch_region_length && memcmp(following_bytes, hook.patch_region_bytes, hook.patch_region_length) == 0)) { matches_hook = true; } else if ((hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) && bytes_count >= hook.patch_region_length + (size_t)rr::syscall_instruction_length(arch) && memcmp(bytes + MAXIMUM_LOOKBACK - rr::syscall_instruction_length(arch) - hook.patch_region_length, hook.patch_region_bytes, hook.patch_region_length) == 0) { if (entering_syscall) { // A patch that uses bytes before the syscall can't be done when // entering the syscall, it must be done when exiting. So set a flag on // the Task that tells us to come back later. t->retry_syscall_patching = true; LOG(debug) << "Deferring syscall patching at " << ip << " in " << t << " until syscall exit."; return false; } matches_hook = true; } if (!matches_hook) { continue; } // Search for a following short-jump instruction that targets an // instruction // after the syscall. False positives are OK. // glibc-2.23.1-8.fc24.x86_64's __clock_nanosleep needs this. bool found_potential_interfering_branch = false; size_t max_bytes, warn_offset; uint8_t* search_bytes; if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { max_bytes = bytes_count; search_bytes = bytes; warn_offset = MAXIMUM_LOOKBACK; } else { max_bytes = following_bytes_count; search_bytes = following_bytes; warn_offset = 0; } for (size_t i = 0; i + 2 <= max_bytes; ++i) { uint8_t b = search_bytes[i]; // Check for short conditional or unconditional jump if (b == 0xeb || (b >= 0x70 && b < 0x80)) { int offset = i + 2 + (int8_t)search_bytes[i + 1]; if ((hook.flags & PATCH_IS_MULTIPLE_INSTRUCTIONS) ? (offset >= 0 && offset < hook.patch_region_length) : offset == 0) { LOG(debug) << "Found potential interfering branch at " << ip.to_data_ptr() + i - warn_offset; // We can't patch this because it would jump straight back into // the middle of our patch code. found_potential_interfering_branch = true; } } } if (!found_potential_interfering_branch) { remote_code_ptr start_range, end_range; if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { start_range = ip.decrement_by_syscall_insn_length(arch) - hook.patch_region_length; end_range = ip; } else { start_range = ip.decrement_by_syscall_insn_length(arch); end_range = ip + hook.patch_region_length; } if (!safe_for_syscall_patching(start_range, end_range, t)) { LOG(debug) << "Temporarily declining to patch syscall at " << ip << " because a different task has its ip in the patched range"; return false; } // Get out of executing the current syscall before we patch it. 
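      // (exit_syscall_and_prepare_restart backs the task out so the syscall
      // re-executes from scratch once the patch is in place; if that fails,
      // we simply leave this site unpatched for now.)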
if (entering_syscall && !t->exit_syscall_and_prepare_restart()) { return false; } LOG(debug) << "Patching syscall at " << ip << " syscall " << syscall_name(syscallno, t->arch()) << " tid " << t->tid << " bytes " << bytes_to_string( following_bytes, min(bytes_count, sizeof(syscall_patch_hook::patch_region_bytes))); success = patch_syscall_with_hook(*this, t, hook); break; } } if (!success) { LOG(debug) << "Failed to patch syscall at " << ip << " syscall " << syscall_name(syscallno, t->arch()) << " tid " << t->tid << " bytes " << bytes_to_string( following_bytes, min(bytes_count, sizeof(syscall_patch_hook::patch_region_bytes))); tried_to_patch_syscall_addresses.insert(ip); return false; } return true; } // VDSOs are filled with overhead critical functions related to getting the // time and current CPU. We need to ensure that these syscalls get redirected // into actual trap-into-the-kernel syscalls so rr can intercept them. template static void patch_after_exec_arch(RecordTask* t, Monkeypatcher& patcher); template static void patch_at_preload_init_arch(RecordTask* t, Monkeypatcher& patcher); template <> void patch_after_exec_arch(RecordTask* t, Monkeypatcher& patcher) { (void)patcher; setup_preload_library_path(t); setup_audit_library_path(t); if (!t->vm()->has_vdso()) { patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); } else { size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*RR_PAGE_SIZE; patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); patch_auxv_vdso(t, X86Arch::RR_AT_SYSINFO, librrpage_base + AddressSpace::RRVDSO_PAGE_OFFSET*RR_PAGE_SIZE); } } // Monkeypatch x86 vsyscall hook only after the preload library // has initialized. The vsyscall hook expects to be able to use the syscallbuf. // Before the preload library has initialized, the regular vsyscall code // will trigger ptrace traps and be handled correctly by rr. 
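// (Sequencing sketch, reconstructed from the functions in this file: exec ->
// patch_after_exec_arch fixes up the auxv/VDSO entries -> the tracee runs
// ld.so and the preload library's init -> the init rrcall lands here, where
// init_dynamic_syscall_patching reads the hook table the library registered.)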
template <> void patch_at_preload_init_arch(RecordTask* t, Monkeypatcher& patcher) { auto params = t->read_mem( remote_ptr>(t->regs().arg1())); if (!params.syscallbuf_enabled) { return; } patcher.init_dynamic_syscall_patching(t, params.syscall_patch_hook_count, params.syscall_patch_hooks); } template <> void patch_after_exec_arch(RecordTask* t, Monkeypatcher& patcher) { setup_preload_library_path(t); setup_audit_library_path(t); for (const auto& m : t->vm()->maps()) { auto& km = m.map; patcher.patch_after_mmap(t, km.start(), km.size(), km.file_offset_bytes()/page_size(), -1, Monkeypatcher::MMAP_EXEC); } if (!t->vm()->has_vdso()) { patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); } else { size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*RR_PAGE_SIZE; patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); } } template <> void patch_after_exec_arch(RecordTask* t, Monkeypatcher& patcher) { setup_preload_library_path(t); setup_audit_library_path(t); for (const auto& m : t->vm()->maps()) { auto& km = m.map; patcher.patch_after_mmap(t, km.start(), km.size(), km.file_offset_bytes()/page_size(), -1, Monkeypatcher::MMAP_EXEC); } if (!t->vm()->has_vdso()) { patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); } else { size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*RR_PAGE_SIZE; patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); } } template <> void patch_at_preload_init_arch(RecordTask* t, Monkeypatcher& patcher) { auto params = t->read_mem( remote_ptr>(t->regs().arg1())); if (!params.syscallbuf_enabled) { return; } patcher.init_dynamic_syscall_patching(t, params.syscall_patch_hook_count, params.syscall_patch_hooks); } template <> void patch_at_preload_init_arch(RecordTask* t, Monkeypatcher&) { auto params = t->read_mem( remote_ptr>(t->regs().arg1())); if (!params.syscallbuf_enabled) { return; } FATAL() << "Unimplemented"; } void Monkeypatcher::patch_after_exec(RecordTask* t) { ASSERT(t, 1 == t->vm()->task_set().size()) << "Can't have multiple threads immediately after exec!"; RR_ARCH_FUNCTION(patch_after_exec_arch, t->arch(), t, *this); } void Monkeypatcher::patch_at_preload_init(RecordTask* t) { // NB: the tracee can't be interrupted with a signal while // we're processing the rrcall, because it's masked off all // signals. RR_ARCH_FUNCTION(patch_at_preload_init_arch, t->arch(), t, *this); } static remote_ptr resolve_address(ElfReader& reader, uintptr_t elf_addr, remote_ptr map_start, size_t map_size, size_t map_offset_pages) { uintptr_t file_offset; if (!reader.addr_to_offset(elf_addr, file_offset)) { LOG(warn) << "ELF address " << HEX(elf_addr) << " not in file"; } uintptr_t map_offset = uintptr_t(map_offset_pages) * page_size(); if (file_offset < map_offset || file_offset + 32 > map_offset + map_size) { // The value(s) to be set are outside the mapped range. This happens // because code and data can be mapped in separate, partial mmaps in which // case some symbols will be outside the mapped range. 
return nullptr; } return map_start + uintptr_t(file_offset - map_offset); } static void set_and_record_bytes(RecordTask* t, ElfReader& reader, uintptr_t elf_addr, const void* bytes, size_t size, remote_ptr map_start, size_t map_size, size_t map_offset_pages) { remote_ptr addr = resolve_address(reader, elf_addr, map_start, map_size, map_offset_pages); if (!addr) { return; } bool ok = true; t->write_bytes_helper(addr, size, bytes, &ok); // Writing can fail when the value appears to be in the mapped range, but it // actually is beyond the file length. if (ok) { t->record_local(addr, size, bytes); } } /** * Patch _dl_runtime_resolve_(fxsave,xsave,xsavec) to clear "FDP Data Pointer" * register so that CPU-specific behaviors involving that register don't leak * into stack memory. */ static void patch_dl_runtime_resolve(Monkeypatcher& patcher, RecordTask* t, ElfReader& reader, uintptr_t elf_addr, remote_ptr map_start, size_t map_size, size_t map_offset_pages) { if (t->arch() != x86_64) { return; } remote_ptr addr = resolve_address(reader, elf_addr, map_start, map_size, map_offset_pages); if (!addr) { return; } uint8_t impl[X64DLRuntimeResolve::size + X64EndBr::size]; uint8_t *impl_start = impl; t->read_bytes(addr, impl); if (X64EndBr::match(impl) || X86EndBr::match(impl)) { assert(X64EndBr::size == X86EndBr::size); LOG(debug) << "Starts with endbr, skipping"; addr += X64EndBr::size; impl_start += X64EndBr::size; } if (!X64DLRuntimeResolve::match(impl_start) && !X64DLRuntimeResolve2::match(impl_start)) { LOG(warn) << "_dl_runtime_resolve implementation doesn't look right"; return; } uint8_t call_patch[X64CallMonkeypatch::size]; // We're patching in a relative call, so we need to compute the offset from // the end of the call to our actual destination. auto call_patch_start = addr.cast(); auto call_patch_end = call_patch_start + sizeof(call_patch); remote_ptr extended_call_start = allocate_extended_jump( t, patcher.extended_jump_pages, call_patch_end); if (extended_call_start.is_null()) { return; } uint8_t stub_patch[X64DLRuntimeResolvePrelude::size]; X64DLRuntimeResolvePrelude::substitute(stub_patch); write_and_record_bytes(t, extended_call_start, stub_patch); intptr_t call_offset = extended_call_start - call_patch_end; int32_t call_offset32 = (int32_t)call_offset; ASSERT(t, call_offset32 == call_offset) << "allocate_extended_jump didn't work"; X64CallMonkeypatch::substitute(call_patch, call_offset32); write_and_record_bytes(t, call_patch_start, call_patch); // pad with NOPs to the next instruction static const uint8_t NOP = 0x90; uint8_t nops[X64DLRuntimeResolve::size - sizeof(call_patch)]; memset(nops, NOP, sizeof(nops)); write_and_record_mem(t, call_patch_start + sizeof(call_patch), nops, sizeof(nops)); } static bool file_may_need_instrumentation(const AddressSpace::Mapping& map) { size_t file_part = map.map.fsname().rfind('/'); if (file_part == string::npos) { file_part = 0; } else { ++file_part; } const string& fsname = map.map.fsname(); return fsname.find("libpthread", file_part) != string::npos || fsname.find("ld", file_part) != string::npos; } void Monkeypatcher::patch_after_mmap(RecordTask* t, remote_ptr start, size_t size, size_t offset_pages, int child_fd, MmapMode mode) { const auto& map = t->vm()->mapping_of(start); if (file_may_need_instrumentation(map) && (t->arch() == x86 || t->arch() == x86_64)) { ScopedFd open_fd; if (child_fd >= 0) { open_fd = t->open_fd(child_fd, O_RDONLY); ASSERT(t, open_fd.is_open()) << "Failed to open child fd " << child_fd; } else { char buf[100]; 
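// No usable tracee fd was passed in, so locate the backing file through the
// kernel's /proc/<tid>/map_files/<start>-<end> symlink for this mapping.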
sprintf(buf, "/proc/%d/map_files/%llx-%llx", t->tid, (long long)start.as_int(), (long long)start.as_int() + size); // Reading these directly requires CAP_SYS_ADMIN, so open the link target // instead. char link[PATH_MAX]; int ret = readlink(buf, link, sizeof(link) - 1); if (ret < 0) { return; } link[ret] = 0; open_fd = ScopedFd(link, O_RDONLY); if (!open_fd.is_open()) { return; } } ElfFileReader reader(open_fd, t->arch()); // Check for symbols first in the library itself, regardless of whether // there is a debuglink. For example, on Fedora 26, the .symtab and // .strtab sections are stripped from the debuginfo file for // libpthread.so. SymbolTable syms = reader.read_symbols(".symtab", ".strtab"); if (syms.size() == 0) { ScopedFd debug_fd = reader.open_debug_file(map.map.fsname()); if (debug_fd.is_open()) { ElfFileReader debug_reader(debug_fd, t->arch()); syms = debug_reader.read_symbols(".symtab", ".strtab"); } } for (size_t i = 0; i < syms.size(); ++i) { if (syms.is_name(i, "__elision_aconf")) { static const int zero = 0; // Setting __elision_aconf.retry_try_xbegin to zero means that // pthread rwlocks don't try to use elision at all. See ELIDE_LOCK // in glibc's elide.h. set_and_record_bytes(t, reader, syms.addr(i) + 8, &zero, sizeof(zero), start, size, offset_pages); } if (syms.is_name(i, "elision_init")) { // Make elision_init return without doing anything. This means // the __elision_available and __pthread_force_elision flags will // remain zero, disabling elision for mutexes. See glibc's // elision-conf.c. static const uint8_t ret = 0xC3; set_and_record_bytes(t, reader, syms.addr(i), &ret, sizeof(ret), start, size, offset_pages); } // The following operations can only be applied once because after the // patch is applied the code no longer matches the expected template. // For replaying a replay to work, we need to only apply these changes // during a real exec, not during the mmap operations performed when rr // replays an exec. if (mode == MMAP_EXEC && (syms.is_name(i, "_dl_runtime_resolve_fxsave") || syms.is_name(i, "_dl_runtime_resolve_xsave") || syms.is_name(i, "_dl_runtime_resolve_xsavec"))) { patch_dl_runtime_resolve(*this, t, reader, syms.addr(i), start, size, offset_pages); } } } } } // namespace rr rr-5.5.0/src/Monkeypatcher.h000066400000000000000000000107701412202446200157170ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_MONKEYPATCHER_H_ #define RR_MONKEYPATCHER_H_ #include #include #include #include "preload/preload_interface.h" #include "remote_code_ptr.h" #include "remote_ptr.h" namespace rr { class RecordTask; class ScopedFd; class Task; /** * A class encapsulating patching state. There is one instance of this * class per tracee address space. Currently this class performs the following * tasks: * * 1) Patch the VDSO's user-space-only implementation of certain system calls * (e.g. gettimeofday) to do a proper kernel system call instead, so rr can * trap and record it (x86-64 only). * * 2) Patch the VDSO __kernel_vsyscall fast-system-call stub to redirect to * our syscall hook in the preload library (x86 only). * * 3) Patch syscall instructions whose following instructions match a known * pattern to call the syscall hook. * * Monkeypatcher only runs during recording, never replay. */ class Monkeypatcher { public: Monkeypatcher() {} Monkeypatcher(const Monkeypatcher&) = default; /** * Apply any necessary patching immediately after exec. 
   * In this hook we patch everything that doesn't depend on the preload
   * library being loaded.
   */
  void patch_after_exec(RecordTask* t);

  /**
   * During librrpreload initialization, apply patches that require the
   * preload library to be initialized.
   */
  void patch_at_preload_init(RecordTask* t);

  /**
   * Try to patch the syscall instruction that |t| just entered. If this
   * returns false, patching failed and the syscall should be processed
   * as normal. If this returns true, patching succeeded and the syscall
   * was aborted; ip() has been reset to the start of the patched syscall,
   * and execution should resume normally to execute the patched code.
   * Zero or more mapping operations are also recorded to the trace and must
   * be replayed.
   */
  bool try_patch_syscall(RecordTask* t, bool entering_syscall = true);

  /**
   * Replace all extended jumps by syscalls again. Note that we do not try to
   * patch the original locations, since we don't know what the tracee may have
   * done with them in the meantime; we only patch the extended jump stubs,
   * which the tracee isn't allowed to touch.
   */
  void unpatch_syscalls_in(Task *t);

  /**
   * Try to patch the vsyscall-entry pattern occurring right before ret_addr
   * to instead point into the corresponding entry points in the vdso.
   * Returns true if the patching succeeded, false if it didn't. The task's
   * registers are left unmodified.
   */
  bool try_patch_vsyscall_caller(RecordTask *t, remote_code_ptr ret_addr);

  void init_dynamic_syscall_patching(
      RecordTask* t, int syscall_patch_hook_count,
      remote_ptr<syscall_patch_hook> syscall_patch_hooks);

  /**
   * Try to allocate a stub from the syscall patching stub buffer. Returns null
   * if there's no buffer or we've run out of free stubs.
   */
  remote_ptr<uint8_t> allocate_stub(RecordTask* t, size_t bytes);

  enum MmapMode {
    MMAP_EXEC,
    MMAP_SYSCALL,
  };

  /**
   * Apply any necessary patching immediately after an mmap. We use this to
   * patch libpthread.so.
   */
  void patch_after_mmap(RecordTask* t, remote_ptr<void> start, size_t size,
                        size_t offset_pages, int child_fd, MmapMode mode);

  /**
   * The list of pages we've allocated to hold our extended jumps.
   */
  struct ExtendedJumpPage {
    ExtendedJumpPage(remote_ptr<uint8_t> addr) : addr(addr), allocated(0) {}
    remote_ptr<uint8_t> addr;
    size_t allocated;
  };
  std::vector<ExtendedJumpPage> extended_jump_pages;

  bool is_jump_stub_instruction(remote_code_ptr p);

  struct patched_syscall {
    // Pointer to hook inside the syscall_hooks array, which gets initialized
    // once and is fixed afterwards.
    const syscall_patch_hook *hook;
    size_t size;
  };

  /**
   * Addresses/lengths of syscallbuf stubs.
   */
  std::map<remote_ptr<uint8_t>, patched_syscall> syscallbuf_stubs;

private:
  /**
   * The list of supported syscall patches obtained from the preload
   * library. Each one matches a specific byte signature for the instruction(s)
   * after a syscall instruction.
   */
  std::vector<syscall_patch_hook> syscall_hooks;

  /**
   * The addresses of the instructions following syscalls that we've tried
   * (or are currently trying) to patch.
*/ std::unordered_set tried_to_patch_syscall_addresses; }; } // namespace rr #endif /* RR_MONKEYPATCHER_H_ */ rr-5.5.0/src/PackCommand.cc000066400000000000000000000406771412202446200154320ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include #include #include #include #include #include #include #include #include #include #include #include #include "Command.h" #include "Flags.h" #include "GdbServer.h" #include "ReplaySession.h" #include "ScopedFd.h" #include "TraceStream.h" #include "kernel_metadata.h" #include "log.h" #include "main.h" #include "../third-party/blake2/blake2.h" using namespace std; namespace rr { /** * Pack the trace directory to eliminate duplicate files and to include all * files needed for transportation. * * We try VERY HARD to avoid losing data if a PackCommand is interrupted. */ class PackCommand : public Command { public: virtual int run(vector& args) override; protected: PackCommand(const char* name, const char* help) : Command(name, help) {} static PackCommand singleton; }; PackCommand PackCommand::singleton( "pack", " rr pack [OPTION]... []\n" " --symlink Create symlinks to all mmapped files\n" " instead of copying them.\n" "\n" "Eliminates duplicate files in the trace directory, and copies files into\n" "the trace directory as necessary to ensure that all needed files are in\n" "the trace directory and none of them are links to files outside the\n" "trace directory. This makes the trace directory independent of changes\n" "to other files and ready to be transported elsewhere (e.g. by packaging\n" "it into a ZIP or tar archive).\n" "Be careful sharing traces with others; they may contain sensitive information.\n"); struct PackFlags { /* If true, insert symlinks into the trace dir which point to the original * files, rather than copying the files themselves */ bool symlink; PackFlags() : symlink(false) {} }; struct FileHash { uint8_t bytes[32]; }; bool operator<(const FileHash& h1, const FileHash& h2) { return memcmp(h1.bytes, h2.bytes, sizeof(h1)) < 0; } struct FileInfo { FileHash hash; uint64_t size; bool is_hardlink; }; static bool name_comparator(const TraceReader::MappedData& d1, const TraceReader::MappedData d2) { return d1.file_name < d2.file_name; } static bool names_equal(const TraceReader::MappedData& d1, const TraceReader::MappedData d2) { return d1.file_name == d2.file_name; } static bool size_comparator(const TraceReader::MappedData& d1, const TraceReader::MappedData d2) { return d1.data_offset_bytes > d2.data_offset_bytes; } static void* process_files_thread(void* p) { // Don't use log.h macros here since they're not necessarily thread-safe auto data = static_cast>*>(p); for (auto& pair : *data) { const char* name = pair.first.file_name.c_str(); const char* right_slash = strrchr(name, '/'); pair.second.is_hardlink = right_slash && strncmp(right_slash + 1, "mmap_hardlink_", 14) == 0; ScopedFd fd(name, O_RDONLY); if (!fd.is_open()) { fprintf(stderr, "Failed to open %s\n", name); exit(1); } struct stat stat_buf; if (fstat(fd, &stat_buf) < 0) { fprintf(stderr, "Failed to stat %s\n", name); exit(1); } if (uint64_t(stat_buf.st_size) != pair.first.file_size_bytes) { fprintf(stderr, "File size mismatch for %s\n", name); exit(1); } pair.second.size = stat_buf.st_size; blake2b_state b2_state; if (blake2b_init(&b2_state, sizeof(pair.second.hash.bytes))) { fprintf(stderr, "blake2b_init failed"); exit(1); } while (true) { char buf[1024 * 1024]; ssize_t r = read(fd, buf, sizeof(buf)); if (r < 0) { 
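        // Treat any read error as fatal: the dedup pass below trusts these
        // hashes completely, and hashing a partially-read file could cause
        // two different files to be conflated.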
fprintf(stderr, "Failed reading from %s\n", name); exit(1); } if (r == 0) { break; } if (blake2b_update(&b2_state, buf, r)) { fprintf(stderr, "blake2b_update failed"); exit(1); } } if (blake2b_final(&b2_state, pair.second.hash.bytes, sizeof(pair.second.hash.bytes))) { fprintf(stderr, "blake2b_final failed"); exit(1); } } return nullptr; } // Return a size-sorted list of all mmapped files found in the trace static vector gather_files(const string& trace_dir) { TraceReader trace(trace_dir); vector files; while (true) { TraceReader::MappedData data; bool found; trace.read_mapped_region(&data, &found, TraceReader::VALIDATE, TraceReader::ANY_TIME); if (!found) { break; } if (data.source == TraceReader::SOURCE_FILE) { files.push_back(data); } } // First, eliminate duplicates stable_sort(files.begin(), files.end(), name_comparator); auto last = unique(files.begin(), files.end(), names_equal); files.erase(last, files.end()); // Then sort by decreasing size stable_sort(files.begin(), files.end(), size_comparator); return files; } // Take a list of all mmapped files and compute their BLAKE2b hashes. // BLAKE2b was chosen because it's fast and cryptographically strong (we don't // compare the actual file contents, we're relying on hash collision avoidance). static map gather_file_info(const string& trace_dir) { vector files = gather_files(trace_dir); int use_cpus = min(20, get_num_cpus()); use_cpus = min((int)files.size(), use_cpus); // Assign files round-robin to threads vector>> thread_files; thread_files.resize(use_cpus); for (size_t i = 0; i < files.size(); ++i) { FileInfo info; thread_files[i % use_cpus].push_back(make_pair(files[i], info)); } vector threads; for (size_t i = 0; i < thread_files.size(); ++i) { pthread_t thread; pthread_create(&thread, nullptr, process_files_thread, &thread_files[i]); threads.push_back(thread); } for (pthread_t t : threads) { pthread_join(t, nullptr); } map file_info; for (auto& f : thread_files) { for (auto& ff : f) { file_info[ff.first.file_name] = ff.second; } } return file_info; } static bool is_in_trace_dir(const string& file_name, const string& trace_dir) { return file_name.find(trace_dir) == 0; } static const char* last_filename_component(const string& file_name) { const char* last_slash = strrchr(file_name.c_str(), '/'); const char* last_component = last_slash ? last_slash + 1 : file_name.c_str(); if (strncmp(last_component, "mmap_hardlink_", 14) == 0) { last_component += 14; while (*last_component && *last_component != '_') { ++last_component; } if (*last_component == '_') { ++last_component; } } return last_component; } static string copy_into_trace(const string& file_name, const string& trace_dir, int* name_index) { // We don't bother trying to do a reflink-copy here because if that was going // to succeed, rr would probably already have used it during recording. 
string new_name; ScopedFd out_fd; const char* last_component = last_filename_component(file_name); while (true) { char new_name_buf[PATH_MAX]; snprintf(new_name_buf, sizeof(new_name_buf) - 1, "mmap_pack_%d_%s", *name_index, last_component); new_name_buf[sizeof(new_name_buf) - 1] = 0; new_name = trace_dir + "/" + new_name_buf; ++*name_index; out_fd = open(new_name.c_str(), O_WRONLY | O_CREAT | O_EXCL, 0700); if (!out_fd.is_open()) { if (errno == EEXIST) { continue; } FATAL() << "Couldn't create " << new_name; } break; } ScopedFd in_fd(file_name.c_str(), O_RDONLY); if (!in_fd.is_open()) { FATAL() << "Couldn't open " << file_name; } while (true) { char buf[1024 * 1024]; ssize_t r = read(in_fd, buf, sizeof(buf)); if (r < 0) { FATAL() << "Can't read from " << file_name; } if (r == 0) { break; } ssize_t written = 0; while (written < r) { ssize_t w = write(out_fd, buf + written, r - written); if (w <= 0) { FATAL() << "Can't write to " << new_name; } written += w; } } // Try to avoid dataloss if (fsync(out_fd) < 0) { FATAL() << "Can't write to " << new_name; } return new_name; } // Generates a symlink inside the trace directory, pointing to the provided // file name. static string symlink_into_trace(const string& file_name, const string& trace_dir, int* name_index) { string new_name; ScopedFd out_fd; const char* last_component = last_filename_component(file_name); while (true) { char new_name_buf[PATH_MAX]; snprintf(new_name_buf, sizeof(new_name_buf) - 1, "mmap_symlink_%d_%s", *name_index, last_component); new_name_buf[sizeof(new_name_buf) - 1] = 0; new_name = trace_dir + "/" + new_name_buf; ++*name_index; int ret = symlink(file_name.c_str(), new_name.c_str()); if (ret < 0) { if (errno == EEXIST) { continue; } FATAL() << "Couldn't create symlink `" << new_name << "' to `" << file_name << "'."; } break; } return new_name; } // Insert symlinks into the trace directory, one for each mmapped file found in // the trace. Returns a mapping of absolute original file paths and the new // relative paths to the symlinks which are to be used in their place. Files // that already exist in the trace directory (including hardlinks) are left // in place and not symlinked. static map compute_canonical_symlink_map( const string& trace_dir) { map symlink_map; int name_index = 0; // Get all mmapped files from trace vector files = gather_files(trace_dir); for (auto& p : files) { string name = p.file_name; // If file is not in trace dir, create a symlink to it if (!is_in_trace_dir(p.file_name, trace_dir)) { name = symlink_into_trace(p.file_name, trace_dir, &name_index); } // Update the file map with the relative path of the target file symlink_map[p.file_name] = string(strrchr(name.c_str(), '/') + 1); } return symlink_map; } /** * This computes a map giving, for each file referenced in the trace, the name * of a trace file to use instead. This copies files into the * trace directory if they're not in the tracedir already, or if they're * hardlinks to files outside the trace directory. All of the copied files * will have names starting with "mmap_pack_". For files in the trace directory * that have the same hash, we select just one of the files as the destination * for all files with that hash. 
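 *
 * For example (hypothetical names): if /usr/lib/libfoo.so outside the trace
 * and a trace-local "mmap_hardlink_2_libfoo.so" hash identically, a single
 * canonical copy is chosen (copied in as "mmap_pack_0_libfoo.so" if needed)
 * and both references in the trace are rewritten to point at it.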
 */
static map<string, string> compute_canonical_mmapped_files(
    const string& trace_dir) {
  map<string, FileInfo> file_info = gather_file_info(trace_dir);

  map<FileHash, string> hash_to_name;
  for (auto& p : file_info) {
    const auto& existing = hash_to_name.find(p.second.hash);
    if (existing != hash_to_name.end()) {
      auto& info_existing = file_info[existing->second];
      if (!info_existing.is_hardlink &&
          is_in_trace_dir(existing->second, trace_dir)) {
        continue;
      }
    }
    hash_to_name[p.second.hash] = p.first;
  }

  int name_index = 0;
  for (auto& p : hash_to_name) {
    // Copy hardlinked files into the trace to avoid the possibility of someone
    // overwriting the original file.
    auto& info = file_info[p.second];
    if (info.is_hardlink || !is_in_trace_dir(p.second, trace_dir)) {
      p.second = copy_into_trace(p.second, trace_dir, &name_index);
    }
  }

  map<string, string> file_map;
  for (auto& p : file_info) {
    string name = hash_to_name[p.second.hash];
    if (!is_in_trace_dir(name, trace_dir)) {
      FATAL() << "Internal error; file is not in trace dir";
    }
    // Replace absolute paths with trace-relative file names
    file_map[p.first] = string(strrchr(name.c_str(), '/') + 1);
  }
  return file_map;
}

// Write out a new 'mmaps' file with the new file names and atomically
// replace the existing 'mmaps' file with it.
static void rewrite_mmaps(const map<string, string>& file_map,
                          const string& trace_dir) {
  string path = trace_dir + "/pack_mmaps";
  CompressedWriter writer(path, TraceStream::mmaps_block_size(), 1);

  TraceReader trace(trace_dir);
  while (true) {
    TraceReader::MappedData data;
    bool found;
    vector<TraceRemoteFd> extra_fds;
    bool skip_monitoring_mapped_fd;
    KernelMapping km = trace.read_mapped_region(
        &data, &found, TraceReader::VALIDATE, TraceReader::ANY_TIME,
        &extra_fds, &skip_monitoring_mapped_fd);
    if (!found) {
      break;
    }
    if (data.source == TraceReader::SOURCE_FILE) {
      auto m = file_map.find(data.file_name);
      if (m == file_map.end()) {
        FATAL() << "Internal error, didn't assign file " << data.file_name;
      }
      data.file_name = m->second;
    }
    TraceWriter::write_mapped_region_to_alternative_stream(
        writer, data, km, extra_fds, skip_monitoring_mapped_fd);
  }

  // Try not to lose data!
  writer.close(CompressedWriter::SYNC);
  if (!writer.good()) {
    FATAL() << "Error writing " << path;
  }

  // OK, now the atomic switchover to the new maps file.
  // BEFORE this point, we haven't altered any of the original trace files.
  // A crash might leave some "mmap_pack_" files around but that's OK. A retried
  // "rr pack" that runs to completion will clean them all up.
  // AFTER this point, we have altered the mmaps file and the trace remains
  // valid.
  string mmaps_path = trace_dir + "/mmaps";
  if (rename(path.c_str(), mmaps_path.c_str()) < 0) {
    FATAL() << "Error renaming " << path << " to " << mmaps_path;
  }
}

// Delete any "mmap_" files that aren't destination files in our file_map.
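// For example, once rewrite_mmaps has pointed every mapping at its canonical
// "mmap_pack_*" / "mmap_symlink_*" target, any leftover "mmap_hardlink_*"
// sources are no longer referenced by the trace and can be unlinked safely.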
static void delete_unnecessary_files(const map& file_map, const string& trace_dir) { set retain; for (auto& p : file_map) { retain.insert(p.second); } DIR* dir = opendir(trace_dir.c_str()); if (!dir) { FATAL() << "Can't open directory " << trace_dir; } struct dirent* d; errno = 0; vector names; while ((d = readdir(dir)) != nullptr) { if (strncmp(d->d_name, "mmap_", 5) == 0 && retain.count(string(d->d_name)) == 0) { names.push_back(string(d->d_name)); } } if (errno) { FATAL() << "Can't read directory " << trace_dir; } closedir(dir); for (auto& n : names) { string name = trace_dir + "/" + n; if (unlink(name.c_str()) < 0) { FATAL() << "Can't delete file " << name; } } } static int pack(const string& trace_dir, const PackFlags& flags) { string dir; { // validate trace and produce default trace directory if trace_dir is empty TraceReader reader(trace_dir); dir = reader.dir(); } char buf[PATH_MAX]; char* ret = realpath(dir.c_str(), buf); if (!ret) { FATAL() << "realpath failed on " << dir; } string abspath(buf); if (flags.symlink) { map canonical_symlink_map = compute_canonical_symlink_map(abspath); rewrite_mmaps(canonical_symlink_map, abspath); delete_unnecessary_files(canonical_symlink_map, abspath); } else { map canonical_mmapped_files = compute_canonical_mmapped_files(abspath); rewrite_mmaps(canonical_mmapped_files, abspath); delete_unnecessary_files(canonical_mmapped_files, abspath); } if (!probably_not_interactive(STDOUT_FILENO)) { printf("rr: Packed trace directory `%s'.\n", dir.c_str()); } return 0; } static bool parse_pack_arg(vector& args, PackFlags& flags) { static const OptionSpec options[] = { { 0, "symlink", NO_PARAMETER }, }; ParsedOption opt; auto args_copy = args; if (!Command::parse_option(args_copy, options, &opt)) { return false; } switch (opt.short_name) { case 0: flags.symlink = true; break; default: DEBUG_ASSERT(0 && "Unknown pack option"); } args = args_copy; return true; } int PackCommand::run(vector& args) { bool found_dir = false; string trace_dir; PackFlags flags; while (parse_pack_arg(args, flags)) { } while (!args.empty()) { if (!found_dir && parse_optional_trace_dir(args, &trace_dir)) { found_dir = true; continue; } print_help(stderr); return 1; } return pack(trace_dir, flags); } } // namespace rr rr-5.5.0/src/PerfCounters.cc000066400000000000000000000543621412202446200156700ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "PerfCounters.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "Flags.h" #include "Session.h" #include "Task.h" #include "core.h" #include "kernel_metadata.h" #include "log.h" #include "util.h" using namespace std; namespace rr { #define PERF_COUNT_RR 0x72727272L static bool attributes_initialized; // At some point we might support multiple kinds of ticks for the same CPU arch. // At that point this will need to become more complicated. static struct perf_event_attr ticks_attr; static struct perf_event_attr minus_ticks_attr; static struct perf_event_attr cycles_attr; static struct perf_event_attr hw_interrupts_attr; static struct perf_event_attr llsc_fail_attr; static uint32_t pmu_flags; static uint32_t skid_size; static bool has_ioc_period_bug; static bool only_one_counter; static bool activate_useless_counter; /* * Find out the cpu model using the cpuid instruction. 
* Full list of CPUIDs at http://sandpile.org/x86/cpuid.htm * Another list at * http://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers */ enum CpuMicroarch { UnknownCpu, FirstIntel, IntelMerom = FirstIntel, IntelPenryn, IntelNehalem, IntelWestmere, IntelSandyBridge, IntelIvyBridge, IntelHaswell, IntelBroadwell, IntelSkylake, IntelSilvermont, IntelGoldmont, IntelKabylake, IntelCometlake, IntelIcelake, IntelTigerlake, LastIntel = IntelTigerlake, FirstAMD, AMDF15R30 = FirstAMD, AMDZen, LastAMD = AMDZen, FirstARM, ARMNeoverseN1 = FirstARM, LastARM = ARMNeoverseN1, }; /* * Set if this CPU supports ticks counting retired conditional branches. */ #define PMU_TICKS_RCB (1<<0) /* * Some CPUs turn off the whole PMU when there are no remaining events * scheduled (perhaps as a power consumption optimization). This can be a * very expensive operation, and is thus best avoided. For cpus, where this * is a problem, we keep a cycles counter (which corresponds to one of the * fixed function counters, so we don't use up a programmable PMC) that we * don't otherwise use, but keeps the PMU active, greatly increasing * performance. */ #define PMU_BENEFITS_FROM_USELESS_COUNTER (1<<1) /* * Set if this CPU supports ticks counting all taken branches * (excluding interrupts, far branches, and rets). */ #define PMU_TICKS_TAKEN_BRANCHES (1<<3) struct PmuConfig { CpuMicroarch uarch; const char* name; unsigned rcb_cntr_event; unsigned minus_ticks_cntr_event; unsigned hw_intr_cntr_event; unsigned llsc_cntr_event; uint32_t skid_size; uint32_t flags; }; // XXX please only edit this if you really know what you're doing. // event = 0x5101c4: // - 51 = generic PMU // - 01 = umask for event BR_INST_RETIRED.CONDITIONAL // - c4 = eventsel for event BR_INST_RETIRED.CONDITIONAL // event = 0x5301cb: // - 51 = generic PMU // - 01 = umask for event HW_INTERRUPTS.RECEIVED // - cb = eventsel for event HW_INTERRUPTS.RECEIVED // See Intel 64 and IA32 Architectures Performance Monitoring Events. // See check_events from libpfm4. 
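// As a reading aid for the table below, following the encoding just
// described: 0x5111c4 (Icelake/Tigerlake) and 0x517ec4 (Silvermont/Goldmont)
// are the same eventsel 0xc4 with different umasks (0x11 and 0x7e).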
static const PmuConfig pmu_configs[] = { { IntelTigerlake, "Intel Tigerlake", 0x5111c4, 0, 0, 0, 100, PMU_TICKS_RCB }, { IntelIcelake, "Intel Icelake", 0x5111c4, 0, 0, 0, 100, PMU_TICKS_RCB }, { IntelCometlake, "Intel Cometlake", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelKabylake, "Intel Kabylake", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelSilvermont, "Intel Silvermont", 0x517ec4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelGoldmont, "Intel Goldmont", 0x517ec4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelSkylake, "Intel Skylake", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelBroadwell, "Intel Broadwell", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelHaswell, "Intel Haswell", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelIvyBridge, "Intel Ivy Bridge", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelSandyBridge, "Intel Sandy Bridge", 0x5101c4, 0, 0x5301cb, 0, 100, PMU_TICKS_RCB }, { IntelNehalem, "Intel Nehalem", 0x5101c4, 0, 0x50011d, 0, 100, PMU_TICKS_RCB }, { IntelWestmere, "Intel Westmere", 0x5101c4, 0, 0x50011d, 0, 100, PMU_TICKS_RCB }, { IntelPenryn, "Intel Penryn", 0, 0, 0, 0, 100, 0 }, { IntelMerom, "Intel Merom", 0, 0, 0, 0, 100, 0 }, { AMDF15R30, "AMD Family 15h Revision 30h", 0xc4, 0xc6, 0, 0, 250, PMU_TICKS_TAKEN_BRANCHES }, // 0xd1 == RETIRED_CONDITIONAL_BRANCH_INSTRUCTIONS - Number of retired conditional branch instructions // 0x2c == INTERRUPT_TAKEN - Counts the number of interrupts taken // Both counters are available on Zen, Zen+ and Zen2. { AMDZen, "AMD Zen", 0x5100d1, 0, 0x51002c, 0, 10000, PMU_TICKS_RCB }, // 0x21 == BR_RETIRED - Architecturally retired taken branches // 0x6F == STREX_SPEC - Speculatively executed strex instructions { ARMNeoverseN1, "ARM Neoverse N1", 0x21, 0, 0, 0x6F, 1000, PMU_TICKS_TAKEN_BRANCHES } }; #define RR_SKID_MAX 10000 static string lowercase(const string& s) { string c = s; transform(c.begin(), c.end(), c.begin(), ::tolower); return c; } static void init_perf_event_attr(struct perf_event_attr* attr, perf_type_id type, unsigned config) { memset(attr, 0, sizeof(*attr)); attr->type = type; attr->size = sizeof(*attr); attr->config = config; // rr requires that its events count userspace tracee code // only. attr->exclude_kernel = 1; attr->exclude_guest = 1; } static const uint64_t IN_TX = 1ULL << 32; static const uint64_t IN_TXCP = 1ULL << 33; static int64_t read_counter(ScopedFd& fd) { int64_t val; ssize_t nread = read(fd, &val, sizeof(val)); DEBUG_ASSERT(nread == sizeof(val)); return val; } static ScopedFd start_counter(pid_t tid, int group_fd, struct perf_event_attr* attr, bool* disabled_txcp = nullptr) { if (disabled_txcp) { *disabled_txcp = false; } attr->pinned = group_fd == -1; int fd = syscall(__NR_perf_event_open, attr, tid, -1, group_fd, PERF_FLAG_FD_CLOEXEC); if (0 >= fd && errno == EINVAL && attr->type == PERF_TYPE_RAW && (attr->config & IN_TXCP)) { // The kernel might not support IN_TXCP, so try again without it. struct perf_event_attr tmp_attr = *attr; tmp_attr.config &= ~IN_TXCP; fd = syscall(__NR_perf_event_open, &tmp_attr, tid, -1, group_fd, PERF_FLAG_FD_CLOEXEC); if (fd >= 0) { if (disabled_txcp) { *disabled_txcp = true; } LOG(warn) << "kernel does not support IN_TXCP"; if ((cpuid(CPUID_GETEXTENDEDFEATURES, 0).ebx & HLE_FEATURE_FLAG) && !Flags::get().suppress_environment_warnings) { fprintf(stderr, "Your CPU supports Hardware Lock Elision but your kernel does\n" "not support setting the IN_TXCP PMU flag. 
Record and replay\n" "of code that uses HLE will fail unless you update your\n" "kernel.\n"); } } } if (0 >= fd) { if (errno == EACCES) { CLEAN_FATAL() << "Permission denied to use 'perf_event_open'; are hardware perf events " "available? See https://github.com/rr-debugger/rr/wiki/Will-rr-work-on-my-system"; } if (errno == ENOENT) { CLEAN_FATAL() << "Unable to open performance counter with 'perf_event_open'; " "are hardware perf events available? See https://github.com/rr-debugger/rr/wiki/Will-rr-work-on-my-system"; } FATAL() << "Failed to initialize counter"; } return fd; } static void check_for_ioc_period_bug() { // Start a cycles counter struct perf_event_attr attr = rr::ticks_attr; attr.sample_period = 0xffffffff; attr.exclude_kernel = 1; ScopedFd bug_fd = start_counter(0, -1, &attr); uint64_t new_period = 1; if (ioctl(bug_fd, PERF_EVENT_IOC_PERIOD, &new_period)) { FATAL() << "ioctl(PERF_EVENT_IOC_PERIOD) failed"; } struct pollfd poll_bug_fd = {.fd = bug_fd, .events = POLL_IN, .revents = 0 }; poll(&poll_bug_fd, 1, 0); has_ioc_period_bug = poll_bug_fd.revents == 0; LOG(debug) << "has_ioc_period_bug=" << has_ioc_period_bug; } static const int NUM_BRANCHES = 500; volatile uint32_t accumulator_sink = 0; static void do_branches() { // Do NUM_BRANCHES conditional branches that can't be optimized out. // 'accumulator' is always odd and can't be zero uint32_t accumulator = uint32_t(rand()) * 2 + 1; for (int i = 0; i < NUM_BRANCHES && accumulator; ++i) { accumulator = ((accumulator * 7) + 2) & 0xffffff; } // Use 'accumulator' so it can't be optimized out. accumulator_sink = accumulator; } // Architecture specific detection code #if defined(__i386__) || defined(__x86_64__) #include "PerfCounters_x86.h" #elif defined(__aarch64__) #include "PerfCounters_aarch64.h" #else #error Must define microarchitecture detection code for this architecture #endif static void check_working_counters() { struct perf_event_attr attr = rr::ticks_attr; attr.sample_period = 0; struct perf_event_attr attr2 = rr::cycles_attr; attr.sample_period = 0; ScopedFd fd = start_counter(0, -1, &attr); ScopedFd fd2 = start_counter(0, -1, &attr2); do_branches(); int64_t events = read_counter(fd); int64_t events2 = read_counter(fd2); if (events < NUM_BRANCHES) { char config[100]; sprintf(config, "%llx", (long long)ticks_attr.config); FATAL() << "\nGot " << events << " branch events, expected at least " << NUM_BRANCHES << ".\n" "\nThe hardware performance counter seems to not be working. Check\n" "that hardware performance counters are working by running\n" " perf stat -e r" << config << " true\n" "and checking that it reports a nonzero number of events.\n" "If performance counters seem to be working with 'perf', file an\n" "rr issue, otherwise check your hardware/OS/VM configuration. Also\n" "check that other software is not using performance counters on\n" "this CPU."; } only_one_counter = events2 == 0; LOG(debug) << "only_one_counter=" << only_one_counter; if (only_one_counter) { arch_check_restricted_counter(); } } static void check_for_bugs(CpuMicroarch uarch) { if (running_under_rr()) { // Under rr we emulate idealized performance counters, so we can assume // none of the bugs apply. 
return; } check_for_ioc_period_bug(); check_working_counters(); check_for_arch_bugs(uarch); } static CpuMicroarch get_cpu_microarch() { string forced_uarch = lowercase(Flags::get().forced_uarch); if (!forced_uarch.empty()) { for (size_t i = 0; i < array_length(pmu_configs); ++i) { const PmuConfig& pmu = pmu_configs[i]; string name = lowercase(pmu.name); if (name.npos != name.find(forced_uarch)) { LOG(info) << "Using forced uarch " << pmu.name; return pmu.uarch; } } CLEAN_FATAL() << "Forced uarch " << Flags::get().forced_uarch << " isn't known."; } return compute_cpu_microarch(); } static void init_attributes() { if (attributes_initialized) { return; } attributes_initialized = true; CpuMicroarch uarch = get_cpu_microarch(); const PmuConfig* pmu = nullptr; for (size_t i = 0; i < array_length(pmu_configs); ++i) { if (uarch == pmu_configs[i].uarch) { pmu = &pmu_configs[i]; break; } } DEBUG_ASSERT(pmu); if (!(pmu->flags & (PMU_TICKS_RCB | PMU_TICKS_TAKEN_BRANCHES))) { FATAL() << "Microarchitecture `" << pmu->name << "' currently unsupported."; } if (running_under_rr()) { init_perf_event_attr(&ticks_attr, PERF_TYPE_HARDWARE, PERF_COUNT_RR); skid_size = RR_SKID_MAX; pmu_flags = pmu->flags & (PMU_TICKS_RCB | PMU_TICKS_TAKEN_BRANCHES); } else { skid_size = pmu->skid_size; pmu_flags = pmu->flags; init_perf_event_attr(&ticks_attr, PERF_TYPE_RAW, pmu->rcb_cntr_event); if (pmu->minus_ticks_cntr_event != 0) { init_perf_event_attr(&minus_ticks_attr, PERF_TYPE_RAW, pmu->minus_ticks_cntr_event); } init_perf_event_attr(&cycles_attr, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); init_perf_event_attr(&hw_interrupts_attr, PERF_TYPE_RAW, pmu->hw_intr_cntr_event); init_perf_event_attr(&llsc_fail_attr, PERF_TYPE_RAW, pmu->llsc_cntr_event); // libpfm encodes the event with this bit set, so we'll do the // same thing. Unclear if necessary. hw_interrupts_attr.exclude_hv = 1; check_for_bugs(uarch); /* * For maintainability, and since it doesn't impact performance when not * needed, we always activate this. If it ever turns out to be a problem, * this can be set to pmu->flags & PMU_BENEFITS_FROM_USELESS_COUNTER, * instead. * * We also disable this counter when running under rr. Even though it's the * same event for the same task as the outer rr, the linux kernel does not * coalesce them and tries to schedule the new one on a general purpose PMC. * On CPUs with only 2 general PMCs (e.g. KNL), we'd run out. 
*/ activate_useless_counter = has_ioc_period_bug && !running_under_rr(); } } bool PerfCounters::is_rr_ticks_attr(const perf_event_attr& attr) { return attr.type == PERF_TYPE_HARDWARE && attr.config == PERF_COUNT_RR; } bool PerfCounters::supports_ticks_semantics(TicksSemantics ticks_semantics) { init_attributes(); switch (ticks_semantics) { case TICKS_RETIRED_CONDITIONAL_BRANCHES: return (pmu_flags & PMU_TICKS_RCB) != 0; case TICKS_TAKEN_BRANCHES: return (pmu_flags & PMU_TICKS_TAKEN_BRANCHES) != 0; default: FATAL() << "Unknown ticks_semantics " << ticks_semantics; return false; } } TicksSemantics PerfCounters::default_ticks_semantics() { init_attributes(); if (pmu_flags & PMU_TICKS_TAKEN_BRANCHES) { return TICKS_TAKEN_BRANCHES; } if (pmu_flags & PMU_TICKS_RCB) { return TICKS_RETIRED_CONDITIONAL_BRANCHES; } FATAL() << "Unsupported architecture"; return TICKS_TAKEN_BRANCHES; } uint32_t PerfCounters::skid_size() { init_attributes(); return rr::skid_size; } PerfCounters::PerfCounters(pid_t tid, TicksSemantics ticks_semantics) : tid(tid), ticks_semantics_(ticks_semantics), started(false), counting(false) { if (!supports_ticks_semantics(ticks_semantics)) { FATAL() << "Ticks semantics " << ticks_semantics << " not supported"; } } static void make_counter_async(ScopedFd& fd, int signal) { if (fcntl(fd, F_SETFL, O_ASYNC) || fcntl(fd, F_SETSIG, signal)) { FATAL() << "Failed to make ticks counter ASYNC with sig" << signal_name(signal); } } void PerfCounters::reset(Ticks ticks_period) { DEBUG_ASSERT(ticks_period >= 0); if (ticks_period == 0 && !always_recreate_counters()) { // We can't switch a counter between sampling and non-sampling via // PERF_EVENT_IOC_PERIOD so just turn 0 into a very big number. ticks_period = uint64_t(1) << 60; } if (!started) { LOG(debug) << "Recreating counters with period " << ticks_period; struct perf_event_attr attr = rr::ticks_attr; struct perf_event_attr minus_attr = rr::minus_ticks_attr; attr.sample_period = ticks_period; fd_ticks_interrupt = start_counter(tid, -1, &attr); if (minus_attr.config != 0) { fd_minus_ticks_measure = start_counter(tid, fd_ticks_interrupt, &minus_attr); } if (!only_one_counter && !running_under_rr()) { reset_arch_extras(); } if (activate_useless_counter && !fd_useless_counter.is_open()) { // N.B.: This is deliberately not in the same group as the other counters // since we want to keep it scheduled at all times. 
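      // Passing group_fd == -1 makes this counter a group leader of its own,
      // and start_counter() pins group leaders, so the kernel keeps this
      // cycles event scheduled at all times. That is what keeps the PMU from
      // being powered down in between our other events.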
fd_useless_counter = start_counter(tid, -1, &cycles_attr); } struct f_owner_ex own; own.type = F_OWNER_TID; own.pid = tid; if (fcntl(fd_ticks_interrupt, F_SETOWN_EX, &own)) { FATAL() << "Failed to SETOWN_EX ticks event fd"; } make_counter_async(fd_ticks_interrupt, PerfCounters::TIME_SLICE_SIGNAL); } else { LOG(debug) << "Resetting counters with period " << ticks_period; if (ioctl(fd_ticks_interrupt, PERF_EVENT_IOC_RESET, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_RESET) failed"; } if (ioctl(fd_ticks_interrupt, PERF_EVENT_IOC_PERIOD, &ticks_period)) { FATAL() << "ioctl(PERF_EVENT_IOC_PERIOD) failed with period " << ticks_period; } if (ioctl(fd_ticks_interrupt, PERF_EVENT_IOC_ENABLE, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_ENABLE) failed"; } if (fd_minus_ticks_measure.is_open()) { if (ioctl(fd_minus_ticks_measure, PERF_EVENT_IOC_RESET, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_RESET) failed"; } if (ioctl(fd_minus_ticks_measure, PERF_EVENT_IOC_ENABLE, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_ENABLE) failed"; } } if (fd_ticks_measure.is_open()) { if (ioctl(fd_ticks_measure, PERF_EVENT_IOC_RESET, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_RESET) failed"; } if (ioctl(fd_ticks_measure, PERF_EVENT_IOC_ENABLE, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_ENABLE) failed"; } } if (fd_ticks_in_transaction.is_open()) { if (ioctl(fd_ticks_in_transaction, PERF_EVENT_IOC_RESET, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_RESET) failed"; } if (ioctl(fd_ticks_in_transaction, PERF_EVENT_IOC_ENABLE, 0)) { FATAL() << "ioctl(PERF_EVENT_IOC_ENABLE) failed"; } } } started = true; counting = true; counting_period = ticks_period; } void PerfCounters::set_tid(pid_t tid) { stop(); this->tid = tid; } void PerfCounters::stop() { if (!started) { return; } started = false; fd_ticks_interrupt.close(); fd_ticks_measure.close(); fd_minus_ticks_measure.close(); fd_useless_counter.close(); fd_ticks_in_transaction.close(); } void PerfCounters::stop_counting() { if (!counting) { return; } counting = false; if (always_recreate_counters()) { stop(); } else { ioctl(fd_ticks_interrupt, PERF_EVENT_IOC_DISABLE, 0); if (fd_minus_ticks_measure.is_open()) { ioctl(fd_minus_ticks_measure, PERF_EVENT_IOC_DISABLE, 0); } if (fd_ticks_measure.is_open()) { ioctl(fd_ticks_measure, PERF_EVENT_IOC_DISABLE, 0); } if (fd_ticks_in_transaction.is_open()) { ioctl(fd_ticks_in_transaction, PERF_EVENT_IOC_DISABLE, 0); } } } Ticks PerfCounters::ticks_for_unconditional_indirect_branch(Task*) { return (pmu_flags & PMU_TICKS_TAKEN_BRANCHES) ? 1 : 0; } Ticks PerfCounters::ticks_for_direct_call(Task*) { return (pmu_flags & PMU_TICKS_TAKEN_BRANCHES) ? 1 : 0; } Ticks PerfCounters::read_ticks(Task* t) { if (!started || !counting) { return 0; } if (fd_ticks_in_transaction.is_open()) { uint64_t transaction_ticks = read_counter(fd_ticks_in_transaction); if (transaction_ticks > 0) { LOG(debug) << transaction_ticks << " IN_TX ticks detected"; if (!Flags::get().force_things) { ASSERT(t, false) << transaction_ticks << " IN_TX ticks detected while HLE not supported due to KVM PMU\n" "virtualization bug. See " "http://marc.info/?l=linux-kernel&m=148582794808419&w=2\n" "Aborting. Retry with -F to override, but it will probably\n" "fail."; } } } if (fd_strex_counter.is_open()) { uint64_t strex_count = read_counter(fd_strex_counter); if (strex_count > 0) { LOG(debug) << strex_count << " strex detected"; if (!Flags::get().force_things) { CLEAN_FATAL() << strex_count << " (speculatively) executed strex instructions detected. 
\n" "On aarch64, rr only supports applications making use of LSE\n" "atomics rather than legacy LL/SC-based atomics.\n" "Aborting. Retry with -F to override, but replaying such\n" "a recording will probably fail."; } } } uint64_t adjusted_counting_period = counting_period + (t->session().is_recording() ? recording_skid_size() : skid_size()); uint64_t interrupt_val = read_counter(fd_ticks_interrupt); if (!fd_ticks_measure.is_open()) { if (fd_minus_ticks_measure.is_open()) { uint64_t minus_measure_val = read_counter(fd_minus_ticks_measure); interrupt_val -= minus_measure_val; } if (t->session().is_recording()) { if (counting_period && interrupt_val > adjusted_counting_period) { LOG(warn) << "Recorded ticks of " << interrupt_val << " overshot requested ticks target by " << interrupt_val - counting_period << " ticks.\n" "On AMD systems this is known to occur occasionally for unknown reasons.\n" "Recording should continue normally. Please report any unexpected rr failures\n" "received after this warning, any conditions that reliably reproduce it,\n" "or sightings of this warning on non-AMD systems."; } } else { ASSERT(t, !counting_period || interrupt_val <= adjusted_counting_period) << "Detected " << interrupt_val << " ticks, expected no more than " << adjusted_counting_period; } return interrupt_val; } uint64_t measure_val = read_counter(fd_ticks_measure); if (measure_val > interrupt_val) { // There is some kind of kernel or hardware bug that means we sometimes // see more events with IN_TXCP set than without. These are clearly // spurious events :-(. For now, work around it by returning the // interrupt_val. That will work if HLE hasn't been used in this interval. // Note that interrupt_val > measure_val is valid behavior (when HLE is // being used). LOG(debug) << "Measured too many ticks; measure=" << measure_val << ", interrupt=" << interrupt_val; ASSERT(t, !counting_period || interrupt_val <= adjusted_counting_period) << "Detected " << interrupt_val << " ticks, expected no more than " << adjusted_counting_period; return interrupt_val; } ASSERT(t, !counting_period || measure_val <= adjusted_counting_period) << "Detected " << measure_val << " ticks, expected no more than " << adjusted_counting_period; return measure_val; } } // namespace rr rr-5.5.0/src/PerfCounters.h000066400000000000000000000102411412202446200155160ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PERF_COUNTERS_H_ #define RR_PERF_COUNTERS_H_ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif #include #include #include #include "ScopedFd.h" #include "Ticks.h" struct perf_event_attr; namespace rr { class Task; enum TicksSemantics { TICKS_RETIRED_CONDITIONAL_BRANCHES, TICKS_TAKEN_BRANCHES, }; /** * A class encapsulating the performance counters we use to monitor * each task during recording and replay. * * Normally we monitor a single kind of event that we use as a proxy * for progress, which we call "ticks". Currently this is the count of retired * conditional branches. We support dispatching a signal when the counter * reaches a particular value. * * When extra_perf_counters_enabled() returns true, we monitor additional * counters of interest. */ class PerfCounters { public: /** * Create performance counters monitoring the given task. 
*/ PerfCounters(pid_t tid, TicksSemantics ticks_semantics); ~PerfCounters() { stop(); } void set_tid(pid_t tid); /** * Reset all counter values to 0 and program the counters to send * TIME_SLICE_SIGNAL when 'ticks_period' tick events have elapsed. (In reality * the hardware triggers its interrupt some time after that. We also allow * the interrupt to fire early.) * This must be called while the task is stopped, and it must be called * before the task is allowed to run again. * `ticks_period` of zero means don't interrupt at all. */ void reset(Ticks ticks_period); template void reset_arch_extras(); /** * Close the perfcounter fds. They will be automatically reopened if/when * reset is called again. */ void stop(); /** * Suspend counting until the next reset. This may or may not actually stop * the performance counters, depending on whether or not this is required * for correctness on this kernel version. */ void stop_counting(); /** * Return the number of ticks we need for an emulated branch. */ static Ticks ticks_for_unconditional_indirect_branch(Task*); /** * Return the number of ticks we need for a direct call. */ static Ticks ticks_for_direct_call(Task*); /** * Read the current value of the ticks counter. * `t` is used for debugging purposes. */ Ticks read_ticks(Task* t); /** * Returns what ticks mean for these counters. */ TicksSemantics ticks_semantics() const { return ticks_semantics_; } /** * Return the fd we last used to generate the ticks-counter signal. */ int ticks_interrupt_fd() const { return fd_ticks_interrupt.get(); } /* This choice is fairly arbitrary; linux doesn't use SIGSTKFLT so we * hope that tracees don't either. */ enum { TIME_SLICE_SIGNAL = SIGSTKFLT }; static bool is_rr_ticks_attr(const perf_event_attr& attr); static bool supports_ticks_semantics(TicksSemantics ticks_semantics); static TicksSemantics default_ticks_semantics(); /** * When an interrupt is requested, at most this many ticks may elapse before * the interrupt is delivered. */ static uint32_t skid_size(); /** * Use a separate skid_size for recording since we seem to see more skid * in practice during recording, in particular during the * async_signal_syscalls tests */ static uint32_t recording_skid_size() { return skid_size() * 5; } private: // Only valid while 'counting' is true Ticks counting_period; pid_t tid; // We use separate fds for counting ticks and for generating interrupts. The // former ignores ticks in aborted transactions, and does not support // sample_period; the latter does not ignore ticks in aborted transactions, // but does support sample_period. ScopedFd fd_ticks_measure; ScopedFd fd_minus_ticks_measure; ScopedFd fd_ticks_interrupt; ScopedFd fd_useless_counter; // x86(_64) specific counter to support recording HLE ScopedFd fd_ticks_in_transaction; // aarch64 specific counter to detect use of ll/sc instructions ScopedFd fd_strex_counter; TicksSemantics ticks_semantics_; bool started; bool counting; }; } // namespace rr #endif /* RR_PERF_COUNTERS_H_ */ rr-5.5.0/src/PerfCounters_aarch64.h000066400000000000000000000034231412202446200170320ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ // This file is included from PerfCounters.cc static const char* midr_path = "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1"; /** * Return the detected, known microarchitecture of this CPU, or don't * return; i.e. never return UnknownCpu. 
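 * On aarch64 the kernel exposes MIDR_EL1 through sysfs; for example the
 * value 0x413fd0c1 read below is the MIDR of an ARM Neoverse N1 core.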
 */
static CpuMicroarch compute_cpu_microarch() {
  FILE *midr_el1 = fopen(midr_path, "r");
  if (!midr_el1) {
    CLEAN_FATAL() << "Failed to read midr register from kernel";
  }
  uint32_t midr;
  if (1 != fscanf(midr_el1, "%x", &midr)) {
    CLEAN_FATAL() << "Failed to read midr register from kernel";
  }
  fclose(midr_el1);
  switch (midr) {
    case 0x413fd0c1:
      return ARMNeoverseN1;
    default:
      break;
  }
  CLEAN_FATAL() << "Aarch64 CPU type " << HEX(midr) << " unknown";
  return UnknownCpu; // not reached
}

static void arch_check_restricted_counter() {
  if (!Flags::get().suppress_environment_warnings) {
    fprintf(stderr,
            "Your CPU supports only one performance counter.\n"
            "Use of LL/SC instructions will not be detected and will\n"
            "cause silently corrupt recordings. It is highly recommended\n"
            "that you alter your configuration to enable additional performance\n"
            "counters.\n");
  }
}

static bool always_recreate_counters() { return false; }

static void check_for_arch_bugs(__attribute__((unused)) CpuMicroarch uarch) {}

template <> void PerfCounters::reset_arch_extras() {
  // LL/SC can't be recorded reliably. Start a counter to detect
  // any usage, such that we can give an intelligent error message.
  struct perf_event_attr attr = rr::llsc_fail_attr;
  attr.sample_period = 0;
  fd_strex_counter = start_counter(tid, fd_ticks_interrupt, &attr);
}
rr-5.5.0/src/PerfCounters_x86.h000066400000000000000000000275111412202446200162330ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

// This file is included from PerfCounters.cc

static bool has_kvm_in_txcp_bug;
static bool has_xen_pmi_bug;
static bool supports_txcp;

/**
 * Return the detected, known microarchitecture of this CPU, or don't
 * return; i.e. never return UnknownCpu.
 */
static CpuMicroarch compute_cpu_microarch() {
  auto cpuid_vendor = cpuid(CPUID_GETVENDORSTRING, 0);
  char vendor[12];
  memcpy(&vendor[0], &cpuid_vendor.ebx, 4);
  memcpy(&vendor[4], &cpuid_vendor.edx, 4);
  memcpy(&vendor[8], &cpuid_vendor.ecx, 4);
  if (strncmp(vendor, "GenuineIntel", sizeof(vendor)) &&
      strncmp(vendor, "AuthenticAMD", sizeof(vendor))) {
    CLEAN_FATAL() << "Unknown CPU vendor '" << vendor << "'";
  }

  auto cpuid_data = cpuid(CPUID_GETFEATURES, 0);
  unsigned int cpu_type = cpuid_data.eax & 0xF0FF0;
  unsigned int ext_family = (cpuid_data.eax >> 20) & 0xff;
  switch (cpu_type) {
    case 0x006F0:
    case 0x10660:
      return IntelMerom;
    case 0x10670:
    case 0x106D0:
      return IntelPenryn;
    case 0x106A0:
    case 0x106E0:
    case 0x206E0:
      return IntelNehalem;
    case 0x20650:
    case 0x206C0:
    case 0x206F0:
      return IntelWestmere;
    case 0x206A0:
    case 0x206D0:
    case 0x306e0:
      return IntelSandyBridge;
    case 0x306A0:
      return IntelIvyBridge;
    case 0x306C0: /* Devil's Canyon */
    case 0x306F0:
    case 0x40650:
    case 0x40660:
      return IntelHaswell;
    case 0x306D0:
    case 0x40670:
    case 0x406F0:
    case 0x50660:
      return IntelBroadwell;
    case 0x406e0:
    case 0x50650:
    case 0x506e0:
      return IntelSkylake;
    case 0x30670:
    case 0x406c0:
    case 0x50670:
      return IntelSilvermont;
    case 0x506f0:
      return IntelGoldmont;
    case 0x706e0:
      return IntelIcelake;
    case 0x806c0:
    case 0x806d0:
      return IntelTigerlake;
    case 0x806e0:
    case 0x906e0:
      return IntelKabylake;
    case 0xa0650:
    case 0xa0660:
      return IntelCometlake;
    case 0x30f00:
      return AMDF15R30;
    case 0x00f10: // Naples, Whitehaven, Summit Ridge, Snowy Owl (Zen) (UNTESTED)
    case 0x10f10: // Raven Ridge, Great Horned Owl (Zen) (UNTESTED)
    case 0x10f80: // Banded Kestrel (Zen), Picasso (Zen+) (UNTESTED)
    case 0x20f00: // Dali (Zen) (UNTESTED)
    case 0x00f80: // Colfax, Pinnacle Ridge (Zen+) (UNTESTED)
    case 0x30f10:
// Rome, Castle Peak (Zen 2) case 0x60f00: // Renoir (Zen 2) (UNTESTED) case 0x70f10: // Matisse (Zen 2) (UNTESTED) if (ext_family == 8) { return AMDZen; } else if (ext_family == 3) { return AMDF15R30; } break; case 0x20f10: // Vermeer (Zen 3) case 0x50f00: // Cezanne (Zen 3) if (ext_family == 0xa) { return AMDZen; } default: break; } if (!strcmp(vendor, "AuthenticAMD")) { CLEAN_FATAL() << "AMD CPU type " << HEX(cpu_type) << " unknown"; } else { CLEAN_FATAL() << "Intel CPU type " << HEX(cpu_type) << " unknown"; } return UnknownCpu; // not reached } static void check_for_kvm_in_txcp_bug() { int64_t count = 0; struct perf_event_attr attr = rr::ticks_attr; attr.config |= IN_TXCP; attr.sample_period = 0; bool disabled_txcp; ScopedFd fd = start_counter(0, -1, &attr, &disabled_txcp); if (fd.is_open() && !disabled_txcp) { ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); do_branches(); count = read_counter(fd); } supports_txcp = count > 0; has_kvm_in_txcp_bug = supports_txcp && count < NUM_BRANCHES; LOG(debug) << "supports txcp=" << supports_txcp; LOG(debug) << "has_kvm_in_txcp_bug=" << has_kvm_in_txcp_bug << " count=" << count; } static void check_for_xen_pmi_bug() { int32_t count = -1; struct perf_event_attr attr = rr::ticks_attr; attr.sample_period = NUM_BRANCHES - 1; ScopedFd fd = start_counter(0, -1, &attr); if (fd.is_open()) { // Do NUM_BRANCHES conditional branches that can't be optimized out. // 'accumulator' is always odd and can't be zero uint32_t accumulator = uint32_t(rand()) * 2 + 1; int raw_fd = fd; asm volatile( #if defined(__x86_64__) "mov %[_SYS_ioctl], %%rax;" "mov %[raw_fd], %%edi;" "xor %%rdx, %%rdx;" "mov %[_PERF_EVENT_IOC_ENABLE], %%rsi;" "syscall;" "cmp $-4095, %%rax;" "jae 2f;" "mov %[_SYS_ioctl], %%rax;" "mov %[_PERF_EVENT_IOC_RESET], %%rsi;" "syscall;" // From this point on all conditional branches count! "cmp $-4095, %%rax;" "jae 2f;" // Reset the counter period to the desired value. "mov %[_SYS_ioctl], %%rax;" "mov %[_PERF_EVENT_IOC_PERIOD], %%rsi;" "mov %[period], %%rdx;" "syscall;" "cmp $-4095, %%rax;" "jae 2f;" "mov %[_iterations], %%rax;" "1: dec %%rax;" // Multiply by 7. "mov %[accumulator], %%edx;" "shl $3, %[accumulator];" "sub %%edx, %[accumulator];" // Add 2. "add $2, %[accumulator];" // Mask off bits. "and $0xffffff, %[accumulator];" // And loop. "test %%rax, %%rax;" "jnz 1b;" "mov %[_PERF_EVENT_IOC_DISABLE], %%rsi;" "mov %[_SYS_ioctl], %%rax;" "xor %%rdx, %%rdx;" // We didn't touch rdi. "syscall;" "cmp $-4095, %%rax;" "jae 2f;" "movl $0, %[count];" "2: nop;" #elif defined(__i386__) "mov %[_SYS_ioctl], %%eax;" "mov %[raw_fd], %%ebx;" "xor %%edx, %%edx;" "mov %[_PERF_EVENT_IOC_ENABLE], %%ecx;" "int $0x80;" "cmp $-4095, %%eax;" "jae 2f;" "mov %[_SYS_ioctl], %%eax;" "mov %[_PERF_EVENT_IOC_RESET], %%ecx;" "int $0x80;" // From this point on all conditional branches count! "cmp $-4095, %%eax;" "jae 2f;" // Reset the counter period to the desired value. "mov %[_SYS_ioctl], %%eax;" "mov %[_PERF_EVENT_IOC_PERIOD], %%ecx;" "mov %[period], %%edx;" "int $0x80;" "cmp $-4095, %%eax;" "jae 2f;" "mov %[_iterations], %%eax;" "1: dec %%eax;" // Multiply by 7. "mov %[accumulator], %%edx;" "shll $3, %[accumulator];" "sub %%edx, %[accumulator];" // Add 2. "add $2, %[accumulator];" // Mask off bits. "andl $0xffffff, %[accumulator];" // And loop. "test %%eax, %%eax;" "jnz 1b;" "mov %[_PERF_EVENT_IOC_DISABLE], %%ecx;" "mov %[_SYS_ioctl], %%eax;" "xor %%edx, %%edx;" // We didn't touch rdi. 
"int $0x80;" "cmp $-4095, %%eax;" "jae 2f;" "movl $0, %[count];" "2: nop;" #else #error unknown CPU architecture #endif : [accumulator] "+rm"(accumulator), [count] "=rm"(count) : [_SYS_ioctl] "i"(SYS_ioctl), [_PERF_EVENT_IOC_DISABLE] "i"(PERF_EVENT_IOC_DISABLE), [_PERF_EVENT_IOC_ENABLE] "i"(PERF_EVENT_IOC_ENABLE), [_PERF_EVENT_IOC_PERIOD] "i"(PERF_EVENT_IOC_PERIOD), [_PERF_EVENT_IOC_RESET] "i"(PERF_EVENT_IOC_RESET), // The check for the failure of some of our ioctls is in // the measured region, so account for that when looping. [_iterations] "i"(NUM_BRANCHES - 2), [period] "rm"(&attr.sample_period), [raw_fd] "rm"(raw_fd) : #if defined(__x86_64__) "rax", "rdx", "rdi", "rsi" // `syscall` clobbers rcx and r11. , "rcx", "r11" #elif defined(__i386__) "eax", "ebx", "ecx", "edx" #else #error unknown CPU architecture #endif ); // If things worked above, `count` should have been set to 0. if (count == 0) { count = read_counter(fd); } // Use 'accumulator' so it can't be optimized out. accumulator_sink = accumulator; } has_xen_pmi_bug = count > NUM_BRANCHES || count == -1; if (has_xen_pmi_bug) { LOG(debug) << "has_xen_pmi_bug=" << has_xen_pmi_bug << " count=" << count; if (!Flags::get().force_things) { FATAL() << "Overcount triggered by PMU interrupts detected due to Xen PMU " "virtualization bug.\n" "Aborting. Retry with -F to override, but it will probably\n" "fail."; } } } static void check_for_zen_speclockmap() { // When the SpecLockMap optimization is not disabled, rr will not work // reliably (e.g. it would work fine on a single process with a single // thread, but not more). When the optimization is disabled, the // perf counter for retired lock instructions of type SpecLockMapCommit // (on PMC 0x25) stays at 0. // See more details at https://github.com/rr-debugger/rr/issues/2034. struct perf_event_attr attr; // 0x25 == RETIRED_LOCK_INSTRUCTIONS - Counts the number of retired locked instructions // + 0x08 == SPECLOCKMAPCOMMIT init_perf_event_attr(&attr, PERF_TYPE_RAW, 0x510825); ScopedFd fd = start_counter(0, -1, &attr); if (fd.is_open()) { int atomic = 0; int64_t count = read_counter(fd); // A lock add is known to increase the perf counter we're looking at. asm volatile("lock addl $1, %0": "+m" (atomic)); if (read_counter(fd) == count) { LOG(debug) << "SpecLockMap is disabled"; } else { LOG(debug) << "SpecLockMap is not disabled"; fprintf(stderr, "On Zen CPUs, rr will not work reliably unless you disable the " "hardware SpecLockMap optimization.\nFor instructions on how to " "do this, see https://github.com/rr-debugger/rr/wiki/Zen\n"); } } } static void check_for_arch_bugs(CpuMicroarch uarch) { if (uarch >= FirstIntel && uarch <= LastIntel) { check_for_kvm_in_txcp_bug(); check_for_xen_pmi_bug(); } if (uarch == AMDZen) { check_for_zen_speclockmap(); } } static bool always_recreate_counters() { // When we have the KVM IN_TXCP bug, reenabling the TXCP counter after // disabling it does not work. return has_ioc_period_bug || has_kvm_in_txcp_bug; } static void arch_check_restricted_counter() { if ((cpuid(CPUID_GETEXTENDEDFEATURES, 0).ebx & HLE_FEATURE_FLAG) && !Flags::get().suppress_environment_warnings) { fprintf(stderr, "Your CPU supports Hardware Lock Elision but you only have one\n" "hardware performance counter available. 
Record and replay\n" "of code that uses HLE will fail unless you alter your\n" "configuration to make more than one hardware performance counter\n" "available.\n"); } } template void PerfCounters::reset_arch_extras() { if (supports_txcp) { struct perf_event_attr attr = rr::ticks_attr; if (has_kvm_in_txcp_bug) { // IN_TXCP isn't going to work reliably. Assume that HLE/RTM are not // used, // and check that. attr.sample_period = 0; attr.config |= IN_TX; fd_ticks_in_transaction = start_counter(tid, fd_ticks_interrupt, &attr); } else { // Set up a separate counter for measuring ticks, which does not have // a sample period and does not count events during aborted // transactions. // We have to use two separate counters here because the kernel does // not support setting a sample_period with IN_TXCP, apparently for // reasons related to this Intel note on IA32_PERFEVTSEL2: // ``When IN_TXCP=1 & IN_TX=1 and in sampling, spurious PMI may // occur and transactions may continuously abort near overflow // conditions. Software should favor using IN_TXCP for counting over // sampling. If sampling, software should use large “sample-after“ // value after clearing the counter configured to use IN_TXCP and // also always reset the counter even when no overflow condition // was reported.'' attr.sample_period = 0; attr.config |= IN_TXCP; fd_ticks_measure = start_counter(tid, fd_ticks_interrupt, &attr); } } } rr-5.5.0/src/PreserveFileMonitor.h000066400000000000000000000015521412202446200170470ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PRESERVE_FILE_MONITOR_H_ #define RR_PRESERVE_FILE_MONITOR_H_ #include "FileMonitor.h" namespace rr { /** * A FileMonitor that does no monitoring of I/O itself, but prevents the file * descriptor from being closed (except via privileged syscalls made by * preload.c) or seen in /proc/pid/fd/. * * The mere existence of this monitor disables syscall buffering for the fd, so * we get syscall traps for close() etc on the fd. Then * rec_prepare_syscall_arch calls allow_close() to check whether closing is * allowed. */ class PreserveFileMonitor : public FileMonitor { public: PreserveFileMonitor() {} virtual Type type() override { return Preserve; } virtual bool is_rr_fd() override { return true; } }; } // namespace rr #endif /* RR_PRESERVE_FILE_MONITOR_H_ */ rr-5.5.0/src/ProcFdDirMonitor.cc000066400000000000000000000074351412202446200164340ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "ProcFdDirMonitor.h" #include #include #include "AutoRemoteSyscalls.h" #include "RecordSession.h" #include "RecordTask.h" #include "log.h" using namespace std; namespace rr { ProcFdDirMonitor::ProcFdDirMonitor(Task* t, const string& pathname) { // XXX this makes some assumptions about namespaces... 
Probably fails // if |t| is not in the same pid namespace as rr int ends_with_slash = (pathname.back() == '/'); if (pathname.substr(0, 6) == string("/proc/") && pathname.substr(pathname.size() - 3 - ends_with_slash, 3) == string("/fd")) { string s = pathname.substr(6, pathname.size() - 9 - ends_with_slash); char* end; int tid = strtol(s.c_str(), &end, 10); if (!*end) { Task* target = t->session().find_task(tid); if (target) { tuid = target->tuid(); } } } } // returns the number of valid dirent structs left in the buffer template <typename D> static int filter_dirent_structs(RecordTask* t, uint8_t* buf, size_t size) { int bytes = 0; size_t current_offset = 0; while (1) { if (current_offset == size) { break; } D* current_struct = reinterpret_cast<D*>(buf + current_offset); auto next_off = current_offset + current_struct->d_reclen; char* fname = (char*)current_struct->d_name; char* end; int fd = strtol(fname, &end, 10); if (!*end && t->fd_table()->is_rr_fd(fd)) { // Skip this entry. memmove(current_struct, buf + next_off, size - next_off); size -= (next_off - current_offset); next_off = current_offset; } else { // Either this is a tracee fd or not an fd at all (e.g. '.') bytes += current_struct->d_reclen; } current_offset = next_off; } return bytes; } template <typename Arch> static void filter_dirents_arch(RecordTask* t) { auto regs = t->regs(); remote_ptr<uint8_t> ptr(regs.arg2()); size_t len = regs.arg3(); if (regs.syscall_failed() || !regs.syscall_result()) { return; } while (1) { vector<uint8_t> buf = t->read_mem(ptr, len); int bytes = regs.syscall_result(); if (regs.original_syscallno() == Arch::getdents64) { bytes = filter_dirent_structs<typename Arch::dirent64>(t, buf.data(), bytes); } else { bytes = filter_dirent_structs<typename Arch::dirent>(t, buf.data(), bytes); } if (bytes > 0) { t->write_mem(ptr, buf.data(), bytes); regs.set_syscall_result(bytes); t->set_regs(regs); // Explicitly record what the kernel may have touched and we discarded, // because it's userspace modification that will not be caught otherwise. if (len > (size_t)bytes) { t->record_remote(ptr + bytes, len - bytes); } return; } // We filtered out all the entries, so we need to repeat the syscall. { AutoRemoteSyscalls remote(t); remote.syscall(regs.original_syscallno(), regs.arg1(), regs.arg2(), regs.arg3()); // Only copy over the syscall result. In particular, we don't want to // copy the AutoRemoteSyscalls ip(). regs.set_syscall_result(t->regs().syscall_result()); } if (regs.syscall_failed() || regs.syscall_result() == 0) { // Save the new syscall result, and record the buffer we will otherwise // ignore. t->record_remote(ptr, len); t->set_regs(regs); return; } } } static void filter_dirents(RecordTask* t) { RR_ARCH_FUNCTION(filter_dirents_arch, t->arch(), t); } void ProcFdDirMonitor::filter_getdents(RecordTask* t) { ASSERT(t, !t->session().is_replaying()); auto* target = static_cast<RecordTask*>(t->session().find_task(tuid)); if (!target) { return; } filter_dirents(t); } } // namespace rr rr-5.5.0/src/ProcFdDirMonitor.h000066400000000000000000000013441412202446200162700ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PROC_FD_DIR_MONITOR_H_ #define RR_PROC_FD_DIR_MONITOR_H_ #include "FileMonitor.h" #include "TaskishUid.h" namespace rr { /** * A FileMonitor to intercept enumerations of /proc/<tid>/fd so that entries * for rr's private fds can be hidden when <tid> is a tracee.
*/ class ProcFdDirMonitor : public FileMonitor { public: ProcFdDirMonitor(Task* t, const std::string& pathname); virtual Type type() override { return ProcFd; } virtual void filter_getdents(RecordTask* t) override; private: // 0 if this object doesn't refer to a tracee's /proc/<tid>/fd directory. TaskUid tuid; }; } // namespace rr #endif /* RR_PROC_FD_DIR_MONITOR_H_ */ rr-5.5.0/src/ProcMemMonitor.cc000066400000000000000000000040371412202446200161550ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "ProcMemMonitor.h" #include <stdlib.h> #include "AutoRemoteSyscalls.h" #include "RecordSession.h" #include "ReplaySession.h" #include "ReplayTask.h" #include "log.h" using namespace std; namespace rr { ProcMemMonitor::ProcMemMonitor(Task* t, const string& pathname) { // XXX this makes some assumptions about namespaces... Probably fails // if |t| is not in the same pid namespace as rr if (pathname.substr(0, 6) == string("/proc/") && pathname.substr(pathname.size() - 4, 4) == string("/mem")) { string s = pathname.substr(6, pathname.size() - 10); char* end; int tid = strtol(s.c_str(), &end, 10); if (!*end) { Task* target = t->session().find_task(tid); if (target) { auid = target->vm()->uid(); } } } } void ProcMemMonitor::did_write(Task* t, const std::vector<Range>& ranges, LazyOffset& lazy_offset) { if (ranges.empty()) { return; } int64_t offset = lazy_offset.retrieve(true); // In prior versions of rr, we recorded this directly into the trace. // If so, there's nothing to do here. if (t->session().is_replaying() && t->session().as_replay()->has_trace_quirk(TraceReader::ExplicitProcMem)) { return; } if (t->session().is_recording()) { // Nothing to do now (though we may have just recorded the offset) return; } auto* target = t->session().find_address_space(auid); if (!target) { return; } ReplayTask* task = static_cast<ReplayTask*>(target->first_running_task()); if (!task) { return; } for (auto& r : ranges) { auto bytes = t->read_mem(r.data.cast<uint8_t>(), r.length); remote_ptr<uint8_t> target_addr = offset; task->write_mem(target_addr, bytes.data(), r.length); target->maybe_update_breakpoints(task, target_addr, r.length); offset += r.length; } } bool ProcMemMonitor::target_is_vm(AddressSpace *vm) { return auid == vm->uid(); } } // namespace rr rr-5.5.0/src/ProcMemMonitor.h000066400000000000000000000021361412202446200160150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PROC_MEM_MONITOR_H_ #define RR_PROC_MEM_MONITOR_H_ #include "FileMonitor.h" #include "TaskishUid.h" namespace rr { /** * A FileMonitor to track writes to /proc/<tid>/mem so they can be replayed * when <tid> is a replayed tracee. */ class ProcMemMonitor : public FileMonitor { public: ProcMemMonitor(Task* t, const std::string& pathname); virtual Type type() override { return ProcMem; } // We need to PREVENT_SWITCH, since the timing of the write is otherwise // unpredictable from our perspective. virtual Switchable will_write(Task*) override { return PREVENT_SWITCH; } virtual void did_write(Task* t, const std::vector<Range>& ranges, LazyOffset& lazy_offset) override; virtual enum syscallbuf_fd_classes get_syscallbuf_class() override { return FD_CLASS_PROC_MEM; } bool target_is_vm(AddressSpace *t); private: // 0 if this object doesn't refer to a tracee's proc-mem.
AddressSpaceUid auid; }; } // namespace rr #endif /* RR_PROC_MEM_MONITOR_H_ */ rr-5.5.0/src/ProcStatMonitor.cc000066400000000000000000000037161412202446200163550ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include <ctype.h> #include <stdlib.h> #include <string.h> #include <fstream> #include "ProcStatMonitor.h" #include "RecordTask.h" #include "RecordSession.h" #include "Scheduler.h" #include "log.h" #include "util.h" using namespace std; namespace rr { // Skip any lines that contain CPUs not in our cpu mask static void filter_proc_stat(string& data, const cpu_set_t& active) { string::iterator pos = data.begin(); while (pos + 4 < data.end()) { const char *cur_data = &*pos; static char cpu_str[] = "cpu"; if (memcmp(cur_data, cpu_str, sizeof(cpu_str)-1) == 0 && isdigit(*(cur_data + 3))) { unsigned long cpu = strtoul((char*)cur_data + 3, NULL, 10); if (!CPU_ISSET(cpu, &active)) { pos = data.erase(pos, ++std::find(pos, data.end(), '\n')); continue; } } pos = ++std::find(pos, data.end(), '\n'); } } ProcStatMonitor::ProcStatMonitor(Task* t, const string&) { if (t->session().is_replaying()) return; // Grab all the data now and buffer it for later access. This matches what the // kernel does (except that it does the buffering on first access) and is // required to give userspace code a consistent view of the file. std::ifstream proc_stat("/proc/stat"); if (!proc_stat.is_open()) { FATAL() << "Failed to process /proc/stat"; } data = string( (std::istreambuf_iterator<char>(proc_stat)), (std::istreambuf_iterator<char>())); const cpu_set_t cpus = static_cast<RecordTask*>(t)->session().scheduler().pretend_affinity_mask(); filter_proc_stat(data, cpus); } bool ProcStatMonitor::emulate_read( RecordTask* t, const vector<Range>& ranges, LazyOffset& lazy_offset, uint64_t* result) { int64_t offset = lazy_offset.retrieve(false); *result = t->write_ranges(ranges, (uint8_t*)data.data() + offset, (offset > (ssize_t)data.size()) ? 0 : data.size() - offset); return true; } } // namespace rr rr-5.5.0/src/ProcStatMonitor.h000066400000000000000000000016351412202446200162150ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PROC_STAT_MONITOR_H_ #define RR_PROC_STAT_MONITOR_H_ #include "FileMonitor.h" namespace rr { /** * A FileMonitor to intercept /proc/stat in order to pretend to the * tracee that it only has the CPUs that rr is willing to give it. * This is necessary on top of the SysCpuMonitor, because some versions * of glibc have bugs that cause it to fail to parse the * /sys/devices/system/cpu/online format, causing them to fall back to /proc/stat */ class ProcStatMonitor : public FileMonitor { public: ProcStatMonitor(Task* t, const std::string& pathname); virtual Type type() override { return ProcStat; } bool emulate_read(RecordTask* t, const std::vector<Range>& ranges, LazyOffset&, uint64_t* result) override; private: std::string data; }; } // namespace rr #endif /* RR_PROC_STAT_MONITOR_H_ */ rr-5.5.0/src/PropertyTable.h000066400000000000000000000044001412202446200156730ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_PROPERTY_TABLE_H_ #define RR_PROPERTY_TABLE_H_ #include <memory> #include <utility> #include <vector> namespace rr { template <typename Object, typename T> class Property; /** * A PropertyTable is a heterogeneously-typed set of property values. * It maps Property<Object, T> (effectively, property names) to values of * type T. It owns the property values.
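 * For example (a sketch; `MyObject` is a hypothetical class exposing the
 * `PropertyTable& properties()` accessor these methods require):
 *
 *   static Property<MyObject, int> hit_count;
 *   int& h = hit_count.get_or_create(obj);  // allocated on first use
 *   hit_count.remove(obj);                  // destroys the stored value
 *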
* Property values can be created, accessed and removed in a type-safe way * via the Property class. */ class PropertyTable { public: PropertyTable() {} ~PropertyTable() { for (auto& p : values) { p.first->destroy_property(p.second); } } private: template <typename Object, typename T> friend class Property; class PropertyBase { public: virtual ~PropertyBase() {} virtual void destroy_property(void* v) const = 0; }; std::vector<std::pair<const PropertyBase*, void*>> values; }; /** * Create an instance of this class to declare a property name. * The methods of this class call properties() on their Object parameter to * get the PropertyTable. */ template <typename Object, typename T> class Property : protected PropertyTable::PropertyBase { public: Property() {} T& create(Object& o) const { DEBUG_ASSERT(!get(o)); T* t = new T(); o.properties().values.push_back(std::make_pair( static_cast<const PropertyBase*>(this), t)); return *t; } T* get(Object& o) const { for (auto& p : o.properties().values) { if (p.first == this) { return static_cast<T*>(p.second); } } return nullptr; } T& get_or_create(Object& o) const { T* t = get(o); if (t) { return *t; } return create(o); } std::unique_ptr<T> remove(Object& o) const { auto& values = o.properties().values; std::unique_ptr<T> result; for (auto it = values.begin(); it != values.end(); ++it) { if (it->first == this) { result = std::unique_ptr<T>(static_cast<T*>(it->second)); values.erase(it); break; } } return result; } protected: virtual void destroy_property(void* v) const override { delete static_cast<T*>(v); } }; } // namespace rr #endif /* RR_PROPERTY_TABLE_H_ */ rr-5.5.0/src/PsCommand.cc000066400000000000000000000116251412202446200151250ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include <map> #include "Command.h" #include "TraceStream.h" #include "TraceTaskEvent.h" #include "core.h" #include "main.h" using namespace std; namespace rr { class PsCommand : public Command { public: virtual int run(vector<string>& args) override; protected: PsCommand(const char* name, const char* help) : Command(name, help) {} static PsCommand singleton; }; PsCommand PsCommand::singleton("ps", " rr ps [<trace_dir>]\n"); static void print_exec_cmd_line(const TraceTaskEvent& event, FILE* out) { bool first = true; for (auto& word : event.cmd_line()) { fprintf(out, "%s%s", first ? "" : " ", word.c_str()); first = false; } fprintf(out, "\n"); } static void update_tid_to_pid_map(map<pid_t, pid_t>& tid_to_pid, const TraceTaskEvent& e) { if (e.type() == TraceTaskEvent::CLONE) { if (e.clone_flags() & CLONE_THREAD) { // thread clone. Record thread's pid. tid_to_pid[e.tid()] = tid_to_pid[e.parent_tid()]; } else { // Some kind of fork. This task is its own pid.
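// (A clone without CLONE_THREAD; plain fork/vfork events take this path
// too, with no clone flags set.)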
tid_to_pid[e.tid()] = e.tid(); } } else if (e.type() == TraceTaskEvent::EXIT) { tid_to_pid.erase(e.tid()); } } static int count_tids_for_pid(const std::map tid_to_pid, pid_t pid) { int count = 0; for (auto& tp : tid_to_pid) { if (tp.second == pid) { ++count; } } return count; } static ssize_t find_cmd_line(pid_t pid, const vector& events, size_t current_event, const map current_tid_to_pid) { map tid_to_pid = current_tid_to_pid; for (size_t i = current_event; i < events.size(); ++i) { const TraceTaskEvent& e = events[i]; if (e.type() == TraceTaskEvent::EXEC && tid_to_pid[e.tid()] == pid) { return i; } if (e.type() == TraceTaskEvent::EXIT && tid_to_pid[e.tid()] == pid && count_tids_for_pid(tid_to_pid, pid) == 1) { return -1; } update_tid_to_pid_map(tid_to_pid, e); } return -1; } static string find_exit_code(pid_t pid, const vector& events, size_t current_event, const map current_tid_to_pid) { map tid_to_pid = current_tid_to_pid; for (size_t i = current_event; i < events.size(); ++i) { const TraceTaskEvent& e = events[i]; if (e.type() == TraceTaskEvent::EXIT && tid_to_pid[e.tid()] == pid && count_tids_for_pid(tid_to_pid, pid) == 1) { WaitStatus status = e.exit_status(); if (status.type() == WaitStatus::EXIT) { return to_string(status.exit_code()); } DEBUG_ASSERT(status.type() == WaitStatus::FATAL_SIGNAL); return to_string(-status.fatal_sig()); } else if (e.type() == TraceTaskEvent::DETACH && tid_to_pid[e.tid()] == pid && count_tids_for_pid(tid_to_pid, pid) == 1) { return string("detach"); } update_tid_to_pid_map(tid_to_pid, e); } return string("none"); } static int ps(const string& trace_dir, FILE* out) { TraceReader trace(trace_dir); fprintf(out, "PID\tPPID\tEXIT\tCMD\n"); vector events; while (true) { TraceTaskEvent r = trace.read_task_event(); if (r.type() == TraceTaskEvent::NONE) { break; } events.push_back(r); } if (events.empty() || events[0].type() != TraceTaskEvent::EXEC) { fprintf(stderr, "Invalid trace\n"); return 1; } map tid_to_pid; pid_t initial_tid = events[0].tid(); tid_to_pid[initial_tid] = initial_tid; fprintf(out, "%d\t--\t%s\t", initial_tid, find_exit_code(initial_tid, events, 0, tid_to_pid).c_str()); print_exec_cmd_line(events[0], out); for (size_t i = 1; i < events.size(); ++i) { auto& e = events[i]; update_tid_to_pid_map(tid_to_pid, e); if (e.type() == TraceTaskEvent::CLONE && !(e.clone_flags() & CLONE_THREAD)) { pid_t pid = tid_to_pid[e.tid()]; fprintf(out, "%d", e.tid()); if (e.own_ns_tid() != e.tid()) { fprintf(out, " (%d)", e.own_ns_tid()); } fprintf(out, "\t%d\t%s\t", tid_to_pid[e.parent_tid()], find_exit_code(pid, events, i, tid_to_pid).c_str()); ssize_t cmd_line_index = find_cmd_line(pid, events, i, tid_to_pid); if (cmd_line_index < 0) { // The main thread exited. All other threads must too, so there // is no more opportunity for e's pid to exec. 
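// Such a task prints as, e.g. (hypothetical pids), under the
// PID/PPID/EXIT/CMD header: 4711 4709 0 (forked without exec)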
fprintf(out, "(forked without exec)\n"); } else { print_exec_cmd_line(events[cmd_line_index], out); } } } return 0; } int PsCommand::run(vector& args) { while (parse_global_option(args)) { } string trace_dir; if (!parse_optional_trace_dir(args, &trace_dir)) { print_help(stderr); return 1; } return ps(trace_dir, stdout); } } // namespace rr rr-5.5.0/src/RRPageMonitor.h000066400000000000000000000013451412202446200155740ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_RR_PAGE_MONITOR_H_ #define RR_RR_PAGE_MONITOR_H_ #include "FileMonitor.h" #include "TraceStream.h" namespace rr { /** * RRPageMonitor gets installed upon any open of the librrpage.so preload library. * If this file gets mmaped, rr will attempt to map it to coincide with the * required fixed location for the rr page. */ class RRPageMonitor : public FileMonitor { public: RRPageMonitor() : FileMonitor() {}; virtual Type type() override { return RRPage; } }; static_assert(TraceReader::SpecialLibRRpage != 0, "Remember to delete this if support for the quirk is ever dropped"); } // namespace rr #endif /* RR_RR_PAGE_MONITOR_H_ */ rr-5.5.0/src/RecordCommand.cc000066400000000000000000000652601412202446200157650ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "RecordCommand.h" #include #include #include #include #include #include #include "preload/preload_interface.h" #include "Flags.h" #include "RecordSession.h" #include "StringVectorToCharArray.h" #include "WaitStatus.h" #include "core.h" #include "git_revision.h" #include "kernel_metadata.h" #include "log.h" #include "main.h" #include "util.h" using namespace std; namespace rr { RecordCommand RecordCommand::singleton( "record", " rr record [OPTION]... [exe-args]...\n" " -c, --num-cpu-ticks= maximum number of 'CPU ticks' (currently \n" " retired conditional branches) to allow a \n" " task to run before interrupting it\n" " --disable-avx-512 Masks out the CPUID bits for AVX512\n" " This can improve trace portability\n" " --disable-cpuid-features [,]\n" " Mask out CPUID EAX=1 feature bits\n" " : Bitmask of bits to clear from ECX\n" " : Bitmask of bits to clear from EDX\n" " --disable-cpuid-features-ext [,[,]]\n" " Mask out CPUID EAX=7,ECX=0 feature bits\n" " : Bitmask of bits to clear from EBX\n" " : Bitmask of bits to clear from ECX\n" " : Bitmask of bits to clear from EDX\n" " --disable-cpuid-features-xsave \n" " Mask out CPUID EAX=0xD,ECX=1 feature bits\n" " : Bitmask of bits to clear from EAX\n" " -h, --chaos randomize scheduling decisions to try to \n" " reproduce bugs\n" " -n, --no-syscall-buffer disable the syscall buffer preload \n" " library even if it would otherwise be used\n" " --no-file-cloning disable file cloning for mmapped files\n" " --no-read-cloning disable file-block cloning for syscallbuf\n" " reads\n" " --num-cores=N pretend to have N cores (rr will still\n" " only run on a single core). Overrides\n" " random setting from --chaos.\n" " -o, --output-trace-dir set the output trace directory.\n" " _RR_TRACE_DIR gets ignored.\n" " Directory name is given name, not the\n" " application name.\n" " -p --print-trace-dir= print trace directory followed by a newline\n" " to given file descriptor\n" " --syscall-buffer-sig= the signal used for communication with the\n" " syscall buffer. 
SIGPWR by default, unused\n" " if --no-syscall-buffer is passed\n" " -t, --continue-through-signal=\n" " Unhandled signals will be ignored\n" " instead of terminating the program. The\n" " signal will still be delivered for user\n" " handlers and debugging.\n" " -u, --cpu-unbound allow tracees to run on any virtual CPU.\n" " Default is to bind to a random CPU. This " "option\n" " can cause replay divergence: use with\n" " caution.\n" " --bind-to-cpu= Bind to a particular CPU\n" " instead of a randomly chosen one.\n" " -v, --env=NAME=VALUE value to add to the environment of the\n" " tracee. There can be any number of these.\n" " -w, --wait Wait for all child processes to exit, not\n" " just the initial process.\n" " --nested= Control behavior when run inside an outer\n" " rr recording. Default: exit with error\n" " --nested=ignore Directly start child process so it's part\n" " of the outer recording\n" " --nested=detach Start a separate recording session.\n" " Must not share memory with the outer.\n" " --nested=release Run the child without recording it.\n" " Must not share memory with the outer.\n" " --setuid-sudo If running under sudo, pretend to be the\n" " user that ran sudo rather than root. This\n" " allows recording setuid/setcap binaries.\n" " --trace-id Sets the trace id to the specified id.\n" " --copy-preload-src Copy preload sources to trace dir\n" " --stap-sdt Enables the use of SystemTap statically-\n" " defined tracepoints\n" " --asan Override heuristics and always enable ASAN\n" " compatibility.\n"); struct RecordFlags { vector extra_env; /* Max counter value before the scheduler interrupts a tracee. */ Ticks max_ticks; /* Whenever |ignore_sig| is pending for a tracee, decline to * deliver it. */ int ignore_sig; /* Whenever |continue_through_sig| is delivered to a tracee, if there is no * user handler and the signal would terminate the program, just ignore it. */ int continue_through_sig; /* Whether to use syscall buffering optimization during recording. */ RecordSession::SyscallBuffering use_syscall_buffer; /* If nonzero, the desired syscall buffer size. Must be a multiple of the page * size. */ size_t syscall_buffer_size; /* CPUID features to disable */ DisableCPUIDFeatures disable_cpuid_features; int print_trace_dir; string output_trace_dir; /* Whether to use file-cloning optimization during recording. */ bool use_file_cloning; /* Whether to use read-cloning optimization during recording. */ bool use_read_cloning; /* Whether tracee processes in record and replay are allowed * to run on any logical CPU. */ BindCPU bind_cpu; /* True if we should context switch after every rr event */ bool always_switch; /* Whether to enable chaos mode in the scheduler */ bool chaos; /* Controls number of cores reported to recorded process. */ int num_cores; /* True if we should wait for all processes to exit before finishing * recording. */ bool wait_for_all; /* Start child process directly if run under nested rr recording */ NestedBehavior nested; bool scarce_fds; bool setuid_sudo; unique_ptr trace_id; /* Copy preload sources to trace dir */ bool copy_preload_src; /* The signal to use for syscallbuf desched events */ int syscallbuf_desched_sig; /* True if we should load the audit library for SystemTap SDT support. */ bool stap_sdt; /* True if we should unmap the vdso */ bool unmap_vdso; /* True if we should always enable ASAN compatibility. 
*/ bool asan; RecordFlags() : max_ticks(Scheduler::DEFAULT_MAX_TICKS), ignore_sig(0), continue_through_sig(0), use_syscall_buffer(RecordSession::ENABLE_SYSCALL_BUF), syscall_buffer_size(0), print_trace_dir(-1), output_trace_dir(""), use_file_cloning(true), use_read_cloning(true), bind_cpu(BIND_CPU), always_switch(false), chaos(false), num_cores(0), wait_for_all(false), nested(NESTED_ERROR), scarce_fds(false), setuid_sudo(false), copy_preload_src(false), syscallbuf_desched_sig(SYSCALLBUF_DEFAULT_DESCHED_SIGNAL), stap_sdt(false), unmap_vdso(false), asan(false) {} }; static void parse_signal_name(ParsedOption& opt) { if (opt.int_value != INT64_MIN) { return; } for (int i = 1; i < _NSIG; i++) { std::string signame = signal_name(i); if (signame == opt.value) { opt.int_value = i; return; } DEBUG_ASSERT(signame[0] == 'S' && signame[1] == 'I' && signame[2] == 'G'); if (signame.substr(3) == opt.value) { opt.int_value = i; return; } } } static vector parse_feature_bits(ParsedOption& opt) { vector ret; const char* p = opt.value.c_str(); while (*p) { char* endptr; unsigned long long v = strtoull(p, &endptr, 0); if (v > UINT32_MAX || (*endptr && *endptr != ',')) { return vector(); } ret.push_back(v); p = *endptr == ',' ? endptr + 1 : endptr; } return ret; } static bool parse_record_arg(vector& args, RecordFlags& flags) { if (parse_global_option(args)) { return true; } static const OptionSpec options[] = { { 0, "no-read-cloning", NO_PARAMETER }, { 1, "no-file-cloning", NO_PARAMETER }, { 2, "syscall-buffer-size", HAS_PARAMETER }, { 3, "nested", HAS_PARAMETER }, { 4, "scarce-fds", NO_PARAMETER }, { 5, "setuid-sudo", NO_PARAMETER }, { 6, "bind-to-cpu", HAS_PARAMETER }, { 7, "disable-cpuid-features", HAS_PARAMETER }, { 8, "disable-cpuid-features-ext", HAS_PARAMETER }, { 9, "disable-cpuid-features-xsave", HAS_PARAMETER }, { 10, "num-cores", HAS_PARAMETER }, { 11, "trace-id", HAS_PARAMETER }, { 12, "copy-preload-src", NO_PARAMETER }, { 13, "syscall-buffer-sig", HAS_PARAMETER }, { 14, "stap-sdt", NO_PARAMETER }, { 15, "unmap-vdso", NO_PARAMETER }, { 16, "disable-avx-512", NO_PARAMETER }, { 17, "asan", NO_PARAMETER }, { 'c', "num-cpu-ticks", HAS_PARAMETER }, { 'h', "chaos", NO_PARAMETER }, { 'i', "ignore-signal", HAS_PARAMETER }, { 'n', "no-syscall-buffer", NO_PARAMETER }, { 'p', "print-trace-dir", HAS_PARAMETER }, { 'o', "output-trace-dir", HAS_PARAMETER }, { 's', "always-switch", NO_PARAMETER }, { 't', "continue-through-signal", HAS_PARAMETER }, { 'u', "cpu-unbound", NO_PARAMETER }, { 'v', "env", HAS_PARAMETER }, { 'w', "wait", NO_PARAMETER }}; ParsedOption opt; auto args_copy = args; if (!Command::parse_option(args_copy, options, &opt)) { return false; } switch (opt.short_name) { case 'c': if (!opt.verify_valid_int(1, Scheduler::MAX_MAX_TICKS)) { return false; } flags.max_ticks = opt.int_value; break; case 'h': LOG(info) << "Enabled chaos mode"; flags.chaos = true; break; case 'i': parse_signal_name(opt); if (!opt.verify_valid_int(1, _NSIG - 1)) { return false; } flags.ignore_sig = opt.int_value; break; case 'n': flags.use_syscall_buffer = RecordSession::DISABLE_SYSCALL_BUF; break; case 'p': if (!opt.verify_valid_int(0, INT32_MAX)) { return false; } flags.print_trace_dir = opt.int_value; break; case 'o': flags.output_trace_dir = opt.value; break; case 0: flags.use_read_cloning = false; break; case 1: flags.use_file_cloning = false; break; case 2: if (!opt.verify_valid_int(4, 1024 * 1024) || (opt.int_value & (page_size() / 1024 - 1))) { return false; } flags.syscall_buffer_size = opt.int_value * 1024; break; 
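// (For --syscall-buffer-size above: the value is given in KiB, so e.g.
// `--syscall-buffer-size=512` requests a 512 KiB buffer. The mask test
// rejects values that aren't a multiple of page_size() / 1024 KiB, i.e.
// sizes that wouldn't be a whole number of pages once scaled to bytes.)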
case 3: if (opt.value == "default" || opt.value == "error") { flags.nested = NESTED_ERROR; } else if (opt.value == "ignore") { flags.nested = NESTED_IGNORE; } else if (opt.value == "detach") { flags.nested = NESTED_DETACH; } else if (opt.value == "release") { flags.nested = NESTED_RELEASE; } else { LOG(warn) << "Unknown nesting behavior `" << opt.value << "`"; flags.nested = NESTED_ERROR; } break; case 4: flags.scarce_fds = true; break; case 5: flags.setuid_sudo = true; break; case 6: if (!opt.verify_valid_int(0, INT32_MAX)) { return false; } flags.bind_cpu = BindCPU(opt.int_value); break; case 7: { vector<uint32_t> bits = parse_feature_bits(opt); if (bits.empty() || bits.size() > 2) { return false; } flags.disable_cpuid_features.features_ecx = bits[0]; if (bits.size() > 1) { flags.disable_cpuid_features.features_edx = bits[1]; } break; } case 8: { vector<uint32_t> bits = parse_feature_bits(opt); if (bits.empty() || bits.size() > 3) { return false; } flags.disable_cpuid_features.extended_features_ebx = bits[0]; if (bits.size() > 1) { flags.disable_cpuid_features.extended_features_ecx = bits[1]; if (bits.size() > 2) { flags.disable_cpuid_features.extended_features_edx = bits[2]; } } break; } case 9: { vector<uint32_t> bits = parse_feature_bits(opt); if (bits.size() != 1) { return false; } flags.disable_cpuid_features.xsave_features_eax = bits[0]; break; } case 10: { if (!opt.verify_valid_int(1, 128)) { return false; } flags.num_cores = opt.int_value; break; } case 11: { const uint8_t SUM_GROUP_LENS[5] = { 8, 12, 16, 20, 32 }; /* Parse UUIDs from string form optionally with hyphens */ uint8_t digit = 0; // This counts only hex digits (i.e. not hyphens) uint8_t group = 0; uint8_t acc = 0; unique_ptr<TraceUuid> buf(new TraceUuid); auto it = opt.value.begin(); while (it < opt.value.end()) { auto c = *it; if (digit > SUM_GROUP_LENS[4]) { return false; } if (digit % 2 == 0) { // First digit of the byte. if ('0' <= c && c <= '9') { acc = c - '0'; } else if ('a' <= c && c <= 'f') { acc = c - 'a' + 10; } else if ('A' <= c && c <= 'F') { acc = c - 'A' + 10; } else if (c == '-') { // Group delimiter. if (SUM_GROUP_LENS[group] != digit) { return false; } ++group; ++it; continue; } else { return false; } } else { // Second digit of the byte.
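// Shift the previously parsed high nibble up to make room for this digit.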
acc <<= 4; if ('0' <= c && c <= '9') { acc += c - '0'; } else if ('a' <= c && c <= 'f') { acc += c - 'a' + 10; } else if ('A' <= c && c <= 'F') { acc += c - 'A' + 10; } else { return false; } buf->bytes[digit / 2] = acc; } ++digit; ++it; } if (SUM_GROUP_LENS[4] != digit) { return false; } flags.trace_id.swap(buf); break; } case 12: flags.copy_preload_src = true; break; case 13: parse_signal_name(opt); if (!opt.verify_valid_int(1, _NSIG - 1)) { return false; } flags.syscallbuf_desched_sig = opt.int_value; break; case 14: flags.stap_sdt = true; break; case 15: flags.unmap_vdso = true; break; case 16: flags.disable_cpuid_features.extended_features_ebx |= 0xdc230000; flags.disable_cpuid_features.extended_features_ecx |= 0x00002c42; flags.disable_cpuid_features.extended_features_edx |= 0x0000000c; break; case 17: flags.asan = true; break; case 's': flags.always_switch = true; break; case 't': parse_signal_name(opt); if (!opt.verify_valid_int(1, _NSIG - 1)) { return false; } flags.continue_through_sig = opt.int_value; break; case 'u': flags.bind_cpu = UNBOUND_CPU; break; case 'v': flags.extra_env.push_back(opt.value); break; case 'w': flags.wait_for_all = true; break; default: DEBUG_ASSERT(0 && "Unknown option"); } args = args_copy; return true; } static volatile double term_requested; /** * A terminating signal was received. * * If a term request has been pending for more than one second, * then assume rr is wedged and abort(). * * Note that this is not only called in a signal handler but it could * be called off the main thread. */ static void handle_SIGTERM(__attribute__((unused)) int sig) { // Don't use LOG() here because we're in a signal handler. If we do anything // that could allocate, we could deadlock. if (term_requested > 0) { double now = monotonic_now_sec(); if (now - term_requested > 1) { static const char msg[] = "Received SIGTERM while an earlier one was pending. We're " "probably wedged.\n"; write_all(STDERR_FILENO, msg, sizeof(msg) - 1); notifying_abort(); } } else { term_requested = monotonic_now_sec(); } } /** * Something segfaulted - this is probably a bug in rr. Try to at least * give a stacktrace. */ static void handle_SIGSEGV(__attribute__((unused)) int sig) { static const char msg[] = "rr itself crashed (SIGSEGV). This shouldn't happen!\n"; write_all(STDERR_FILENO, msg, sizeof(msg) - 1); notifying_abort(); } static void install_signal_handlers(void) { struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_handler = handle_SIGTERM; sigaction(SIGTERM, &sa, nullptr); sa.sa_handler = handle_SIGSEGV; sigaction(SIGSEGV, &sa, nullptr); sa.sa_handler = SIG_IGN; sigaction(SIGHUP, &sa, nullptr); sigaction(SIGINT, &sa, nullptr); sigaction(SIGABRT, &sa, nullptr); sigaction(SIGQUIT, &sa, nullptr); } static void setup_session_from_flags(RecordSession& session, const RecordFlags& flags) { session.scheduler().set_max_ticks(flags.max_ticks); session.scheduler().set_always_switch(flags.always_switch); session.set_enable_chaos(flags.chaos); if (flags.num_cores) { // Set the number of cores reported, possibly overriding the chaos mode // setting. 
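// (Chaos mode may otherwise have picked a randomized pretend core count;
// an explicit --num-cores takes precedence.)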
session.set_num_cores(flags.num_cores); } session.set_use_read_cloning(flags.use_read_cloning); session.set_use_file_cloning(flags.use_file_cloning); session.set_ignore_sig(flags.ignore_sig); session.set_continue_through_sig(flags.continue_through_sig); session.set_wait_for_all(flags.wait_for_all); if (flags.syscall_buffer_size > 0) { session.set_syscall_buffer_size(flags.syscall_buffer_size); } if (flags.scarce_fds) { for (int i = 0; i < 950; ++i) { open("/dev/null", O_RDONLY); } } } static RecordSession* static_session; // This can be called during debugging to close the trace so it can be used // later. void force_close_record_session() { if (static_session) { static_session->close_trace_writer(TraceWriter::CLOSE_ERROR); } } static void copy_preload_sources_to_trace(const string& trace_dir) { string files_dir = trace_dir + "/files.rr"; mkdir(files_dir.c_str(), 0700); pid_t pid; string dest_path = files_dir + "/librrpreload.zip"; string src_path = resource_path() + "share/rr/src"; char zip[] = "zip"; char r[] = "-r"; char j[] = "-j"; char* argv[] = { zip, r, j, const_cast(dest_path.c_str()), const_cast(src_path.c_str()), NULL }; posix_spawn_file_actions_t actions; posix_spawn_file_actions_init(&actions); posix_spawn_file_actions_addopen(&actions, STDOUT_FILENO, "/dev/null", O_RDONLY, 0); posix_spawn_file_actions_addopen(&actions, STDERR_FILENO, "/dev/null", O_RDONLY, 0); int ret = posix_spawnp(&pid, argv[0], &actions, NULL, argv, environ); if (ret) { FATAL() << "Can't spawn 'zip'"; } posix_spawn_file_actions_destroy(&actions); int status; waitpid(pid, &status, 0); LOG(info) << "Got zip status " << WaitStatus(status); } static void save_rr_git_revision(const string& trace_dir) { string files_dir = trace_dir + "/files.rr"; mkdir(files_dir.c_str(), 0700); string dest_path = files_dir + "/rr_git_revision"; ScopedFd fd(dest_path.c_str(), O_CREAT | O_WRONLY, 0600); ssize_t written = write(fd, GIT_REVISION, sizeof(GIT_REVISION) - 1); if (written != sizeof(GIT_REVISION) - 1) { FATAL() << "Can't write GIT_REVISION"; } } static WaitStatus record(const vector& args, const RecordFlags& flags) { LOG(info) << "Start recording..."; auto session = RecordSession::create( args, flags.extra_env, flags.disable_cpuid_features, flags.use_syscall_buffer, flags.syscallbuf_desched_sig, flags.bind_cpu, flags.output_trace_dir, flags.trace_id.get(), flags.stap_sdt, flags.unmap_vdso, flags.asan); setup_session_from_flags(*session, flags); static_session = session.get(); if (flags.print_trace_dir >= 0) { const string& dir = session->trace_writer().dir(); write_all(flags.print_trace_dir, dir.c_str(), dir.size()); write_all(flags.print_trace_dir, "\n", 1); } if (flags.copy_preload_src) { const string& dir = session->trace_writer().dir(); copy_preload_sources_to_trace(dir); save_rr_git_revision(dir); } // Install signal handlers after creating the session, to ensure they're not // inherited by the tracee. 
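// (execve resets handled signals to SIG_DFL, but the SIG_IGN dispositions
// installed below would survive the tracee's execve if they were set up
// before spawning it.)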
install_signal_handlers(); RecordSession::RecordResult step_result; bool did_term_detached_tasks = false; do { bool done_initial_exec = session->done_initial_exec(); step_result = session->record_step(); // Only create latest-trace symlink if --output-trace-dir is not being used if (!done_initial_exec && session->done_initial_exec() && flags.output_trace_dir.empty()) { session->trace_writer().make_latest_trace(); } if (term_requested) { session->terminate_tracees(); if (!did_term_detached_tasks) { session->term_detached_tasks(); did_term_detached_tasks = true; } } } while (step_result.status == RecordSession::STEP_CONTINUE); session->close_trace_writer(TraceWriter::CLOSE_OK); static_session = nullptr; switch (step_result.status) { case RecordSession::STEP_CONTINUE: // SIGTERM interrupted us. return WaitStatus::for_fatal_sig(SIGTERM); case RecordSession::STEP_EXITED: return step_result.exit_status; case RecordSession::STEP_SPAWN_FAILED: cerr << "\n" << step_result.failure_message << "\n"; return WaitStatus::for_exit_code(EX_UNAVAILABLE); default: DEBUG_ASSERT(0 && "Unknown exit status"); return WaitStatus(); } } static void exec_child(vector& args) { execvp(args[0].c_str(), StringVectorToCharArray(args).get()); // That failed. Try executing the file directly. execv(args[0].c_str(), StringVectorToCharArray(args).get()); switch (errno) { case ENOENT: fprintf(stderr, "execv failed: '%s' (or interpreter) not found (%s)", args[0].c_str(), errno_name(errno).c_str()); break; default: fprintf(stderr, "execv of '%s' failed (%s)", args[0].c_str(), errno_name(errno).c_str()); break; } _exit(1); // Never returns! } static void reset_uid_sudo() { // Let's change our uids now. We do keep capabilities though, since that's // the point of the exercise. The first exec will reset both the keepcaps, // and the capabilities in the child std::string sudo_uid = getenv("SUDO_UID"); std::string sudo_gid = getenv("SUDO_GID"); DEBUG_ASSERT(!sudo_uid.empty() && !sudo_gid.empty()); uid_t tracee_uid = stoi(sudo_uid); gid_t tracee_gid = stoi(sudo_gid); // Setuid will drop effective capabilities. Save them now and set them // back after struct NativeArch::cap_header header = {.version = _LINUX_CAPABILITY_VERSION_3, .pid = 0 }; struct NativeArch::cap_data data[2]; if (syscall(NativeArch::capget, &header, data) != 0) { FATAL() << "FAILED to read capabilities"; } if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0)) { FATAL() << "FAILED to set keepcaps"; } if (setgid(tracee_gid) != 0) { FATAL() << "FAILED to setgid to sudo group"; } if (setuid(tracee_uid) != 0) { FATAL() << "FAILED to setuid to sudo user"; } if (syscall(NativeArch::capset, &header, data) != 0) { FATAL() << "FAILED to set capabilities"; } // Just make sure the ambient set is cleared, to avoid polluting the tracee prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0); } int RecordCommand::run(vector& args) { RecordFlags flags; while (parse_record_arg(args, flags)) { } if (running_under_rr()) { switch (flags.nested) { case NESTED_IGNORE: exec_child(args); return 1; case NESTED_DETACH: case NESTED_RELEASE: { int ret = syscall(SYS_rrcall_detach_teleport, (uintptr_t)0, (uintptr_t)0, (uintptr_t)0, (uintptr_t)0, (uintptr_t)0, (uintptr_t)0); if (ret < 0) { FATAL() << "Failed to detach from parent rr"; } if (running_under_rr(false)) { FATAL() << "Detaching from parent rr did not work"; } if (flags.nested == NESTED_RELEASE) { exec_child(args); return 1; } break; } default: fprintf(stderr, "rr: cannot run rr recording under rr. 
Exiting.\n" "Use `rr record --nested=ignore` to start the child " "process directly.\n"); return 1; } } if (!verify_not_option(args) || args.size() == 0) { print_help(stderr); return 1; } assert_prerequisites(flags.use_syscall_buffer); if (flags.setuid_sudo) { if (geteuid() != 0 || getenv("SUDO_UID") == NULL) { fprintf(stderr, "rr: --setuid-sudo option may only be used under sudo.\n" "Re-run as `sudo -EP --preserve-env=HOME rr record --setuid-sudo` to" "record privileged executables.\n"); return 1; } reset_uid_sudo(); } if (flags.chaos) { // Add up to one page worth of random padding to the environment to induce // a variety of possible stack pointer offsets vector chars; chars.resize(random() % page_size()); memset(chars.data(), '0', chars.size()); chars.push_back(0); string padding = string("RR_CHAOS_PADDING=") + chars.data(); flags.extra_env.push_back(padding); } WaitStatus status = record(args, flags); // Everything should have been cleaned up by now. check_for_leaks(); switch (status.type()) { case WaitStatus::EXIT: return status.exit_code(); case WaitStatus::FATAL_SIGNAL: signal(status.fatal_sig(), SIG_DFL); prctl(PR_SET_DUMPABLE, 0); kill(getpid(), status.fatal_sig()); break; default: FATAL() << "Don't know why we exited: " << status; break; } return 1; } } // namespace rr rr-5.5.0/src/RecordCommand.h000066400000000000000000000010521412202446200156140ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_RECORD_COMMAND_H_ #define RR_RECORD_COMMAND_H_ #include "Command.h" namespace rr { void force_close_record_session(); class RecordCommand : public Command { public: virtual int run(std::vector& args) override; static RecordCommand* get() { return &singleton; } protected: RecordCommand(const char* name, const char* help) : Command(name, help) {} static RecordCommand singleton; }; } // namespace rr #endif // RR_RECORD_COMMAND_H_ rr-5.5.0/src/RecordSession.cc000066400000000000000000002733261412202446200160360ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "RecordSession.h" #include #include #include #include #include #include #include #include #include #include "AutoRemoteSyscalls.h" #include "ElfReader.h" #include "Flags.h" #include "RecordTask.h" #include "VirtualPerfCounterMonitor.h" #include "core.h" #include "ftrace.h" #include "kernel_metadata.h" #include "kernel_supplement.h" #include "log.h" #include "record_signal.h" #include "record_syscall.h" #include "seccomp-bpf.h" namespace rr { // Undef si_addr_lsb since it's an alias for a field name that doesn't exist, // and we need to use the actual field name. #ifdef si_addr_lsb #undef si_addr_lsb #endif using namespace rr; using namespace std; template static remote_ptr mask_low_bit(remote_ptr p) { return p.as_int() & ~uintptr_t(1); } template static void record_robust_futex_change( RecordTask* t, const typename Arch::robust_list_head& head, remote_ptr base) { if (base.is_null()) { return; } remote_ptr futex_void_ptr = base + head.futex_offset; auto futex_ptr = futex_void_ptr.cast(); // We can't just record the current futex value because at this point // in task exit the robust futex handling has not happened yet. So we have // to emulate what the kernel will do! 
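// (Sketch of the kernel's handle_futex_death(), which this emulates: when
// the futex value's FUTEX_TID_MASK bits equal the dying thread's tid, the
// kernel stores (val & FUTEX_WAITERS) | FUTEX_OWNER_DIED back to the futex
// and wakes a waiter. The checks and the store below mirror that update.)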
bool ok = true; uint32_t val = t->read_mem(futex_ptr, &ok); if (!ok) { return; } if (pid_t(val & FUTEX_TID_MASK) != t->own_namespace_rec_tid) { return; } val = (val & FUTEX_WAITERS) | FUTEX_OWNER_DIED; // Update memory now so that the kernel doesn't decide to do it later, at // a time that might race with other tracee execution. t->write_mem(futex_ptr, val); t->record_local(futex_ptr, &val); } /** * Any user-space writes performed by robust futex handling are captured here. * They must be emulated during replay; the kernel will not do it for us * during replay because the TID value in each futex is the recorded * TID, not the actual TID of the dying task. */ template <typename Arch> static void record_robust_futex_changes_arch(RecordTask* t) { if (t->did_record_robust_futex_changes) { return; } t->did_record_robust_futex_changes = true; auto head_ptr = t->robust_list().cast<typename Arch::robust_list_head>(); if (head_ptr.is_null()) { return; } ASSERT(t, t->robust_list_len() == sizeof(typename Arch::robust_list_head)); bool ok = true; auto head = t->read_mem(head_ptr, &ok); if (!ok) { return; } record_robust_futex_change(t, head, mask_low_bit(head.list_op_pending.rptr())); for (auto current = mask_low_bit(head.list.next.rptr()); current.as_int() != head_ptr.as_int();) { record_robust_futex_change(t, head, current); auto next = t->read_mem(current, &ok); if (!ok) { return; } current = mask_low_bit(next.next.rptr()); } } static void record_robust_futex_changes(RecordTask* t) { RR_ARCH_FUNCTION(record_robust_futex_changes_arch, t->arch(), t); } static void record_exit_trace_event(RecordTask* t, WaitStatus exit_status) { t->session().trace_writer().write_task_event( TraceTaskEvent::for_exit(t->tid, exit_status)); if (t->thread_group()->tgid == t->tid) { t->thread_group()->exit_status = exit_status; } } static bool looks_like_syscall_entry(RecordTask* t) { bool ok; bool at_syscall = is_at_syscall_instruction(t, t->regs().ip().decrement_by_syscall_insn_length(t->arch()), &ok); // It's possible for the task to have died (e.g. if it got signaled twice // in rapid succession). In that case, try to just go by register contents. if (ok && !at_syscall) { return false; } if (is_x86ish(t->arch())) { // On x86 rax gets set to -ENOSYS on entry. Elsewhere this does not happen. // Further, even if we did ask about the syscallno, it might have been // reset by the signal handler. However, on non-x86 platforms we currently // count taken branches, rather than only conditional ones, so it should // be impossible to see the same syscall ip twice without intervening // ticks, so the check that follows these conditions should be sufficient // there. return t->regs().original_syscallno() >= 0 && t->regs().syscall_result_signed() == -ENOSYS; } // Getting a sched event here is better than a spurious syscall event. // Syscall entry does not cause visible register modification, so upon // hitting the sched event the register state would indeed match. return ok; } /** * Return true if we handle a ptrace exit event for task t. When this returns * true, t has been deleted and cannot be referenced again. */ static bool handle_ptrace_exit_event(RecordTask* t) { if (t->already_reaped()) { t->did_reach_zombie(); return true; } if (t->ptrace_event() != PTRACE_EVENT_EXIT) { return false; } if (t->stable_exit) { LOG(debug) << "stable exit"; } else { /* XXX: We could try to find some tasks here to unmap our buffers, but it * seems hardly worth it.
*/ t->destroy_buffers(nullptr, nullptr); if (!t->may_be_blocked()) { // might have been hit by a SIGKILL or a SECCOMP_RET_KILL, in which case // there might be some execution since its last recorded event that we // need to replay. // There's a weird case (in 4.13.5-200.fc26.x86_64 at least) where the // task can enter the kernel but instead of receiving a syscall ptrace // event, we receive a PTRACE_EVENT_EXIT due to a concurrent execve // (and probably a concurrent SIGKILL could do the same). The task state // has been updated to reflect syscall entry. If we record a SCHED in // that state replay of the SCHED will fail. So detect that state and fix // it up. if (looks_like_syscall_entry(t)) { // Either we're in a syscall, or we're immediately after a syscall // and it exited. if (t->ticks_at_last_recorded_syscall_exit == t->tick_count() && t->regs().ip() == t->ip_at_last_recorded_syscall_exit) { LOG(debug) << "Nothing to record after PTRACE_EVENT_EXIT"; // It's the latter case; do nothing. } else { // It's the former case ... probably. Theoretically we could have // re-executed a syscall without any ticks in between, but that seems // highly improbable. // Record the syscall-entry event that we otherwise failed to record. t->canonicalize_regs(t->arch()); t->apply_syscall_entry_regs(); // Assume it's a native-arch syscall. If it isn't, it doesn't matter // all that much since we aren't actually going to do anything with it // in this task. // Avoid calling detect_syscall_arch here since it could fail if the // task is already completely dead and gone. SyscallEvent event(t->regs().original_syscallno(), t->arch()); event.state = ENTERING_SYSCALL; t->record_event(event); } } else { // Don't try to reset the syscallbuf here. The task may be exiting // while in arbitrary syscallbuf code. And of course, because it's // exiting, it doesn't matter if we don't reset the syscallbuf. // XXX flushing the syscallbuf may be risky too... t->record_event(Event::sched(), RecordTask::FLUSH_SYSCALLBUF, RecordTask::DONT_RESET_SYSCALLBUF); } } } record_robust_futex_changes(t); WaitStatus exit_status; unsigned long msg = 0; // We can get ESRCH here if the child was killed by SIGKILL and // we made a synthetic PTRACE_EVENT_EXIT to handle it. if (t->ptrace_if_alive(PTRACE_GETEVENTMSG, nullptr, &msg)) { exit_status = WaitStatus(msg); } else { exit_status = WaitStatus::for_fatal_sig(SIGKILL); } t->did_handle_ptrace_exit_event(); // If we died because of a coredumping signal, that is a barrier event, and // every task in the address space needs to pass its PTRACE_EXIT_EVENT before // they proceed to (potentially hidden) zombie state, so we can't wait for // that to happen. // Similarly we can't wait for this task to exit if there are other // tasks in its pid namespace that need to exit and this is the last thread // of pid-1 in that namespace, because the kernel must reap them before // letting this task complete its exit. bool may_wait_exit = !is_coredumping_signal(exit_status.fatal_sig()) && !t->waiting_for_pid_namespace_tasks_to_exit(); record_exit_trace_event(t, exit_status); t->record_exit_event(exit_status.fatal_sig(), (!t->already_reaped() && !may_wait_exit) ?
RecordTask::WRITE_CHILD_TID : RecordTask::KERNEL_WRITES_CHILD_TID); if (!t->already_reaped()) { t->proceed_to_exit(may_wait_exit); } t->do_ptrace_exit_stop(exit_status); if (may_wait_exit) { t->did_reach_zombie(); } else { t->waiting_for_zombie = true; } return true; } static void note_entering_syscall(RecordTask* t) { ASSERT(t, EV_SYSCALL == t->ev().type()); t->ev().Syscall().state = ENTERING_SYSCALL; if (!t->ev().Syscall().is_restart) { /* Save a copy of the arg registers so that we * can use them to detect later restarted * syscalls, if this syscall ends up being * restarted. We have to save the registers * in this rather awkward place because we * need the original registers; the restart * (if it's not a SYS_restart_syscall restart) * will use the original registers. */ t->ev().Syscall().regs = t->regs(); } else { t->ev().Syscall().regs.set_syscallno(t->regs().syscallno()); // We may have intentionally stored the syscall result here. // Now that we're safely past the signal delivery, make the // registers look like they did at the original syscall entry // again. t->ev().Syscall().regs.set_arg1(t->ev().Syscall().regs.orig_arg1()); if (t->arch() == aarch64) { // We probably got here with a PTRACE_SYSCALL. The x7 // value will be wrong due to the aarch64 kernel bug. // Get it from the syscall event. Registers r = t->regs(); r.set_x7(t->ev().Syscall().regs.x7()); t->set_regs(r); } } } #if defined (__x86_64__) static bool is_in_vsyscall(remote_code_ptr ip) { // This is hardcoded by the Linux ABI remote_code_ptr vsyscall_start = 0xffffffffff600000; remote_code_ptr vsyscall_end = 0xffffffffff601000; return vsyscall_start <= ip && ip < vsyscall_end; } #else static bool is_in_vsyscall(remote_code_ptr) { return false; } #endif void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, StepState* step_state, RecordResult* result, bool* did_enter_syscall) { *did_enter_syscall = false; // Special case: If the tracee issues a vsyscall, we will get a seccomp trap, // but no syscall traps whatsoever. In particular, we wouldn't see it during // replay either. We try to monkeypatch the caller on the assumption that known // callers of this (deprecated) interface all follow a common pattern. If we // can't patch the caller, this is a fatal error, since the recording will // otherwise be broken. remote_code_ptr ip = t->regs().ip(); if (is_in_vsyscall(ip)) { remote_ptr<void> sp = t->regs().sp(); // The kernel assumes the return address is on the stack - we do the same remote_ptr ret_addr_addr = sp.cast(); remote_code_ptr ret_addr = t->read_mem(ret_addr_addr); // Skip this syscall. We will attempt to patch it to the vdso entry and // let the tracee retry there. Registers regs = t->regs(); regs.set_original_syscallno(-1); // We can't modify the ip here, the kernel will kill the tracee with // SIGSYS. Instead, we set a breakpoint at the return instruction. t->set_regs(regs); t->vm()->add_breakpoint(ret_addr, BKPT_INTERNAL); while (true) { t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); if (t->ptrace_event() == PTRACE_EVENT_EXIT) { return; } ASSERT(t, !t->ptrace_event()); if (t->stop_sig() == syscallbuf_desched_sig()) { continue; } if (t->stop_sig() == SIGTRAP && is_kernel_trap(t->get_siginfo().si_code)) { // Hit the breakpoint break; } t->stash_sig(); } t->vm()->remove_breakpoint(ret_addr, BKPT_INTERNAL); ASSERT(t, t->regs().ip().undo_executed_bkpt(t->arch()) == ret_addr); // Now that we're in a sane state, ask the Monkeypatcher to try and patch // that.
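// (The patch is expected to redirect the caller from the legacy vsyscall
// page to the equivalent vdso entry, so the retried call takes the normal
// syscall path; see the comment above where the syscall was skipped.)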
bool patch_ok = t->vm()->monkeypatcher().try_patch_vsyscall_caller(t, ret_addr); ASSERT(t, patch_ok) << "The tracee issues a vsyscall to " << ip << " but we failed to monkeypatch the caller (return address " << ret_addr << ", sp=" << sp << "). Recording will not succeed. Exiting."; // Reset to the start of the region and continue regs = t->regs(); regs.set_ip(ret_addr.decrement_by_vsyscall_entry_length(t->arch())); t->set_regs(regs); // We patched this syscall, record that auto ev = Event::patch_syscall(); ev.PatchSyscall().patch_vsyscall = true; t->record_event(ev); step_state->continue_type = RecordSession::CONTINUE; return; } int syscallno = t->regs().original_syscallno(); if (syscallno < 0) { // negative syscall numbers after a SECCOMP event // are treated as "skip this syscall". There will be one syscall event // reported instead of two. So fake an enter-syscall event now. // It doesn't really matter what the syscall-arch is. t->canonicalize_regs(t->arch()); if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { // If the ptrace entry stop hasn't happened yet, we're at a weird // intermediate state where the behavior of the next PTRACE_SYSCALL // will depend on the register state (i.e. whether we see an entry // trap or proceed right to the exit trap). To make things easier // on the rest of the system, do a fake syscall entry, then reset // the register state. Registers orig_regs = t->regs(); Registers r = orig_regs; r.set_original_syscallno(syscall_number_for_gettid(t->arch())); t->set_regs(r); t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); t->set_regs(orig_regs); } // Don't continue yet. At the next iteration of record_step, we'll // enter syscall_state_changed and that will trigger a continue to // the syscall exit. step_state->continue_type = RecordSession::DONT_CONTINUE; if (!process_syscall_entry(t, step_state, result, t->arch())) { return; } *did_enter_syscall = true; return; } if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { // The next continue needs to be a PTRACE_SYSCALL to observe // the enter-syscall event. step_state->continue_type = RecordSession::CONTINUE_SYSCALL; } else { ASSERT(t, syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP); if (t->ev().is_syscall_event() && t->ev().Syscall().state == PROCESSING_SYSCALL) { // We did PTRACE_SYSCALL and already saw a syscall trap. Just ignore this. LOG(debug) << "Ignoring SECCOMP syscall trap since we already got a " "PTRACE_SYSCALL trap"; // The next continue needs to be a PTRACE_SYSCALL to observe // the exit-syscall event. step_state->continue_type = RecordSession::CONTINUE_SYSCALL; // Need to restore last_task_switchable since it will have been // reset to PREVENT_SWITCH last_task_switchable = t->ev().Syscall().switchable; } else { // We've already passed the PTRACE_SYSCALL trap for syscall entry, so // we need to handle that now. SupportedArch syscall_arch = t->detect_syscall_arch(); t->canonicalize_regs(syscall_arch); if (!process_syscall_entry(t, step_state, result, syscall_arch)) { step_state->continue_type = RecordSession::DONT_CONTINUE; return; } *did_enter_syscall = true; } } } static void seccomp_trap_done(RecordTask* t) { t->pop_seccomp_trap(); // It's safe to reset the syscall buffer now. 
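// (This clears the delay requested in handle_seccomp_trap(); by the time
// we get here the aborted buffered syscall has finished unwinding.)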
t->delay_syscallbuf_reset_for_seccomp_trap = false; t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, failed_during_preparation), (uint8_t)1); uint8_t one = 1; t->record_local( REMOTE_PTR_FIELD(t->syscallbuf_child, failed_during_preparation), &one); if (EV_DESCHED == t->ev().type()) { // Desched processing will do the rest for us return; } // Abort the current syscallbuf record, which corresponds to the syscall that // wasn't actually executed due to seccomp. t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); t->record_event(Event::syscallbuf_abort_commit()); // We also need to run the syscall exit hook: running it will ensure we // reset the buffer before we try to buffer another syscall. t->write_mem( REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), (uint8_t)1); } static void handle_seccomp_trap(RecordTask* t, RecordSession::StepState* step_state, uint16_t seccomp_data) { // The architecture may be wrong, but that's ok, because an actual syscall // entry did happen, so the registers are already updated according to the // architecture of the system call. t->canonicalize_regs(t->detect_syscall_arch()); t->apply_syscall_entry_regs(); Registers r = t->regs(); int syscallno = r.original_syscallno(); // Cause kernel processing to skip the syscall r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); t->set_regs(r); bool syscall_entry_already_recorded = false; if (t->ev().is_syscall_event()) { // A syscall event was already pushed, probably because we did a // PTRACE_SYSCALL to enter the syscall during handle_desched_event. Cancel // that event now since the seccomp SIGSYS aborts it completely. ASSERT(t, t->ev().Syscall().number == syscallno); // Make sure any prepared syscall state is discarded and any temporary // effects (e.g. redirecting pointers to scratch) undone. rec_abort_prepared_syscall(t); if (t->ev().type() == EV_SYSCALL_INTERRUPTION) { // The event could be a syscall-interruption if it was pushed by // `handle_desched_event`. In that case, it has not been recorded yet. t->pop_syscall_interruption(); } else { t->pop_syscall(); syscall_entry_already_recorded = true; } } if (t->is_in_untraced_syscall()) { ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap); // Don't reset the syscallbuf immediately after delivering the trap. We have // to wait until this buffered syscall aborts completely before resetting // the buffer. t->delay_syscallbuf_reset_for_seccomp_trap = true; t->push_event(Event::seccomp_trap()); // desched may be armed but we're not going to execute the syscall, let // alone block. If it fires, ignore it. t->write_mem( REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant), (uint8_t)0); } t->push_syscall_event(syscallno); t->ev().Syscall().failed_during_preparation = true; note_entering_syscall(t); if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) { t->record_current_event(); } // Use NativeArch here because different versions of system headers // have inconsistent field naming.
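// (Concretely: glibc's headers expose the SIGSYS details only through accessor macros like si_syscall/si_call_addr/si_arch, while rr's NativeArch::siginfo_t spells out the kernel's _sifields._sigsys layout, which is what we populate below.)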
union { NativeArch::siginfo_t native_api; siginfo_t linux_api; } si; memset(&si, 0, sizeof(si)); si.native_api.si_signo = SIGSYS; si.native_api.si_errno = seccomp_data; si.native_api.si_code = SYS_SECCOMP; switch (r.arch()) { case x86: si.native_api._sifields._sigsys._arch = AUDIT_ARCH_I386; break; case x86_64: si.native_api._sifields._sigsys._arch = AUDIT_ARCH_X86_64; break; #ifdef AUDIT_ARCH_AARCH64 case aarch64: si.native_api._sifields._sigsys._arch = AUDIT_ARCH_AARCH64; break; #endif default: DEBUG_ASSERT(0 && "Unknown architecture"); break; } si.native_api._sifields._sigsys._syscall = syscallno; // Documentation says that si_call_addr is the address of the syscall // instruction, but in tests it's immediately after the syscall // instruction. si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr<void>(); LOG(debug) << "Synthesizing " << si.linux_api; t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG); // Tests show that the current registers are preserved (on x86, eax/rax // retains the syscall number). r.set_syscallno(syscallno); t->set_regs(r); t->maybe_restore_original_syscall_registers(); if (t->is_in_untraced_syscall()) { // For buffered syscalls, go ahead and record the exit state immediately. t->ev().Syscall().state = EXITING_SYSCALL; t->record_current_event(); t->pop_syscall(); // The tracee is currently in the seccomp ptrace-stop. Advance it to the // syscall-exit stop so that when we try to deliver the SIGSYS via // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); } // Don't continue yet. At the next iteration of record_step, if we // recorded the syscall-entry we'll enter syscall_state_changed and // that will trigger a continue to the syscall exit. If we recorded the // syscall-exit we'll go straight into signal delivery. step_state->continue_type = RecordSession::DONT_CONTINUE; } static void handle_seccomp_errno(RecordTask* t, RecordSession::StepState* step_state, uint16_t seccomp_data) { t->canonicalize_regs(t->detect_syscall_arch()); Registers r = t->regs(); int syscallno = r.original_syscallno(); // Cause kernel processing to skip the syscall r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); t->set_regs(r); if (!t->is_in_untraced_syscall()) { t->push_syscall_event(syscallno); // Note that the syscall failed. prepare_clone() needs to know // this during replay of the syscall entry. t->ev().Syscall().failed_during_preparation = true; note_entering_syscall(t); } r.set_syscall_result(-seccomp_data); t->set_regs(r); // Don't continue yet. At the next iteration of record_step, if we // recorded the syscall-entry we'll enter syscall_state_changed and // that will trigger a continue to the syscall exit.
step_state->continue_type = RecordSession::DONT_CONTINUE; } bool RecordSession::handle_ptrace_event(RecordTask** t_ptr, StepState* step_state, RecordResult* result, bool* did_enter_syscall) { *did_enter_syscall = false; RecordTask* t = *t_ptr; if (t->status().group_stop() || t->has_stashed_group_stop()) { t->clear_stashed_group_stop(); last_task_switchable = ALLOW_SWITCH; step_state->continue_type = DONT_CONTINUE; return true; } int event = t->ptrace_event(); if (!event) { return false; } LOG(debug) << " " << t->tid << ": handle_ptrace_event " << ptrace_event_name(event) << ": event " << t->ev(); switch (event) { case PTRACE_EVENT_SECCOMP_OBSOLETE: case PTRACE_EVENT_SECCOMP: { if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { syscall_seccomp_ordering_ = SECCOMP_BEFORE_PTRACE_SYSCALL; } int seccomp_data = t->get_ptrace_eventmsg_seccomp_data(); if (seccomp_data < 0) { // Process just died. Urk. Just wait for the exit event and pretend this stop never happened! last_task_switchable = ALLOW_SWITCH; step_state->continue_type = DONT_CONTINUE; return true; } t->apply_syscall_entry_regs(); int syscallno = t->regs().original_syscallno(); if (seccomp_data == SECCOMP_RET_DATA) { LOG(debug) << " traced syscall entered: " << syscall_name(syscallno, t->arch()); handle_seccomp_traced_syscall(t, step_state, result, did_enter_syscall); } else { // Note that we make no attempt to patch the syscall site when the // user's seccomp filter does not return ALLOW. Apart from the ERRNO case, // handling these syscalls is necessarily slow anyway. uint32_t real_result; if (!seccomp_filter_rewriter().map_filter_data_to_real_result( t, seccomp_data, &real_result)) { LOG(debug) << "Process terminated unexpectedly during PTRACE_GETEVENTMSG"; step_state->continue_type = RecordSession::CONTINUE; break; } uint16_t real_result_data = real_result & SECCOMP_RET_DATA; switch (real_result & SECCOMP_RET_ACTION) { case SECCOMP_RET_TRAP: LOG(debug) << " seccomp trap for syscall: " << syscall_name(syscallno, t->arch()); handle_seccomp_trap(t, step_state, real_result_data); break; case SECCOMP_RET_ERRNO: LOG(debug) << " seccomp errno " << errno_name(real_result_data) << " for syscall: " << syscall_name(syscallno, t->arch()); handle_seccomp_errno(t, step_state, real_result_data); break; case SECCOMP_RET_KILL: LOG(debug) << " seccomp kill for syscall: " << syscall_name(syscallno, t->arch()); t->tgkill(SIGKILL); // Rely on the SIGKILL to bump us out of the ptrace stop. step_state->continue_type = RecordSession::DONT_CONTINUE; // Now wait for us to actually exit our ptrace-stop and proceed // to the PTRACE_EVENT_EXIT. This avoids the race where our // PTRACE_CONT might kick us out of the PTRACE_EVENT_EXIT before // we can process it. t->wait(); break; default: ASSERT(t, false) << "Seccomp result not handled"; break; } } break; } case PTRACE_EVENT_EXEC: { if (t->thread_group()->task_set().size() > 1) { // All tasks but the task that did the execve should have exited by // now and notified us of their exits. However, it's possible that // while running the thread-group leader, our PTRACE_CONT raced with its // PTRACE_EVENT_EXIT and it exited, and the next event we got is this // PTRACE_EVENT_EXEC after the exec'ing task changed its tid to the // leader's tid. Or maybe there are kernel bugs; on // 4.2.0-42-generic running exec_from_other_thread, we reproducibly // enter PTRACE_EVENT_EXEC for the thread-group leader without seeing // its PTRACE_EVENT_EXIT. // So, record this task's exit and destroy it.
// XXX We can't do record_robust_futex_changes here because the address // space is already gone. That would only matter if some of them were // in memory accessible to another process even after exec, i.e. a // shared-memory mapping or two different thread-groups sharing the same // address space. pid_t tid = t->rec_tid; WaitStatus status = t->status(); record_exit_trace_event(t, WaitStatus(0)); t->record_exit_event(); // Don't call RecordTask::destroy() because we don't want to // PTRACE_DETACH. delete t; // Steal the exec'ing task and make it the thread-group leader, and // carry on! t = revive_task_for_exec(tid); scheduler().set_current(t); *t_ptr = t; // Tell t that it is actually stopped, because the stop we got is really // for this task, not the old dead task. t->did_waitpid(status); } t->post_exec(); // Skip past the ptrace event. step_state->continue_type = CONTINUE_SYSCALL; break; } default: ASSERT(t, false) << "Unhandled ptrace event " << ptrace_event_name(event) << "(" << event << ")"; break; } return true; } static void debug_exec_state(const char* msg, RecordTask* t) { LOG(debug) << msg << ": status=" << t->status(); } template <typename Arch> static bool is_ptrace_any_singlestep_arch(int command) { return command >= 0 && (command == PTRACE_SINGLESTEP || command == Arch::PTRACE_SYSEMU_SINGLESTEP); } static bool is_ptrace_any_singlestep(SupportedArch arch, int command) { RR_ARCH_FUNCTION(is_ptrace_any_singlestep_arch, arch, command); } void RecordSession::task_continue(const StepState& step_state) { RecordTask* t = scheduler().current(); ASSERT(t, step_state.continue_type != DONT_CONTINUE); // A task in an emulated ptrace-stop must really stay stopped ASSERT(t, !t->emulated_stop_pending); bool may_restart = t->at_may_restart_syscall(); if (may_restart && t->seccomp_bpf_enabled) { LOG(debug) << " PTRACE_SYSCALL to possibly-restarted " << t->ev(); } if (!t->vm()->first_run_event()) { t->vm()->set_first_run_event(trace_writer().time()); } if (!t->thread_group()->first_run_event()) { t->thread_group()->set_first_run_event(trace_writer().time()); } TicksRequest ticks_request; ResumeRequest resume; if (step_state.continue_type == CONTINUE_SYSCALL) { ticks_request = RESUME_NO_TICKS; resume = RESUME_SYSCALL; } else { if (t->has_stashed_sig(PerfCounters::TIME_SLICE_SIGNAL)) { // timeslice signal already stashed, no point in generating another one // (and potentially slow) ticks_request = RESUME_UNLIMITED_TICKS; } else if (scheduler().may_use_unlimited_ticks()) { ticks_request = RESUME_UNLIMITED_TICKS; } else { ticks_request = (TicksRequest)max<Ticks>( 0, scheduler().current_timeslice_end() - t->tick_count()); } // Clear any lingering state, then see if we need to stop earlier for a // tracee-requested pmc interrupt on the virtualized performance counter.
t->next_pmc_interrupt_is_for_user = false; if (auto vpmc = VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t)) { ASSERT(t, vpmc->target_tuid() == t->tuid()); Ticks after = max<Ticks>(vpmc->target_ticks() - t->tick_count(), 0); if ((uint64_t)after < (uint64_t)ticks_request) { LOG(debug) << "ticks_request constrained from " << ticks_request << " to " << after << " for vpmc"; ticks_request = (TicksRequest)after; t->next_pmc_interrupt_is_for_user = true; } } // Override requested by the tracee for testing purposes if (t->tick_request_override != (TicksRequest)0) { ASSERT(t, !t->next_pmc_interrupt_is_for_user); ticks_request = t->tick_request_override; t->tick_request_override = (TicksRequest)0; } bool singlestep = is_ptrace_any_singlestep(t->arch(), t->emulated_ptrace_cont_command); if (singlestep && is_at_syscall_instruction(t, t->ip())) { // We're about to singlestep into a syscall instruction. // Act like we're NOT singlestepping since doing a PTRACE_SINGLESTEP would // skip over the system call. LOG(debug) << "Clearing singlestep because we're about to enter a syscall"; singlestep = false; } if (singlestep) { resume = RESUME_SINGLESTEP; } else { /* We won't receive PTRACE_EVENT_SECCOMP events until * the seccomp filter is installed by the * syscall_buffer lib in the child, therefore we must * record in the traditional way (with PTRACE_SYSCALL) * until it is installed. */ /* Kernel commit https://github.com/torvalds/linux/commit/93e35efb8de45393cf61ed07f7b407629bf698ea makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE traps. Detect and handle this. */ if (!t->seccomp_bpf_enabled || may_restart || syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { resume = RESUME_SYSCALL; } else { /* When the seccomp filter is on, instead of capturing * syscalls by using PTRACE_SYSCALL, the filter will * generate the ptrace events. This means we allow the * process to run using PTRACE_CONT, and rely on the * seccomp filter to generate the special * PTRACE_EVENT_SECCOMP event once a syscall happens. * This event is handled here by simply allowing the * process to continue to the actual entry point of * the syscall (using cont_syscall_block()) and then * using the same logic as before. */ resume = RESUME_CONT; } } } t->resume_execution(resume, RESUME_NONBLOCKING, ticks_request); if (t->is_running()) { scheduler().started(t); } } /** * Step |t| forward until the tracee syscall that disarms the desched * event. If a signal becomes pending in the interim, we stash it. * This allows the caller to deliver the signal after this returns. * (In reality the desched event will already have been disarmed before we * enter this function.) */ static void advance_to_disarm_desched_syscall(RecordTask* t) { int old_sig = 0; LOG(debug) << "desched: DISARMING_DESCHED_EVENT"; /* TODO: send this through main loop. */ /* TODO: mask off signals and avoid this loop. */ do { t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_UNLIMITED_TICKS); /* We can safely ignore TIME_SLICE_SIGNAL while trying to * reach the disarm-desched ioctl: once we reach it, * the desched'd syscall will be "done" and the tracee * will be at a preemption point. In fact, we *want* * to ignore this signal. Syscalls like read() can * have large buffers passed to them, and we have to * copy-out the buffered out data to the user's * buffer. This happens in the interval where we're * reaching the disarm-desched ioctl, so that code is * susceptible to receiving TIME_SLICE_SIGNAL.
*/ int sig = t->stop_sig(); if (PerfCounters::TIME_SLICE_SIGNAL == sig) { continue; } // We should not receive SYSCALLBUF_DESCHED_SIGNAL since it should already // have been disarmed. However, we observe these being received here when // we arm the desched signal before we restart a blocking syscall, which // completes successfully, then we disarm, then we see a desched signal // here. if (t->session().syscallbuf_desched_sig() == sig) { continue; } if (sig && sig == old_sig) { LOG(debug) << " coalescing pending " << signal_name(sig); continue; } if (sig) { LOG(debug) << " " << signal_name(sig) << " now pending"; t->stash_sig(); } } while (!t->is_disarm_desched_event_syscall()); // Exit the syscall. t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); } /** * |t| is at a desched event and some relevant aspect of its state * changed. (For now, changes other than the original desched'd syscall * being restarted.) */ void RecordSession::desched_state_changed(RecordTask* t) { LOG(debug) << "desched: IN_SYSCALL"; /* We need to ensure that the syscallbuf code doesn't * try to commit the current record; we've already * recorded that syscall. The following event sets * the abort-commit bit. */ t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); t->record_event(Event::syscallbuf_abort_commit()); advance_to_disarm_desched_syscall(t); t->pop_desched(); /* The tracee has just finished sanity-checking the * aborted record, and won't touch the syscallbuf * during this (aborted) transaction again. So now * is a good time for us to reset the record counter. */ t->delay_syscallbuf_reset_for_desched = false; // Run the syscallbuf exit hook. This ensures we'll be able to reset // the syscallbuf before trying to buffer another syscall. t->write_mem( REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), (uint8_t)1); } static void syscall_not_restarted(RecordTask* t) { LOG(debug) << " " << t->tid << ": popping abandoned interrupted " << t->ev() << "; pending events:"; if (IS_LOGGING(debug)) { t->log_pending_events(); } t->pop_syscall_interruption(); } /** * "Thaw" a frozen interrupted syscall if |t| is restarting it. * Return true if a syscall is indeed restarted. * * A postcondition of this function is that |t->ev| is no longer a * syscall interruption, whether or not a syscall was * restarted. */ static bool maybe_restart_syscall(RecordTask* t) { if (is_restart_syscall_syscall(t->regs().original_syscallno(), t->arch())) { LOG(debug) << " " << t->tid << ": SYS_restart_syscall'ing " << t->ev(); } if (t->is_syscall_restart()) { t->ev().transform(EV_SYSCALL); Registers regs = t->regs(); regs.set_original_syscallno(t->ev().Syscall().regs.original_syscallno()); t->set_regs(regs); t->canonicalize_regs(t->arch()); return true; } if (EV_SYSCALL_INTERRUPTION == t->ev().type()) { syscall_not_restarted(t); } return false; } /** * After a SYS_sigreturn "exit" of task |t| with return value |ret|, * check to see if there's an interrupted syscall that /won't/ be * restarted, and if so, pop it off the pending event stack. */ static void maybe_discard_syscall_interruption(RecordTask* t, intptr_t ret) { int syscallno; if (EV_SYSCALL_INTERRUPTION != t->ev().type()) { /* We currently don't track syscalls interrupted with * ERESTARTSYS or ERESTARTNOHAND, so it's possible for * a sigreturn not to affect the event stack.
*/ LOG(debug) << " (no interrupted syscall to retire)"; return; } syscallno = t->ev().Syscall().number; if (0 > ret) { syscall_not_restarted(t); } else if (t->arch() == x86 || t->arch() == x86_64) { // On x86, we would have expected this to get restored to the syscallno. // Since the syscallno is in a different register on other platforms, this // assert does not apply. ASSERT(t, syscallno == ret) << "Interrupted call was " << t->ev().Syscall().syscall_name() << " and sigreturn claims to be restarting " << syscall_name(ret, t->ev().Syscall().arch()); } } /** * Copy the registers used for syscall arguments (not including * syscall number) from |from| to |to|. */ static void copy_syscall_arg_regs(Registers* to, const Registers& from) { to->set_orig_arg1(from.arg1()); to->set_arg2(from.arg2()); to->set_arg3(from.arg3()); to->set_arg4(from.arg4()); to->set_arg5(from.arg5()); to->set_arg6(from.arg6()); } static void maybe_trigger_emulated_ptrace_syscall_exit_stop(RecordTask* t) { if (t->emulated_ptrace_cont_command == PTRACE_SYSCALL) { t->emulate_ptrace_stop(WaitStatus::for_syscall(t)); } else if (is_ptrace_any_singlestep(t->arch(), t->emulated_ptrace_cont_command)) { // Deliver the singlestep trap now that we've finished executing the // syscall. t->emulate_ptrace_stop(WaitStatus::for_stop_sig(SIGTRAP), nullptr, SI_KERNEL); } } static void save_interrupted_syscall_ret_in_syscallbuf(RecordTask* t, intptr_t retval) { // Record storing the return value in the syscallbuf record, where // we expect to find it during replay. auto child_rec = t->next_syscallbuf_record(); int64_t ret = retval; t->record_local(REMOTE_PTR_FIELD(child_rec, ret), &ret); } static bool is_in_privileged_syscall(RecordTask* t) { auto type = AddressSpace::rr_page_syscall_from_exit_point(t->arch(), t->ip()); return type && type->privileged == AddressSpace::PRIVILEGED; } void RecordSession::syscall_state_changed(RecordTask* t, StepState* step_state) { switch (t->ev().Syscall().state) { case ENTERING_SYSCALL_PTRACE: debug_exec_state("EXEC_SYSCALL_ENTRY_PTRACE", t); step_state->continue_type = DONT_CONTINUE; last_task_switchable = ALLOW_SWITCH; if (t->emulated_stop_type != NOT_STOPPED) { // Don't go any further. return; } if (t->ev().Syscall().in_sysemu) { // We'll have recorded just the ENTERING_SYSCALL_PTRACE event and // nothing else. Resume with an invalid syscall to ensure no real // syscall runs. t->pop_syscall(); Registers r = t->regs(); Registers orig_regs = r; r.set_original_syscallno(-1); t->set_regs(r); t->resume_execution(RESUME_SYSCALL, RESUME_WAIT, RESUME_NO_TICKS); ASSERT(t, t->ip() == r.ip()); t->set_regs(orig_regs); maybe_trigger_emulated_ptrace_syscall_exit_stop(t); return; } last_task_switchable = PREVENT_SWITCH; t->ev().Syscall().regs = t->regs(); t->ev().Syscall().state = ENTERING_SYSCALL; // The syscallno may have been changed by the ptracer t->ev().Syscall().number = t->regs().original_syscallno(); return; case ENTERING_SYSCALL: { debug_exec_state("EXEC_SYSCALL_ENTRY", t); ASSERT(t, !t->emulated_stop_pending); last_task_switchable = t->ev().Syscall().switchable = rec_prepare_syscall(t); t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, RecordTask::ALLOW_RESET_SYSCALLBUF, &t->ev().Syscall().regs); debug_exec_state("after cont", t); t->ev().Syscall().state = PROCESSING_SYSCALL; if (t->emulated_stop_pending) { step_state->continue_type = DONT_CONTINUE; } else { // Resume the syscall execution in the kernel context. 
step_state->continue_type = CONTINUE_SYSCALL; } if (t->session().done_initial_exec() && Flags::get().check_cached_mmaps) { t->vm()->verify(t); } if (t->desched_rec() && t->is_in_untraced_syscall() && t->has_stashed_sig()) { // We have a signal to deliver but we're about to (re?)enter an untraced // syscall that may block and the desched event has been disarmed. // Rearm the desched event so if the syscall blocks, it will be // interrupted and we'll have a chance to deliver our signal. LOG(debug) << "Rearming desched event so we'll get a chance to deliver " "stashed signal"; arm_desched_event(t); } if (t->detached_proxy) { // We detached. Record that. t->record_event(Event::exit(), RecordTask::DONT_FLUSH_SYSCALLBUF, RecordTask::DONT_RESET_SYSCALLBUF); t->session().trace_writer().write_task_event( TraceTaskEvent::for_detach(t->tid)); step_state->continue_type = DONT_CONTINUE; } return; } case PROCESSING_SYSCALL: debug_exec_state("EXEC_IN_SYSCALL", t); // Linux kicks tasks out of syscalls before delivering // signals. ASSERT(t, !t->stop_sig()) << "Signal " << signal_name(t->stop_sig()) << " pending while in syscall???"; t->ev().Syscall().state = EXITING_SYSCALL; step_state->continue_type = DONT_CONTINUE; return; case EXITING_SYSCALL: { debug_exec_state("EXEC_SYSCALL_DONE", t); DEBUG_ASSERT(t->stop_sig() == 0); SupportedArch syscall_arch = t->ev().Syscall().arch(); int syscallno = t->ev().Syscall().number; intptr_t retval = t->regs().syscall_result_signed(); if (t->desched_rec()) { // If we enabled the desched event above, disable it. disarm_desched_event(t); // Write syscall return value to the syscallbuf now. This lets replay // get the correct value even though we're aborting the commit. This // value affects register values in the preload code (which must be // correct since register values may escape). save_interrupted_syscall_ret_in_syscallbuf(t, retval); } // sigreturn is a special snowflake, because it // doesn't actually return. Instead, it undoes the // setup for signal delivery, which possibly includes // preparing the tracee for a restart-syscall. So we // take this opportunity to possibly pop an // interrupted-syscall event. if (is_sigreturn(syscallno, syscall_arch)) { if (is_x86ish(t->arch())) { ASSERT(t, t->regs().original_syscallno() == -1); } rec_did_sigreturn(t); t->record_current_event(); t->pop_syscall(); // We've finished processing this signal now. t->pop_signal_handler(); t->invalidate_sigmask(); maybe_discard_syscall_interruption(t, retval); if (EV_SECCOMP_TRAP == t->ev().type()) { LOG(debug) << " exiting seccomp trap"; save_interrupted_syscall_ret_in_syscallbuf(t, retval); seccomp_trap_done(t); } if (EV_DESCHED == t->ev().type()) { LOG(debug) << " exiting desched critical section"; // The signal handler could have modified the apparent syscall // return handler. Save that value into the syscall buf again so // replay will pick it up later. save_interrupted_syscall_ret_in_syscallbuf(t, retval); desched_state_changed(t); } } else { LOG(debug) << " original_syscallno:" << t->regs().original_syscallno() << " (" << syscall_name(syscallno, syscall_arch) << "); return val:" << HEX(t->regs().syscall_result()); /* a syscall_restart ending is equivalent to the * restarted syscall ending */ if (t->ev().Syscall().is_restart) { LOG(debug) << " exiting restarted " << syscall_name(syscallno, syscall_arch); } /* TODO: is there any reason a restart_syscall can't * be interrupted by a signal and itself restarted? 
*/ bool may_restart = !is_restart_syscall_syscall(syscallno, t->arch()) // SYS_pause is either interrupted or // never returns. It doesn't restart. && !is_pause_syscall(syscallno, t->arch()) && t->regs().syscall_may_restart(); /* No need to process the syscall now if it's * restarted; this will be done in the exit from the * restart_syscall. */ if (!may_restart) { rec_process_syscall(t); if (t->session().done_initial_exec() && Flags::get().check_cached_mmaps) { t->vm()->verify(t); } } else { LOG(debug) << " may restart " << syscall_name(syscallno, syscall_arch) << " (from retval " << HEX(retval) << ")"; rec_prepare_restart_syscall(t); /* If we may restart this syscall, we've most * likely fudged some of the argument * registers with scratch pointers. We don't * want to record those fudged registers, * because scratch doesn't exist in replay. * So cover our tracks here. */ Registers r = t->regs(); copy_syscall_arg_regs(&r, t->ev().Syscall().regs); t->set_regs(r); // We need to track what the return value was on architectures // where the kernel replaces the return value by the new arg1 // on restart. t->ev().Syscall().regs = r; } t->record_current_event(); /* If we're not going to restart this syscall, we're * done with it. But if we are, "freeze" it on the * event stack until the execution point where it * might be restarted. */ if (!may_restart) { t->pop_syscall(); if (EV_DESCHED == t->ev().type()) { LOG(debug) << " exiting desched critical section"; desched_state_changed(t); } } else { t->ev().transform(EV_SYSCALL_INTERRUPTION); t->ev().Syscall().is_restart = true; } t->canonicalize_regs(syscall_arch); if (!may_restart) { if (t->retry_syscall_patching) { LOG(debug) << "Retrying deferred syscall patching"; if (t->vm()->monkeypatcher().try_patch_syscall(t, false)) { // Syscall was patched. Emit event and continue execution. auto ev = Event::patch_syscall(); ev.PatchSyscall().patch_after_syscall = true; t->record_event(ev); } t->retry_syscall_patching = false; } } } last_task_switchable = ALLOW_SWITCH; step_state->continue_type = DONT_CONTINUE; if (!is_in_privileged_syscall(t)) { maybe_trigger_emulated_ptrace_syscall_exit_stop(t); } return; } default: FATAL() << "Unknown exec state " << t->ev().Syscall().state; } } /** If the perf counters seem to be working, return normally; otherwise * record a spawn failure in |step_result|. */ void RecordSession::check_initial_task_syscalls(RecordTask* t, RecordResult* step_result) { if (done_initial_exec()) { return; } if (is_write_syscall(t->ev().Syscall().number, t->arch()) && t->regs().arg1_signed() == -1) { Ticks ticks = t->tick_count(); LOG(debug) << "ticks on entry to dummy write: " << ticks; if (ticks == 0) { step_result->status = RecordSession::STEP_SPAWN_FAILED; step_result->failure_message = string( "rr internal recorder error: Performance counter doesn't seem to " "be working. "
"Are you perhaps running rr in a VM but didn't enable " "perf-counter virtualization?"); } } if (is_exit_group_syscall(t->ev().Syscall().number, t->arch())) { step_result->status = RecordSession::STEP_SPAWN_FAILED; step_result->failure_message = read_spawned_task_error(); } } RecordTask* RecordSession::revive_task_for_exec(pid_t rec_tid) { unsigned long msg = 0; int ret = ptrace(__ptrace_request(PTRACE_GETEVENTMSG), rec_tid, nullptr, &msg); if (ret < 0) { FATAL() << "Can't get old tid for execve (leader=" << rec_tid << ")"; } RecordTask* t = find_task(msg); if (!t) { FATAL() << "Can't find old task for execve"; } ASSERT(t, rec_tid == t->tgid()); pid_t own_namespace_tid = t->thread_group()->tgid_own_namespace; LOG(debug) << "Changing task tid from " << t->tid << " to " << rec_tid; // Pretend the old task cloned a new task with the right tid, and then exited trace_writer().write_task_event(TraceTaskEvent::for_clone( rec_tid, t->tid, own_namespace_tid, CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM)); trace_writer().write_task_event( TraceTaskEvent::for_exit(t->tid, WaitStatus::for_exit_code(0))); // Account for tid change task_map.erase(t->tid); task_map.insert(make_pair(rec_tid, t)); // Update the serial as if this task was really created by cloning the old // task. t->set_tid_and_update_serial(rec_tid, own_namespace_tid); return t; } /** * Take a NativeArch::siginfo_t& here instead of siginfo_t because different * versions of system headers have inconsistent field naming. */ template <typename Arch> static void setup_sigframe_siginfo_arch(RecordTask* t, const siginfo_t& siginfo) { remote_ptr<typename Arch::siginfo_t> dest; switch (Arch::arch()) { case x86: { auto p = t->regs().sp().cast<typename Arch::unsigned_word>() + 2; dest = t->read_mem(p); break; } case x86_64: dest = t->regs().si(); break; case aarch64: dest = t->regs().x1(); break; default: DEBUG_ASSERT(0 && "Unknown architecture"); break; } typename Arch::siginfo_t si = t->read_mem(dest); set_arch_siginfo(siginfo, t->arch(), &si, sizeof(si)); t->write_mem(dest, si); } static void setup_sigframe_siginfo(RecordTask* t, const siginfo_t& siginfo) { RR_ARCH_FUNCTION(setup_sigframe_siginfo_arch, t->arch(), t, siginfo); } /** * Get t into a state where resume_execution with a signal will actually work. */ static bool preinject_signal(RecordTask* t) { int sig = t->ev().Signal().siginfo.si_signo; /* Signal injection is tricky. Per the ptrace(2) man page, injecting * a signal while the task is not in a signal-stop is not guaranteed to work * (and indeed, we see that the kernel sometimes ignores such signals). * But some signals must be delayed until after the signal-stop that notified * us of them. * So, first we check if we're in a signal-stop that we can use to inject * a signal. Some (all?) SIGTRAP stops are *not* usable for signal injection. */ if (t->stop_sig() && t->stop_sig() != SIGTRAP) { LOG(debug) << " in signal-stop for " << signal_name(t->stop_sig()); } else { /* We're not in a usable signal-stop. Force a signal-stop by sending * a new signal with tgkill (as the ptrace(2) man page recommends). */ LOG(debug) << " maybe not in signal-stop (status " << t->status() << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)"; // Always send SYSCALLBUF_DESCHED_SIGNAL because other signals (except // TIME_SLICE_SIGNAL) will be blocked by // RecordTask::will_resume_execution(). t->tgkill(t->session().syscallbuf_desched_sig()); t->move_to_signal_stop(); if (t->status().ptrace_event() == PTRACE_EVENT_EXIT) { /* We raced with an exit (e.g. due to a pending SIGKILL).
*/ return false; } ASSERT(t, t->stop_sig() == t->session().syscallbuf_desched_sig()) << "Expected SYSCALLBUF_DESCHED_SIGNAL, got " << t->status(); /* We're now in a signal-stop */ } /* Now that we're in a signal-stop, we can inject our signal and advance * to the signal handler with one single-step. */ LOG(debug) << " injecting signal " << signal_name(sig); t->set_siginfo(t->ev().Signal().siginfo); return true; } /** * Returns true if the signal should be delivered. * Returns false if this signal should not be delivered because another signal * occurred during delivery. * Must call t->stashed_signal_processed() once we're ready to unmask signals. */ static bool inject_handled_signal(RecordTask* t) { if (!preinject_signal(t)) { // Task prematurely exited. return false; } // If there aren't any more stashed signals, it's OK to stop blocking all // signals. t->stashed_signal_processed(); int sig = t->ev().Signal().siginfo.si_signo; do { // We are ready to inject our signal. // XXX we assume the kernel won't respond by notifying us of a different // signal. We don't want to do this with signals blocked because that will // save a bogus signal mask in the signal frame. t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT, RESUME_NO_TICKS, sig); // Signal injection can change the sigmask due to sa_mask effects, lack of // SA_NODEFER, and signal frame construction triggering a synchronous // SIGSEGV. t->invalidate_sigmask(); // Repeat injection if we got a desched signal. We observe in Linux 4.14.12 // that we get SYSCALLBUF_DESCHED_SIGNAL here once in a while. } while (t->stop_sig() == t->session().syscallbuf_desched_sig()); if (t->stop_sig() == SIGSEGV) { // Constructing the signal handler frame must have failed. Stash the signal // to deliver it later. t->stash_sig(); if (sig == SIGSEGV) { // The kernel will kill the process after this. Make sure we know to treat // it as fatal when we inject it. Also disable the signal handler to match // what the kernel does. t->did_set_sig_handler_default(SIGSEGV); t->thread_group()->received_sigframe_SIGSEGV = true; } return false; } // We stepped into a user signal handler. ASSERT(t, t->stop_sig() == SIGTRAP) << "Got unexpected status " << t->status(); ASSERT(t, t->get_signal_user_handler(sig) == t->ip()) << "Expected handler IP " << t->get_signal_user_handler(sig) << ", got " << t->ip() << "; actual signal mask=" << HEX(t->read_sigmask_from_process()) << " (cached " << HEX(t->get_sigmask()) << ")"; if (t->signal_handler_takes_siginfo(sig)) { // The kernel copied siginfo into userspace so it can pass a pointer to // the signal handler. Replace the contents of that siginfo with // the exact data we want to deliver. (We called Task::set_siginfo // above to set that data, but the kernel sanitizes the passed-in data // which wipes out certain fields; e.g. we can't set SI_KERNEL in si_code.) setup_sigframe_siginfo(t, t->ev().Signal().siginfo); } // The kernel clears the FPU state on entering the signal handler, but prior // to 4.7 or thereabouts ptrace can still return stale values. Fix that here. // This also sets bit 0 of the XINUSE register to 1 to avoid issues where it // gets set to 1 nondeterministically. ExtraRegisters e = t->extra_regs(); e.reset(); t->set_extra_regs(e); return true; } /** * |t| is being delivered a signal, and its state changed. * Must call t->stashed_signal_processed() once we're ready to unmask signals.
*/ bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) { int sig = t->ev().Signal().siginfo.si_signo; switch (t->ev().type()) { case EV_SIGNAL: { // This event is used by the replayer to advance to // the point of signal delivery. t->record_current_event(); t->ev().transform(EV_SIGNAL_DELIVERY); ssize_t sigframe_size = 0; bool has_handler = t->signal_has_user_handler(sig); if (has_handler) { LOG(debug) << " " << t->tid << ": " << signal_name(sig) << " has user handler"; if (!inject_handled_signal(t)) { // Signal delivery isn't happening. Prepare to process the new // signal that aborted signal delivery. t->signal_delivered(sig); t->pop_signal_delivery(); step_state->continue_type = DONT_CONTINUE; last_task_switchable = PREVENT_SWITCH; break; } if (is_x86ish(t->arch())) { // It's somewhat difficult engineering-wise to // compute the sigframe size at compile time, // and it can vary across kernel versions and CPU // microarchitectures. So this size is an overestimate // of the real size(s). // // If this size becomes too small in the // future, and unit tests that use sighandlers // are run with checksumming enabled, then // they can catch errors here. sigframe_size = 1152 /* Overestimate of kernel sigframe */ + 128 /* Redzone */ + /* this returns 512 when XSAVE unsupported */ xsave_area_size(); } else if (t->arch() == aarch64) { sigframe_size = sizeof(ARM64Arch::rt_sigframe) + sizeof(ARM64Arch::user_fpsimd_state); } else { DEBUG_ASSERT(0 && "Add sigframe size for your architecture here"); } t->ev().transform(EV_SIGNAL_HANDLER); t->signal_delivered(sig); // We already continued! Don't continue now, and allow switching. step_state->continue_type = DONT_CONTINUE; last_task_switchable = ALLOW_SWITCH; } else { t->stashed_signal_processed(); LOG(debug) << " " << t->tid << ": no user handler for " << signal_name(sig); // Don't do another task continue. We want to deliver the signal // as the next thing that the task does. step_state->continue_type = DONT_CONTINUE; // If we didn't set up the sighandler frame, we need // to ensure that this tracee is scheduled next so // that we can deliver the signal normally. We have // to do that because setting up the sighandler frame // is synchronous, but delivery otherwise is async. // But right after this, we may have to process some // syscallbuf state, so we can't let the tracee race // with us. last_task_switchable = PREVENT_SWITCH; } // We record this data even if sigframe_size is zero to simplify replay. // Stop recording data if we run off the end of a writable mapping. // Our sigframe size is conservative so we need to do this. t->record_remote_writable(t->regs().sp(), sigframe_size); // This event is used by the replayer to set up the signal handler frame. // But if we don't have a handler, we don't want to record the event // until we deal with the EV_SIGNAL_DELIVERY. if (has_handler) { t->record_current_event(); } break; } case EV_SIGNAL_DELIVERY: { // A SIGSTOP requires us to allow switching to another task. // So does a fatal, core-dumping signal, since we need to allow other // tasks to proceed to their exit events. bool is_deterministic = t->ev().Signal().deterministic == DETERMINISTIC_SIG; // Signals that would normally be fatal are just ignored for init processes, // unless they're deterministic. bool is_fatal = t->ev().Signal().disposition == DISPOSITION_FATAL && (!t->is_container_init() || is_deterministic); Switchable can_switch = ((is_fatal && is_coredumping_signal(sig)) || sig == SIGSTOP) ? 
ALLOW_SWITCH : PREVENT_SWITCH; // We didn't record this event above, so do that now. // NB: If there is no handler, and we interrupted a syscall, and there are // no more actionable signals, the kernel sets us up for a syscall // restart. But it does that *after* the ptrace trap. To replay this // correctly we need to fake those changes here. But we don't do this // if we're going to switch away at the ptrace trap, and for the moment, // 'can_switch' is actually 'will_switch'. // This is essentially copied from do_signal in arch/x86/kernel/signal.c bool has_other_signals = t->has_any_actionable_signal(); auto r = t->regs(); if (!is_fatal) { Event *prev_ev = t->prev_ev(); if (can_switch == PREVENT_SWITCH && !has_other_signals && prev_ev && EV_SYSCALL_INTERRUPTION == prev_ev->type()) { switch (prev_ev->Syscall().regs.syscall_result_signed()) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: r.set_syscallno(r.original_syscallno()); break; case -ERESTART_RESTARTBLOCK: r.set_syscallno(syscall_number_for_restart_syscall(t->arch())); break; } r.set_ip(r.ip().decrement_by_syscall_insn_length(t->arch())); // Now that we've mucked with the registers, we can't switch tasks. That // could allow more signals to be generated, breaking our assumption // that we are the last signal. } else { // But if we didn't touch the registers switching here is ok. can_switch = ALLOW_SWITCH; } } t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, RecordTask::ALLOW_RESET_SYSCALLBUF, &r); // Don't actually set_regs(r), the kernel does these modifications. if (t->is_container_init() && is_fatal) { // Nondeterministic signals were already filtered out. ASSERT(t, is_deterministic); // Usually, the kernel removes the killable-protection from an init process // when a deterministic fatal signal gets executed, but (due to what is // arguably a bug) when a ptracer is attached, this does not happen. // If we try to inject it here, the kernel will just ignore it, // and we'll go around again. As a hack, we detach here, in the // expectation that the deterministic instruction will run again and // actually kill the task now that it isn't under ptrace control anymore. t->destroy_buffers(nullptr, nullptr); WaitStatus exit_status = WaitStatus::for_fatal_sig(sig); record_exit_trace_event(t, exit_status); // Allow writing child_tid now because otherwise the write will race t->record_exit_event(sig, RecordTask::WRITE_CHILD_TID); // On a real affected kernel, we probably would have never gotten here, // since the signal we would be seeing was not deterministic, but let's // be conservative and still try to emulate the ptrace stop. t->do_ptrace_exit_stop(exit_status); t->did_kill(); t->detach(); // Not really, but we detached, so we're never gonna see that event // anyway, so just pretend we're there already t->did_reach_zombie(); return true; } // Only inject fatal signals. Non-fatal signals with signal handlers // were taken care of above; for non-fatal signals without signal // handlers, there is no need to deliver the signal at all. In fact, // there is really no way to inject a non-fatal, non-handled signal // without letting the task execute at least one instruction, which // we don't want to do here.
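// For reference, a minimal sketch of what "injection" means at the ptrace level (standard ptrace(2) semantics, not rr code): the signal number rides along as the data argument when resuming a signal-stopped tracee, e.g. // ptrace(PTRACE_CONT, tid, nullptr, (void*)(uintptr_t)sig); // preinject_signal() and resume_execution() below wrap this, first forcing the tracee into a usable signal-stop, since injection from other kinds of stops is unreliable.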
bool inject_signal = is_fatal && sig != get_continue_through_sig(); if (inject_signal) { preinject_signal(t); t->resume_execution(RESUME_CONT, RESUME_NONBLOCKING, RESUME_NO_TICKS, sig); } t->signal_delivered(sig); if (!inject_signal || !is_coredumping_signal(sig)) { /* Fatal signals may core-dump, so we don't consider the signal * delivery complete until we've actually managed to advance past that */ t->pop_signal_delivery(); } // Mark each task in this address space as expecting a ptrace exit // to avoid causing any ptrace_exit races. if (is_fatal && is_coredumping_signal(sig)) { for (Task *ot : t->vm()->task_set()) { if (t != ot) { ((RecordTask *)ot)->waiting_for_ptrace_exit = true; } } } last_task_switchable = can_switch; step_state->continue_type = DONT_CONTINUE; break; } default: FATAL() << "Unhandled signal state " << t->ev().type(); break; } return false; } bool RecordSession::handle_signal_event(RecordTask* t, StepState* step_state) { int sig = t->stop_sig(); if (!sig) { return false; } if (!done_initial_exec()) { // If the initial tracee isn't prepared to handle // signals yet, then us ignoring the ptrace // notification here will have the side effect of // declining to deliver the signal. // // This doesn't really occur in practice, only in // tests that force a degenerately low time slice. LOG(warn) << "Dropping " << signal_name(sig) << " because it can't be delivered yet"; // These signals might have effects on the sigmask. t->invalidate_sigmask(); // No events to be recorded, so no syscallbuf updates // needed. return true; } if (sig == SIGTRAP && handle_syscallbuf_breakpoint(t)) { return true; } SignalDeterministic deterministic = is_deterministic_signal(t); // The kernel might have forcibly unblocked the signal. Check whether it // was blocked now, before we update our cached sigmask. SignalBlocked signal_was_blocked = t->is_sig_blocked(sig) ? SIG_BLOCKED : SIG_UNBLOCKED; if (deterministic || sig == t->session().syscallbuf_desched_sig()) { // Don't stash these signals; deliver them immediately. // We don't want them to be reordered around other signals. // invalidate_sigmask() must not be called before we reach handle_signal! siginfo_t siginfo = t->get_siginfo(); switch (handle_signal(t, &siginfo, deterministic, signal_was_blocked)) { case SIGNAL_PTRACE_STOP: // Emulated ptrace-stop. Don't run the task again yet. last_task_switchable = ALLOW_SWITCH; step_state->continue_type = DONT_CONTINUE; return true; case DEFER_SIGNAL: ASSERT(t, false) << "Can't defer deterministic or internal signal " << siginfo << " at ip " << t->ip(); break; case SIGNAL_HANDLED: if (t->ptrace_event() == PTRACE_EVENT_SECCOMP) { // `handle_desched_event` detected a spurious desched followed // by a SECCOMP event, which it left pending. Handle that SECCOMP // event now. bool dummy_did_enter_syscall; handle_ptrace_event(&t, step_state, nullptr, &dummy_did_enter_syscall); ASSERT(t, !dummy_did_enter_syscall); } break; } return false; } // Conservatively invalidate the sigmask in case just accepting a signal has // sigmask effects. t->invalidate_sigmask(); if (sig == PerfCounters::TIME_SLICE_SIGNAL) { if (t->next_pmc_interrupt_is_for_user) { auto vpmc = VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t); ASSERT(t, vpmc); // Synthesize the requested signal. vpmc->synthesize_signal(t); t->next_pmc_interrupt_is_for_user = false; return true; } auto& si = t->get_siginfo(); /* This implementation will of course fall over if rr tries to * record itself.
* * NB: we can't check that the tick count is >= the programmed * target, because this signal may have become pending before * we reset the HPC counters. There may be a way to handle that * more elegantly, but that bridge will be crossed in due time. * * We can't check that the fd matches t->hpc.ticks_fd() because this * signal could have been queued quite a long time ago and the PerfCounters * might have been stopped (and restarted!), perhaps even more than once, * since the signal was queued, possibly changing its fd. We could check * against all fds the PerfCounters have ever used, but that seems like * overkill. */ ASSERT(t, PerfCounters::TIME_SLICE_SIGNAL == si.si_signo && (RecordTask::SYNTHETIC_TIME_SLICE_SI_CODE == si.si_code || POLL_IN == si.si_code)) << "Tracee is using SIGSTKFLT??? (code=" << si.si_code << ", fd=" << si.si_fd << ")"; } t->stash_sig(); return true; } template <typename Arch> static bool is_ptrace_any_sysemu_arch(int command) { return command >= 0 && (command == Arch::PTRACE_SYSEMU || command == Arch::PTRACE_SYSEMU_SINGLESTEP); } static bool is_ptrace_any_sysemu(SupportedArch arch, int command) { RR_ARCH_FUNCTION(is_ptrace_any_sysemu_arch, arch, command); } bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state, RecordResult* step_result, SupportedArch syscall_arch) { if (const siginfo_t* si = t->stashed_sig_not_synthetic_SIGCHLD()) { // The only four cases where we allow a stashed signal to be pending on // syscall entry are: // -- the signal is a ptrace-related signal, in which case if it's generated // during a blocking syscall, it does not interrupt the syscall // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and // lets us dispatch signals afterward // -- when we're entering a blocking untraced syscall. If it really blocks, // we'll get the desched-signal notification and dispatch our stashed // signal. // -- when we're doing a privileged syscall that's internal to the preload // logic // We do not generally want to have stashed signals pending when we enter // a syscall, because that will execute with a hacked signal mask // (see RecordTask::will_resume_execution) which could make things go wrong. ASSERT(t, t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall( t->regs().original_syscallno(), t->arch()) || t->ip() == t->vm() ->privileged_traced_syscall_ip() .increment_by_syscall_insn_length(t->arch())) << "Stashed signal pending on syscall entry when it shouldn't be: " << *si << "; IP=" << t->ip(); } // We just entered a syscall. if (!maybe_restart_syscall(t)) { // Emit FLUSH_SYSCALLBUF if necessary before we do any patching work t->maybe_flush_syscallbuf(); if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN && t->seccomp_bpf_enabled) { // We received a PTRACE_SYSCALL notification before the seccomp // notification. Ignore it and continue to the seccomp notification. syscall_seccomp_ordering_ = PTRACE_SYSCALL_BEFORE_SECCOMP; step_state->continue_type = CONTINUE; return true; } // Don't ever patch a sigreturn syscall. These can't go through the syscallbuf. if (!is_sigreturn(t->regs().original_syscallno(), t->arch())) { if (t->vm()->monkeypatcher().try_patch_syscall(t)) { // Syscall was patched. Emit event and continue execution. t->record_event(Event::patch_syscall()); return true; } } if (t->ptrace_event() == PTRACE_EVENT_EXIT) { // task exited while we were trying to patch it.
// Make sure that this exit event gets processed step_state->continue_type = DONT_CONTINUE; return false; } t->push_event(SyscallEvent(t->regs().original_syscallno(), syscall_arch)); } check_initial_task_syscalls(t, step_result); note_entering_syscall(t); if ((t->emulated_ptrace_cont_command == PTRACE_SYSCALL || is_ptrace_any_sysemu(t->arch(), t->emulated_ptrace_cont_command)) && !is_in_privileged_syscall(t)) { t->ev().Syscall().state = ENTERING_SYSCALL_PTRACE; t->emulate_ptrace_stop(WaitStatus::for_syscall(t)); t->record_current_event(); t->ev().Syscall().in_sysemu = is_ptrace_any_sysemu(t->arch(), t->emulated_ptrace_cont_command); } return true; } /** * The execution of |t| has just been resumed, and it most likely has * a new event that needs to be processed. Prepare that new event. */ void RecordSession::runnable_state_changed(RecordTask* t, StepState* step_state, RecordResult* step_result, bool can_consume_wait_status) { switch (t->ev().type()) { case EV_NOOP: t->pop_noop(); return; case EV_INSTRUCTION_TRAP: t->record_current_event(); t->pop_event(t->ev().type()); return; case EV_SENTINEL: case EV_SIGNAL_HANDLER: case EV_SYSCALL_INTERRUPTION: { if (!can_consume_wait_status) { return; } SupportedArch syscall_arch = t->detect_syscall_arch(); t->canonicalize_regs(syscall_arch); t->apply_syscall_entry_regs(); process_syscall_entry(t, step_state, step_result, syscall_arch); return; } default: return; } } bool RecordSession::prepare_to_inject_signal(RecordTask* t, StepState* step_state) { if (!done_initial_exec() || step_state->continue_type != CONTINUE) { return false; } union { NativeArch::siginfo_t native_api; siginfo_t linux_api; } si; const RecordTask::StashedSignal* sig; while (true) { sig = t->peek_stashed_sig_to_deliver(); if (!sig) { return false; } si.linux_api = sig->siginfo; if (si.linux_api.si_signo == get_ignore_sig()) { LOG(debug) << "Declining to deliver " << signal_name(si.linux_api.si_signo) << " by user request"; t->pop_stash_sig(sig); t->stashed_signal_processed(); } else { break; } } if (sig->deterministic == DETERMINISTIC_SIG && sig->siginfo.si_signo == SIGSYS && t->is_sig_blocked(sig->siginfo.si_signo) == SIG_BLOCKED) { // Our synthesized deterministic SIGSYS (seccomp trap) needs to match the // kernel behavior of unblocking the signal and resetting disposition to // default. t->unblock_signal(SIGSYS); t->set_sig_handler_default(SIGSYS); } switch (handle_signal(t, &si.linux_api, sig->deterministic, SIG_UNBLOCKED)) { case SIGNAL_PTRACE_STOP: // Emulated ptrace-stop. Don't run the task again yet. last_task_switchable = ALLOW_SWITCH; LOG(debug) << signal_name(si.linux_api.si_signo) << ", emulating ptrace stop"; break; case DEFER_SIGNAL: LOG(debug) << signal_name(si.linux_api.si_signo) << " deferred"; // Leave signal on the stack and continue task execution. We'll try again // later. return false; case SIGNAL_HANDLED: LOG(debug) << signal_name(si.linux_api.si_signo) << " handled"; // Signal is now a pending event on |t|'s event stack if (t->ev().type() == EV_SCHED) { if (t->maybe_in_spinlock()) { LOG(debug) << "Detected possible spinlock, forcing one round-robin"; scheduler().schedule_one_round_robin(t); } // Allow switching after a SCHED. We'll flush the SCHED if and only // if we really do a switch.
last_task_switchable = ALLOW_SWITCH; } break; } step_state->continue_type = DONT_CONTINUE; t->pop_stash_sig(sig); if (t->ev().type() != EV_SIGNAL) { t->stashed_signal_processed(); } return true; } static void inject_ld_helper_library(vector<string>& env, string env_var, string value) { // Our preload lib should come first if possible, because that will speed up // the loading of the other libraries; it's also a good idea to put our audit // library at the head of the list, since there are only sixteen possible link // namespaces on glibc and each audit library uses up one. // // We supply a placeholder which is then mutated to the correct filename in // Monkeypatcher::patch_after_exec. auto env_assignment = env_var + "="; auto it = env.begin(); for (; it != env.end(); ++it) { if (it->find(env_assignment) != 0) { continue; } // Honor old preloads too. This may cause // problems, but only in those libs, and // that's the user's problem. value += ":"; value += it->substr(it->find("=") + 1); break; } value = env_assignment + value; if (it == env.end()) { env.push_back(value); } else { *it = value; } } void strip_outer_ld_preload(vector<string>& env) { auto env_assignment = "LD_PRELOAD="; auto it = env.begin(); for (; it != env.end(); ++it) { if (it->find(env_assignment) != 0) { continue; } size_t colon_pos = it->find(":"); if (colon_pos != string::npos) { // If the preload library is loaded at all, it must be first size_t preload_pos = it->find("librrpreload"); if (preload_pos < colon_pos) { string new_ld_preload = it->substr(++colon_pos); *it = env_assignment + new_ld_preload; return; } else { DEBUG_ASSERT(preload_pos == string::npos); } } } } struct ExeInfo { ExeInfo() : has_asan_symbols(false) {} // Empty if anything fails string libasan_path; bool has_asan_symbols; }; static ExeInfo read_exe_info(const string& exe_file) { ExeInfo ret; ScopedFd fd(exe_file.c_str(), O_RDONLY); if (!fd.is_open()) { return ret; } ElfFileReader reader(fd); DynamicSection dynamic = reader.read_dynamic(); for (auto& entry : dynamic.entries) { if (entry.tag == DT_NEEDED && entry.val < dynamic.strtab.size()) { const char* name = &dynamic.strtab[entry.val]; if (!strncmp(name, "libasan", 7)) { ret.libasan_path = string(name); } } } auto syms = reader.read_symbols(".dynsym", ".dynstr"); for (size_t i = 0; i < syms.size(); ++i) { if (syms.is_name(i, "__asan_init")) { ret.has_asan_symbols = true; } } return ret; } static string lookup_by_path(const string& name) { if (name.find('/') != string::npos) { return name; } const char* env = getenv("PATH"); if (!env) { return name; } char* p = strdup(env); char* s = p; while (*s) { char* next = strchr(s, ':'); if (next) { *next = 0; } string file = string(s) + "/" + name; struct stat st; if (!stat(file.c_str(), &st) && S_ISREG(st.st_mode) && !access(file.c_str(), X_OK)) { free(p); return file; } if (!next) { break; } s = next + 1; } free(p); return name; } /*static*/ RecordSession::shr_ptr RecordSession::create( const vector<string>& argv, const vector<string>& extra_env, const DisableCPUIDFeatures& disable_cpuid_features, SyscallBuffering syscallbuf, unsigned char syscallbuf_desched_sig, BindCPU bind_cpu, const string& output_trace_dir, const TraceUuid* trace_id, bool use_audit, bool unmap_vdso, bool force_asan_active) { // The syscallbuf library interposes some critical // external symbols like XShmQueryExtension(), so we // preload it whether or not syscallbuf is enabled. Indicate here whether // syscallbuf is enabled.
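// (Sketch of the consumer side, assuming the preload library simply checks this variable at load time; the actual string behind SYSCALLBUF_ENABLED_ENV_VAR is defined elsewhere in rr: // if (getenv(SYSCALLBUF_ENABLED_ENV_VAR)) { /* engage buffering */ } // so clearing it below disables buffering without unloading the library.)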
if (syscallbuf == DISABLE_SYSCALL_BUF) { unsetenv(SYSCALLBUF_ENABLED_ENV_VAR); } else { setenv(SYSCALLBUF_ENABLED_ENV_VAR, "1", 1); if (!has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) && !has_effective_caps(uint64_t(1) << CAP_PERFMON)) { ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); if (fd.is_open()) { char buf[100]; ssize_t size = read(fd, buf, sizeof(buf) - 1); if (size >= 0) { buf[size] = 0; int val = atoi(buf); if (val > 1) { fprintf(stderr, "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n" "Change it to 1, or use 'rr record -n' (slow).\n" "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n" "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n" "and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n", val); exit(1); } } } } } vector<string> env = current_env(); // Have extra_env override anything already in the environment for (string extra : extra_env) { string extra_var = extra.substr(0, extra.find('=')); auto it = env.begin(); for (; it != env.end(); ++it) { if (it->find(extra_var) != 0) { continue; } it = env.erase(it); break; } } env.insert(env.end(), extra_env.begin(), extra_env.end()); string full_path = lookup_by_path(argv[0]); ExeInfo exe_info = read_exe_info(full_path); // Strip any LD_PRELOAD that an outer rr may have inserted strip_outer_ld_preload(env); // LD_PRELOAD the syscall interception lib string syscall_buffer_lib_path = find_helper_library(SYSCALLBUF_LIB_FILENAME); if (!syscall_buffer_lib_path.empty()) { string ld_preload = ""; if (!exe_info.libasan_path.empty()) { LOG(debug) << "Prepending " << exe_info.libasan_path << " to LD_PRELOAD"; // Put an LD_PRELOAD entry for it before our preload library, because // it checks that it's loaded first ld_preload += exe_info.libasan_path + ":"; } ld_preload += syscall_buffer_lib_path + SYSCALLBUF_LIB_FILENAME_PADDED; inject_ld_helper_library(env, "LD_PRELOAD", ld_preload); } if (use_audit) { string rtld_audit_lib_path = find_helper_library(RTLDAUDIT_LIB_FILENAME); if (!rtld_audit_lib_path.empty()) { string ld_audit = rtld_audit_lib_path + RTLDAUDIT_LIB_FILENAME_PADDED; inject_ld_helper_library(env, "LD_AUDIT", ld_audit); } } env.push_back("RUNNING_UNDER_RR=1"); // Stop Mesa using the GPU env.push_back("LIBGL_ALWAYS_SOFTWARE=1"); env.push_back("GBM_ALWAYS_SOFTWARE=1"); env.push_back("SDL_RENDER_DRIVER=software"); // Stop sssd from using shared-memory with its daemon env.push_back("SSS_NSS_USE_MEMCACHE=NO"); // Disable Gecko's "wait for gdb to attach on process crash" behavior, since // it is useless when running under rr. env.push_back("MOZ_GDB_SLEEP=0"); // If we have CPUID faulting, don't use these environment hacks. We don't // need them and the user might want to use them themselves for other reasons. if (!Session::has_cpuid_faulting()) { // OpenSSL uses RDRAND, but we can disable it. These bitmasks are inverted // and ANDed with the results of CPUID. The number below is 2^62, which is the // bit for RDRAND support.
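// (Worked out: OPENSSL_ia32cap packs CPUID.1:EDX into the low 32 bits and CPUID.1:ECX into the high 32 bits of its first value; RDRAND is ECX bit 30, i.e. bit 30 + 32 = 62 of the combined word, and 2^62 = 4611686018427387904. The second, '~0' value masks off the whole extended-feature word, which covers RDSEED among others.)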
env.push_back("OPENSSL_ia32cap=~4611686018427387904:~0"); // Disable Qt's use of RDRAND/RDSEED/RTM env.push_back("QT_NO_CPU_FEATURE=rdrnd rdseed rtm"); // Disable systemd's use of RDRAND env.push_back("SYSTEMD_RDRAND=0"); } shr_ptr session( new RecordSession(full_path, argv, env, disable_cpuid_features, syscallbuf, syscallbuf_desched_sig, bind_cpu, output_trace_dir, trace_id, use_audit, unmap_vdso)); session->set_asan_active(force_asan_active || !exe_info.libasan_path.empty() || exe_info.has_asan_symbols); return session; } RecordSession::RecordSession(const std::string& exe_path, const std::vector& argv, const std::vector& envp, const DisableCPUIDFeatures& disable_cpuid_features, SyscallBuffering syscallbuf, int syscallbuf_desched_sig, BindCPU bind_cpu, const string& output_trace_dir, const TraceUuid* trace_id, bool use_audit, bool unmap_vdso) : trace_out(argv[0], output_trace_dir, ticks_semantics_), scheduler_(*this), trace_id(trace_id), disable_cpuid_features_(disable_cpuid_features), ignore_sig(0), continue_through_sig(0), last_task_switchable(PREVENT_SWITCH), syscall_buffer_size_(1024 * 1024), syscallbuf_desched_sig_(syscallbuf_desched_sig), use_syscall_buffer_(syscallbuf == ENABLE_SYSCALL_BUF), use_file_cloning_(true), use_read_cloning_(true), enable_chaos_(false), asan_active_(false), wait_for_all_(false), use_audit_(use_audit), unmap_vdso_(unmap_vdso) { if (!has_cpuid_faulting() && disable_cpuid_features.any_features_disabled()) { FATAL() << "CPUID faulting required to disable CPUID features"; } if (rr::syscall_number_for_rrcall_init_preload(x86_64) != RR_CALL_BASE) { FATAL() << "RR_CALL_BASE is incorrect"; } trace_out.set_bound_cpu(choose_cpu(bind_cpu, cpu_lock)); do_bind_cpu(trace_out); ScopedFd error_fd = create_spawn_task_error_pipe(); RecordTask* t = static_cast( Task::spawn(*this, error_fd, &tracee_socket_fd(), &tracee_socket_receiver_fd(), &tracee_socket_fd_number, exe_path, argv, envp)); if (NativeArch::is_x86ish()) { // CPU affinity has been set. trace_out.setup_cpuid_records(has_cpuid_faulting(), disable_cpuid_features_); if (cpu_has_xsave_fip_fdp_quirk()) { trace_out.set_xsave_fip_fdp_quirk(true); // Clear FIP/FDP on every event to reduce the probability of this quirk // causing divergence, especially when porting traces to Intel machines trace_out.set_clear_fip_fdp(true); } if (cpu_has_fdp_exception_only_quirk()) { trace_out.set_fdp_exception_only_quirk(true); } } initial_thread_group = t->thread_group(); on_create(t); } RecordSession::RecordResult RecordSession::record_step() { RecordResult result; if (task_map.empty()) { result.status = STEP_EXITED; result.exit_status = initial_thread_group->exit_status; return result; } if (!wait_for_all_ && initial_thread_group->task_set().empty()) { // SIGKILL any tasks we haven't already killed. terminate_tracees(); } result.status = STEP_CONTINUE; TaskUid prev_task_tuid; if (scheduler().current()) { prev_task_tuid = scheduler().current()->tuid(); } auto rescheduled = scheduler().reschedule(last_task_switchable); if (rescheduled.interrupted_by_signal) { // The scheduler was waiting for some task to become active, but was // interrupted by a signal. Yield to our caller now to give the caller // a chance to do something triggered by the signal // (e.g. terminate the recording). 
return result; } RecordTask* t = scheduler().current(); if (t->waiting_for_reap) { // Give it another chance to be reaped t->did_reach_zombie(); return result; } RecordTask* prev_task = find_task(prev_task_tuid); if (prev_task && prev_task->ev().type() == EV_SCHED) { if (prev_task != t) { // We did do a context switch, so record the SCHED event. Otherwise // we'll just discard it. prev_task->record_current_event(); } prev_task->pop_event(EV_SCHED); } if (rescheduled.started_new_timeslice) { t->registers_at_start_of_last_timeslice = t->regs(); t->time_at_start_of_last_timeslice = trace_writer().time(); } // Have to disable context-switching until we know it's safe // to allow switching the context. last_task_switchable = PREVENT_SWITCH; LOG(debug) << "trace time " << t->trace_time() << ": Active task is " << t->tid << ". Events:"; if (IS_LOGGING(debug)) { t->log_pending_events(); } if (handle_ptrace_exit_event(t)) { // t may have been deleted. last_task_switchable = ALLOW_SWITCH; return result; } StepState step_state(CONTINUE); bool did_enter_syscall; if (rescheduled.by_waitpid && handle_ptrace_event(&t, &step_state, &result, &did_enter_syscall)) { if (result.status != STEP_CONTINUE || step_state.continue_type == DONT_CONTINUE) { return result; } if (did_enter_syscall && t->ev().type() == EV_SYSCALL) { syscall_state_changed(t, &step_state); } } else if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) { } else { runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid); if (result.status != STEP_CONTINUE || step_state.continue_type == DONT_CONTINUE) { return result; } switch (t->ev().type()) { case EV_DESCHED: desched_state_changed(t); break; case EV_SYSCALL: syscall_state_changed(t, &step_state); break; case EV_SIGNAL: case EV_SIGNAL_DELIVERY: if (signal_state_changed(t, &step_state)) { // t may have been deleted return result; } break; default: break; } } t->verify_signal_states(); // We try to inject a signal if there's one pending; otherwise we continue // task execution. if (!prepare_to_inject_signal(t, &step_state) && step_state.continue_type != DONT_CONTINUE) { // Ensure that we aren't allowing switches away from a running task. // Only tasks blocked in a syscall can be switched away from, otherwise // we have races. ASSERT(t, last_task_switchable == PREVENT_SWITCH || t->may_be_blocked()); debug_exec_state("EXEC_START", t); task_continue(step_state); } return result; } void RecordSession::terminate_tracees() { for (auto& v : task_map) { RecordTask* t = static_cast(v.second); if (!t->detached_proxy && !t->sent_shutdown_kill) { LOG(debug) << "Terminating tracee " << t->tid; ::kill(t->rec_tid, SIGKILL); t->sent_shutdown_kill = true; t->emulate_SIGCONT(); } } } void RecordSession::term_detached_tasks() { // Send SIGTERM to all detached child tasks first, so they may clean up // in parallel. 
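// (Editor's note: this is a deliberate two-phase shutdown -- the first loop
// below only sends SIGTERM, the second loop blocks waiting on each child --
// so N detached children terminate in roughly the time of the slowest one
// rather than one after another.)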
for (auto& v : task_map) { RecordTask* t = static_cast(v.second); if (!t->detached_proxy) { continue; } ::kill(t->rec_tid, SIGTERM); } for (auto& v : task_map) { RecordTask* t = static_cast(v.second); if (!t->detached_proxy) { continue; } int status; pid_t ret = ::waitpid(t->rec_tid, &status, WEXITED); if (ret != t->rec_tid) { LOG(warn) << "Unexpected wait status " << WaitStatus(status) << " while waiting for detached child " << t->rec_tid; } } } void RecordSession::close_trace_writer(TraceWriter::CloseStatus status) { trace_out.close(status, trace_id.get()); } Task* RecordSession::new_task(pid_t tid, pid_t, uint32_t serial, SupportedArch a) { return new RecordTask(*this, tid, serial, a); } void RecordSession::on_create(Task* t) { Session::on_create(t); scheduler().on_create(static_cast(t)); } void RecordSession::on_destroy(Task* t) { scheduler().on_destroy(static_cast(t)); Session::on_destroy(t); } RecordTask* RecordSession::find_task(pid_t rec_tid) const { return static_cast(Session::find_task(rec_tid)); } RecordTask* RecordSession::find_task(const TaskUid& tuid) const { return static_cast(Session::find_task(tuid)); } void RecordSession::on_proxy_detach(RecordTask *t, pid_t new_tid) { Session::on_destroy(t); task_map[new_tid] = t; } uint64_t RecordSession::rr_signal_mask() const { return signal_bit(PerfCounters::TIME_SLICE_SIGNAL) | signal_bit(syscallbuf_desched_sig_); } static const uint32_t CPUID_RDRAND_FLAG = 1 << 30; static const uint32_t CPUID_RTM_FLAG = 1 << 11; static const uint32_t CPUID_RDSEED_FLAG = 1 << 18; static const uint32_t CPUID_XSAVEOPT_FLAG = 1 << 0; void DisableCPUIDFeatures::amend_cpuid_data(uint32_t eax_in, uint32_t ecx_in, CPUIDData* cpuid_data) const { switch (eax_in) { case CPUID_GETFEATURES: cpuid_data->ecx &= ~(CPUID_RDRAND_FLAG | features_ecx); cpuid_data->edx &= ~features_edx; break; case CPUID_GETEXTENDEDFEATURES: if (ecx_in == 0) { cpuid_data->ebx &= ~(CPUID_RDSEED_FLAG | CPUID_RTM_FLAG | extended_features_ebx); cpuid_data->ecx &= ~extended_features_ecx; cpuid_data->edx &= ~extended_features_edx; } break; case CPUID_GETXSAVE: if (ecx_in == 1) { // Always disable XSAVEOPT because it's nondeterministic, // possibly depending on context switching behavior. Intel // recommends not using it from user space. 
cpuid_data->eax &= ~(CPUID_XSAVEOPT_FLAG | xsave_features_eax); } break; default: break; } } } // namespace rr rr-5.5.0/src/RecordSession.h000066400000000000000000000215051412202446200156660ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_RECORD_SESSION_H_ #define RR_RECORD_SESSION_H_ #include #include #include "Scheduler.h" #include "SeccompFilterRewriter.h" #include "Session.h" #include "ThreadGroup.h" #include "TraceFrame.h" #include "WaitStatus.h" namespace rr { class RecordTask; struct DisableCPUIDFeatures { DisableCPUIDFeatures() : features_ecx(0) , features_edx(0) , extended_features_ebx(0) , extended_features_ecx(0) , extended_features_edx(0) , xsave_features_eax(0) {} bool any_features_disabled() const { return features_ecx || features_edx || extended_features_ebx || extended_features_ecx || extended_features_edx || xsave_features_eax; } /** * Includes disabling TSX and other rr-incompatible features */ void amend_cpuid_data(uint32_t eax_in, uint32_t ecx_in, CPUIDData* cpuid_data) const; /* in: EAX=0x01 */ uint32_t features_ecx; uint32_t features_edx; /* in: EAX=0x07 ECX=0 */ uint32_t extended_features_ebx; uint32_t extended_features_ecx; uint32_t extended_features_edx; /* in: EAX=0x0D ECX=1 */ uint32_t xsave_features_eax; }; struct TraceUuid { uint8_t bytes[16]; }; /** Encapsulates additional session state related to recording. */ class RecordSession : public Session { public: typedef std::shared_ptr shr_ptr; /** * Create a recording session for the initial command line |argv|. */ enum SyscallBuffering { ENABLE_SYSCALL_BUF, DISABLE_SYSCALL_BUF }; static shr_ptr create( const std::vector& argv, const std::vector& extra_env, const DisableCPUIDFeatures& features, SyscallBuffering syscallbuf = ENABLE_SYSCALL_BUF, unsigned char syscallbuf_desched_sig = SIGPWR, BindCPU bind_cpu = BIND_CPU, const std::string& output_trace_dir = "", const TraceUuid* trace_id = nullptr, bool use_audit = false, bool unmap_vdso = false, bool force_asan_active = false); const DisableCPUIDFeatures& disable_cpuid_features() const { return disable_cpuid_features_; } bool use_syscall_buffer() const { return use_syscall_buffer_; } size_t syscall_buffer_size() const { return syscall_buffer_size_; } unsigned char syscallbuf_desched_sig() const { return syscallbuf_desched_sig_; } bool use_read_cloning() const { return use_read_cloning_; } bool use_file_cloning() const { return use_file_cloning_; } void set_ignore_sig(int sig) { ignore_sig = sig; } int get_ignore_sig() const { return ignore_sig; } void set_continue_through_sig(int sig) { continue_through_sig = sig; } int get_continue_through_sig() const { return continue_through_sig; } void set_asan_active(bool active) { asan_active_ = active; } bool asan_active() const { return asan_active_; } bool use_audit() const { return use_audit_; } bool unmap_vdso() { return unmap_vdso_; } uint64_t rr_signal_mask() const; enum RecordStatus { // Some execution was recorded. record_step() can be called again. STEP_CONTINUE, // All tracees are dead. record_step() should not be called again. STEP_EXITED, // Spawning the initial tracee failed. An error message will be in // failure_message. STEP_SPAWN_FAILED }; struct RecordResult { RecordStatus status; // When status == STEP_EXITED WaitStatus exit_status; // When status == STEP_SPAWN_FAILED std::string failure_message; }; /** * Record some tracee execution. * This may block. If blocking is interrupted by a signal, will return * STEP_CONTINUE. 
* Typically you'd call this in a loop until it returns something other than * STEP_CONTINUE. * Note that when this returns, some tasks may be running (not in a ptrace- * stop). In particular, up to one task may be executing user code and any * number of tasks may be blocked in syscalls. */ RecordResult record_step(); /** * SIGKILL all tracees. */ void terminate_tracees(); /** * Close trace output without flushing syscall buffers or writing * task exit/termination records to the trace. */ void close_trace_writer(TraceWriter::CloseStatus status); virtual RecordSession* as_record() override { return this; } TraceWriter& trace_writer() { return trace_out; } virtual void on_destroy(Task* t) override; Scheduler& scheduler() { return scheduler_; } SeccompFilterRewriter& seccomp_filter_rewriter() { return seccomp_filter_rewriter_; } enum ContinueType { DONT_CONTINUE = 0, CONTINUE, CONTINUE_SYSCALL }; struct StepState { // Continue with this continuation type. ContinueType continue_type; StepState(ContinueType continue_type) : continue_type(continue_type) {} }; void set_enable_chaos(bool enable_chaos) { scheduler().set_enable_chaos(enable_chaos); enable_chaos_ = enable_chaos; trace_out.set_chaos_mode(enable_chaos); } bool enable_chaos() const { return enable_chaos_; } void set_num_cores(int num_cores) { scheduler().set_num_cores(num_cores); } void set_use_read_cloning(bool enable) { use_read_cloning_ = enable; } void set_use_file_cloning(bool enable) { use_file_cloning_ = enable; } void set_syscall_buffer_size(size_t size) { syscall_buffer_size_ = size; } void set_wait_for_all(bool wait_for_all) { this->wait_for_all_ = wait_for_all; } virtual Task* new_task(pid_t tid, pid_t rec_tid, uint32_t serial, SupportedArch a) override; RecordTask* find_task(pid_t rec_tid) const; RecordTask* find_task(const TaskUid& tuid) const; void on_proxy_detach(RecordTask *t, pid_t new_tid); /** * This gets called when we detect that a task has been revived from the * dead with a PTRACE_EVENT_EXEC. See ptrace man page under "execve(2) under * ptrace" for the horrid details. * * The task in the thread-group that triggered the successful execve has changed * its tid to |rec_tid|. We mirror that, and emit TraceTaskEvents to make it * look like a new task was spawned and the old task exited. */ RecordTask* revive_task_for_exec(pid_t rec_tid); virtual TraceStream* trace_stream() override { return &trace_out; } /** * Send SIGTERM to all detached tasks and wait for them to finish. 
*/ void term_detached_tasks(); private: RecordSession(const std::string& exe_path, const std::vector& argv, const std::vector& envp, const DisableCPUIDFeatures& features, SyscallBuffering syscallbuf, int syscallbuf_desched_sig, BindCPU bind_cpu, const std::string& output_trace_dir, const TraceUuid* trace_id, bool use_audit, bool unmap_vdso); virtual void on_create(Task* t) override; void handle_seccomp_traced_syscall(RecordTask* t, RecordSession::StepState* step_state, RecordResult* result, bool* did_enter_syscall); // Returns false if the task exits during processing bool process_syscall_entry(RecordTask* t, StepState* step_state, RecordResult* step_result, SupportedArch syscall_arch); void check_initial_task_syscalls(RecordTask* t, RecordResult* step_result); bool handle_ptrace_event(RecordTask** t_ptr, StepState* step_state, RecordResult* result, bool* did_enter_syscall); bool handle_signal_event(RecordTask* t, StepState* step_state); void runnable_state_changed(RecordTask* t, StepState* step_state, RecordResult* step_result, bool can_consume_wait_status); bool signal_state_changed(RecordTask* t, StepState* step_state); void syscall_state_changed(RecordTask* t, StepState* step_state); void desched_state_changed(RecordTask* t); bool prepare_to_inject_signal(RecordTask* t, StepState* step_state); void task_continue(const StepState& step_state); TraceWriter trace_out; Scheduler scheduler_; ThreadGroup::shr_ptr initial_thread_group; SeccompFilterRewriter seccomp_filter_rewriter_; std::unique_ptr trace_id; DisableCPUIDFeatures disable_cpuid_features_; int ignore_sig; int continue_through_sig; Switchable last_task_switchable; size_t syscall_buffer_size_; unsigned char syscallbuf_desched_sig_; bool use_syscall_buffer_; bool use_file_cloning_; bool use_read_cloning_; /** * When true, try to increase the probability of finding bugs. */ bool enable_chaos_; bool asan_active_; /** * When true, wait for all tracees to exit before finishing recording. */ bool wait_for_all_; std::string output_trace_dir; bool use_audit_; bool unmap_vdso_; }; } // namespace rr #endif // RR_RECORD_SESSION_H_ rr-5.5.0/src/RecordTask.cc000066400000000000000000002210511412202446200153010ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "RecordTask.h" #include #include #include #include #include #include #include "AutoRemoteSyscalls.h" #include "PreserveFileMonitor.h" #include "RecordSession.h" #include "core.h" #include "kernel_abi.h" #include "kernel_metadata.h" #include "log.h" #include "record_signal.h" #include "rr/rr.h" #include "util.h" using namespace std; namespace rr { /** * Stores the table of signal dispositions and metadata for an * arbitrary set of tasks. Each of those tasks must own one one of * the |refcount|s while they still refer to this. 
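 *
 * (Editor's note: in this file the "refcount" is realized as the
 * std::shared_ptr that each RecordTask keeps in its |sighandlers| member;
 * post_wait_clone() shares the table when CLONE_SHARE_SIGHANDLERS is set
 * and otherwise deep-copies it via Sighandlers::clone().)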
*/ struct Sighandler { Sighandler() : resethand(false), takes_siginfo(false) {} template void init_arch(const typename Arch::kernel_sigaction& ksa) { k_sa_handler = ksa.k_sa_handler; sa.resize(sizeof(ksa)); memcpy(sa.data(), &ksa, sizeof(ksa)); resethand = (ksa.sa_flags & SA_RESETHAND) != 0; takes_siginfo = (ksa.sa_flags & SA_SIGINFO) != 0; } template void reset_arch() { typename Arch::kernel_sigaction ksa; memset(&ksa, 0, sizeof(ksa)); DEBUG_ASSERT(uintptr_t(SIG_DFL) == 0); init_arch(ksa); } SignalDisposition disposition() const { DEBUG_ASSERT(uintptr_t(SIG_DFL) == 0); DEBUG_ASSERT(uintptr_t(SIG_IGN) == 1); switch (k_sa_handler.as_int()) { case 0: return SIGNAL_DEFAULT; case 1: return SIGNAL_IGNORE; default: return SIGNAL_HANDLER; } } remote_code_ptr get_user_handler() const { return disposition() == SIGNAL_HANDLER ? remote_code_ptr(k_sa_handler.as_int()) : remote_code_ptr(); } remote_ptr k_sa_handler; // Saved kernel_sigaction; used to restore handler vector sa; bool resethand; bool takes_siginfo; }; static void reset_handler(Sighandler* handler, SupportedArch arch) { RR_ARCH_FUNCTION(handler->reset_arch, arch); } struct Sighandlers { typedef shared_ptr shr_ptr; shr_ptr clone() const { shr_ptr s(new Sighandlers()); // NB: depends on the fact that Sighandler is for all // intents and purposes a POD type, though not // technically. for (size_t i = 0; i < array_length(handlers); ++i) { s->handlers[i] = handlers[i]; } return s; } Sighandler& get(int sig) { assert_valid(sig); return handlers[sig]; } const Sighandler& get(int sig) const { assert_valid(sig); return handlers[sig]; } void init_from_current_process() { for (size_t i = 1; i < array_length(handlers); ++i) { Sighandler& h = handlers[i]; NativeArch::kernel_sigaction sa; if (::syscall(SYS_rt_sigaction, i, nullptr, &sa, sizeof(uint64_t))) { /* EINVAL means we're querying an * unused signal number. */ DEBUG_ASSERT(EINVAL == errno); continue; } msan_unpoison(&sa, sizeof(NativeArch::kernel_sigaction)); h.init_arch(sa); } } /** * For each signal in |table| such that is_user_handler() is * true, reset the disposition of that signal to SIG_DFL, and * clear the resethand flag if it's set. SIG_IGN signals are * not modified. * * (After an exec() call copies the original sighandler table, * this is the operation required by POSIX to initialize that * table copy.) */ void reset_user_handlers(SupportedArch arch) { for (int i = 0; i < ssize_t(array_length(handlers)); ++i) { Sighandler& h = handlers[i]; // If the handler was a user handler, reset to // default. If it was SIG_IGN or SIG_DFL, // leave it alone. 
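// (Editor's note: this matches POSIX execve() semantics -- for example, a
// shell that set SIGINT to SIG_IGN before exec stays ignored in the new
// image, while a custom SIGINT handler must revert to SIG_DFL because the
// handler code no longer exists in the new address space.)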
if (h.disposition() == SIGNAL_HANDLER) { reset_handler(&h, arch); } } } void assert_valid(int sig) const { DEBUG_ASSERT(0 < sig && sig < ssize_t(array_length(handlers))); } static shr_ptr create() { return shr_ptr(new Sighandlers()); } Sighandler handlers[_NSIG]; private: Sighandlers() {} Sighandlers(const Sighandlers&); Sighandlers operator=(const Sighandlers&); }; RecordTask::RecordTask(RecordSession& session, pid_t _tid, uint32_t serial, SupportedArch a) : Task(session, _tid, _tid, serial, a), ticks_at_last_recorded_syscall_exit(0), ip_at_last_recorded_syscall_exit(nullptr), time_at_start_of_last_timeslice(0), priority(0), in_round_robin_queue(false), stable_exit(false), detached_proxy(false), emulated_ptracer(nullptr), emulated_ptrace_event_msg(0), emulated_ptrace_options(0), emulated_ptrace_cont_command(0), emulated_stop_pending(false), emulated_ptrace_SIGCHLD_pending(false), emulated_SIGCHLD_pending(false), emulated_ptrace_seized(false), in_wait_type(WAIT_TYPE_NONE), in_wait_pid(0), emulated_stop_type(NOT_STOPPED), blocked_sigs_dirty(true), syscallbuf_blocked_sigs_generation(0), flushed_num_rec_bytes(0), flushed_syscallbuf(false), delay_syscallbuf_reset_for_desched(false), delay_syscallbuf_reset_for_seccomp_trap(false), prctl_seccomp_status(0), robust_futex_list_len(0), termination_signal(0), tsc_mode(PR_TSC_ENABLE), cpuid_mode(1), stashed_signals_blocking_more_signals(false), stashed_group_stop(false), break_at_syscallbuf_traced_syscalls(false), break_at_syscallbuf_untraced_syscalls(false), break_at_syscallbuf_final_instruction(false), next_pmc_interrupt_is_for_user(false), did_record_robust_futex_changes(false), waiting_for_reap(false), waiting_for_zombie(false), waiting_for_ptrace_exit(false), retry_syscall_patching(false), sent_shutdown_kill(false), tick_request_override((TicksRequest)0) { push_event(Event::sentinel()); if (session.tasks().empty()) { // Initial tracee. It inherited its state from this process, so set it up. // The very first task we fork inherits the signal // dispositions of the current OS process (which should all be // default at this point, but ...). From there on, new tasks // will transitively inherit from this first task. auto sh = Sighandlers::create(); sh->init_from_current_process(); sighandlers.swap(sh); own_namespace_rec_tid = _tid; } } RecordTask::~RecordTask() { if (emulated_ptracer) { emulated_ptracer->emulated_ptrace_tracees.erase(this); if (emulated_ptrace_options & PTRACE_O_TRACEEXIT) { ASSERT(this, stable_exit) << "PTRACE_O_TRACEEXIT only supported for stable exits for now"; } } for (RecordTask* t : emulated_ptrace_tracees) { // XXX emulate PTRACE_O_EXITKILL ASSERT(this, t->emulated_ptracer == this); t->emulated_ptracer = nullptr; t->emulated_ptrace_options = 0; t->emulated_stop_pending = false; t->emulated_stop_type = NOT_STOPPED; } // We expect tasks to usually exit by a call to exit() or // exit_group(), so it's not helpful to warn about that. if (EV_SENTINEL != ev().type() && (pending_events.size() > 2 || !(ev().type() == EV_SYSCALL && (is_exit_syscall(ev().Syscall().number, ev().Syscall().regs.arch()) || is_exit_group_syscall(ev().Syscall().number, ev().Syscall().regs.arch()))))) { LOG(info) << tid << " still has pending events. From top down:"; log_pending_events(); } if (detached_proxy) { // We kept the zombie of the orginal task around to prevent its pid from // being re-used. Reap that now. 
proceed_to_exit(); if (!already_reaped() && may_reap()) { reap(); } did_kill(); } } void RecordTask::record_exit_event(int fatal_signo, WriteChildTid write_child_tid) { // The kernel explicitly only clears the futex if the address space is shared. // If the address space has no other users then the futex will not be cleared // even if it lives in shared memory which other tasks can read. // If however, the exit was the result of a fatal, core-dump signal, the futex // is not cleared (both to preserve the coredump and because any other users // of the same address space were also shot down). if (!is_coredumping_signal(fatal_signo) && !tid_futex.is_null() && as->task_set().size() > 1 && as->has_mapping(tid_futex)) { int val = 0; record_local(tid_futex, &val); if (write_child_tid == WRITE_CHILD_TID) { // Write the memory now, otherwise the kernel will write it later and that can // race with the execution of other threads if we don't wait for this // thread to fully exit. // This could fail since the address space might have gone away/been switched // by execve. bool ok = true; write_mem(tid_futex, 0, &ok); // The kernel will do an unconditional futex wake on that location so we don't // need to do it. } } // Write the exit event here so that the value recorded above is captured. // Don't flush syscallbuf. Whatever triggered the exit (syscall, signal) // should already have flushed it, if it was running. If it was blocked, // then the syscallbuf would already have been flushed too. Trying to flush // syscallbuf for an exiting task could be bad, // e.g. it could be in the middle of syscallbuf code that's supposed to be // atomic. For the same reasons don't allow syscallbuf to be reset here. record_event(Event::exit(), DONT_FLUSH_SYSCALLBUF, DONT_RESET_SYSCALLBUF); } RecordSession& RecordTask::session() const { return *Task::session().as_record(); } TraceWriter& RecordTask::trace_writer() const { return session().trace_writer(); } Task* RecordTask::clone(CloneReason reason, int flags, remote_ptr stack, remote_ptr tls, remote_ptr cleartid_addr, pid_t new_tid, pid_t new_rec_tid, uint32_t new_serial, Session* other_session, FdTable::shr_ptr new_fds, ThreadGroup::shr_ptr new_tg) { ASSERT(this, reason == Task::TRACEE_CLONE); ASSERT(this, !new_fds); ASSERT(this, !new_tg); Task* t = Task::clone(reason, flags, stack, tls, cleartid_addr, new_tid, new_rec_tid, new_serial, other_session, new_fds, new_tg); if (t->session().is_recording()) { RecordTask* rt = static_cast(t); if (CLONE_CLEARTID & flags) { LOG(debug) << "cleartid futex is " << cleartid_addr; ASSERT(this, !cleartid_addr.is_null()); rt->tid_futex = cleartid_addr; } else { LOG(debug) << "(clone child not enabling CLEARTID)"; } } return t; } void RecordTask::post_wait_clone(Task* cloned_from, int flags) { ASSERT(cloned_from, cloned_from->session().is_recording()); Task::post_wait_clone(cloned_from, flags); RecordTask* rt = static_cast(cloned_from); priority = rt->priority; syscallbuf_code_layout = rt->syscallbuf_code_layout; prctl_seccomp_status = rt->prctl_seccomp_status; robust_futex_list = rt->robust_futex_list; robust_futex_list_len = rt->robust_futex_list_len; tsc_mode = rt->tsc_mode; cpuid_mode = rt->cpuid_mode; if (CLONE_SHARE_SIGHANDLERS & flags) { sighandlers = rt->sighandlers; } else { auto sh = rt->sighandlers->clone(); sighandlers.swap(sh); } update_own_namespace_tid(); } void RecordTask::post_exec() { // Change syscall number to execve *for the new arch*. 
// If we don't do this, and the arch changes, then the syscall number for
// execve in the old arch is treated as the syscall we're executing in the
// new arch, with hilarious results.
int syscallno = syscall_number_for_execve(arch());
registers.set_original_syscallno(syscallno);
// Fix event architecture and syscall number
ev().Syscall().number = syscallno;
ev().Syscall().set_arch(arch());

// The signal mask is inherited across execve so we don't need to invalidate.

Task::post_exec(this->exe_path());
if (emulated_ptracer) {
  ASSERT(this, !(emulated_ptracer->arch() == x86 && arch() == x86_64))
      << "We don't support a 32-bit process tracing a 64-bit process";
}

// Clear robust_list state to match kernel state. If this task is cloned
// soon after exec, we must not do a bogus set_robust_list syscall for
// the clone.
set_robust_list(nullptr, 0);
sighandlers = sighandlers->clone();
sighandlers->reset_user_handlers(arch());

// Newly execed tasks always have non-faulting mode (from their point of
// view, even if rr is secretly causing faults).
cpuid_mode = 1;
}

template <typename Arch> static void do_preload_init_arch(RecordTask* t) {
  auto params = t->read_mem(
      remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().arg1()));
  t->syscallbuf_code_layout.syscallbuf_final_exit_instruction =
      params.syscallbuf_final_exit_instruction.rptr().as_int();
  t->syscallbuf_code_layout.syscallbuf_code_start =
      params.syscallbuf_code_start.rptr().as_int();
  t->syscallbuf_code_layout.syscallbuf_code_end =
      params.syscallbuf_code_end.rptr().as_int();
  t->syscallbuf_code_layout.get_pc_thunks_start =
      params.get_pc_thunks_start.rptr().as_int();
  t->syscallbuf_code_layout.get_pc_thunks_end =
      params.get_pc_thunks_end.rptr().as_int();
  unsigned char in_chaos = t->session().enable_chaos();
  auto in_chaos_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), in_chaos);
  t->write_mem(in_chaos_ptr, in_chaos);
  t->record_local(in_chaos_ptr, &in_chaos);
  int cores = t->session().scheduler().pretend_num_cores();
  auto cores_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), pretend_num_cores);
  t->write_mem(cores_ptr, cores);
  t->record_local(cores_ptr, &cores);
  auto desched_sig = t->session().syscallbuf_desched_sig();
  auto desched_sig_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), desched_sig);
  t->write_mem(desched_sig_ptr, desched_sig);
  t->record_local(desched_sig_ptr, &desched_sig);
  uint64_t random_seed;
  do {
    random_seed = rand() | (uint64_t(rand()) << 32);
  } while (!random_seed);
  auto random_seed_ptr = REMOTE_PTR_FIELD(params.globals.rptr(), random_seed);
  t->write_mem(random_seed_ptr, random_seed);
  t->record_local(random_seed_ptr, &random_seed);
}

void RecordTask::push_syscall_event(int syscallno) {
  push_event(SyscallEvent(syscallno, detect_syscall_arch()));
}

static void do_preload_init(RecordTask* t) {
  RR_ARCH_FUNCTION(do_preload_init_arch, t->arch(), t);
}

void RecordTask::at_preload_init() {
  Task::at_preload_init();
  do_preload_init(this);
}

/**
 * Avoid using low-numbered file descriptors since that can confuse
 * developers.
 */
static int find_free_file_descriptor(pid_t for_tid) {
  int fd = 300 + (for_tid % 500);
  while (true) {
    char buf[PATH_MAX];
    sprintf(buf, "/proc/%d/fd/%d", for_tid, fd);
    if (access(buf, F_OK) == -1 && errno == ENOENT) {
      return fd;
    }
    ++fd;
  }
}

template <typename Arch> void RecordTask::init_buffers_arch() {
  // NB: the tracee can't be interrupted with a signal while
  // we're processing the rrcall, because it's masked off all
  // signals.
  AutoRemoteSyscalls remote(this);

  // Arguments to the rrcall.
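// (Editor's note: the tracee's preload library invokes the
// rrcall_init_buffers pseudo-syscall with arg1 pointing at a params struct
// in tracee memory; the code below reads it, fills in the buffer address
// and fds, and writes it back, so the struct behaves as an in/out
// parameter block.)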
  remote_ptr<rrcall_init_buffers_params<Arch>> child_args = regs().arg1();
  auto args = read_mem(child_args);
  args.cloned_file_data_fd = -1;
  if (as->syscallbuf_enabled()) {
    args.syscallbuf_size = syscallbuf_size = session().syscall_buffer_size();
    KernelMapping syscallbuf_km = init_syscall_buffer(remote, nullptr);
    args.syscallbuf_ptr = syscallbuf_child;
    desched_fd_child = args.desched_counter_fd;
    // Prevent the child from closing this fd
    fds->add_monitor(this, desched_fd_child, new PreserveFileMonitor());
    desched_fd = remote.retrieve_fd(desched_fd_child);

    auto record_in_trace = trace_writer().write_mapped_region(
        this, syscallbuf_km, syscallbuf_km.fake_stat(), syscallbuf_km.fsname(),
        vector<TraceRemoteFd>(), TraceWriter::RR_BUFFER_MAPPING);
    ASSERT(this, record_in_trace == TraceWriter::DONT_RECORD_IN_TRACE);

    if (trace_writer().supports_file_data_cloning() &&
        session().use_read_cloning()) {
      cloned_file_data_fname = trace_writer().file_data_clone_file_name(tuid());
      ScopedFd clone_file(cloned_file_data_fname.c_str(), O_RDWR | O_CREAT,
                          0600);
      int cloned_file_data = remote.send_fd(clone_file.get());
      ASSERT(this, cloned_file_data >= 0);
      int free_fd = find_free_file_descriptor(tid);
      cloned_file_data_fd_child =
          remote.syscall(syscall_number_for_dup3(arch()), cloned_file_data,
                         free_fd, O_CLOEXEC);
      if (cloned_file_data_fd_child != free_fd) {
        ASSERT(this, cloned_file_data_fd_child < 0);
        LOG(warn) << "Couldn't dup clone-data file to free fd";
        cloned_file_data_fd_child = cloned_file_data;
      } else {
        // Prevent the child from closing this fd. We're going to close it
        // ourselves and we don't want the child closing it and then reopening
        // its own file with this fd.
        fds->add_monitor(this, cloned_file_data_fd_child,
                         new PreserveFileMonitor());
        remote.infallible_syscall(syscall_number_for_close(arch()),
                                  cloned_file_data);
      }
      args.cloned_file_data_fd = cloned_file_data_fd_child;
    }
  } else {
    args.syscallbuf_ptr = remote_ptr<void>(nullptr);
    args.syscallbuf_size = 0;
  }
  args.scratch_buf = scratch_ptr;
  args.usable_scratch_size = usable_scratch_size();

  // Return the mapped buffers to the child.
  write_mem(child_args, args);
  // The tracee doesn't need this addr returned, because it's
  // already written to the inout |args| param, but we stash it
  // away in the return value slot so that we can easily check
  // that we map the segment at the same addr during replay.
  remote.regs().set_syscall_result(syscallbuf_child);
}

void RecordTask::init_buffers() { RR_ARCH_FUNCTION(init_buffers_arch, arch()); }

template <typename Arch>
void RecordTask::on_syscall_exit_arch(int syscallno, const Registers& regs) {
  switch (syscallno) {
    // These syscalls affect the sigmask even if they fail.
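// (Editor's note: the p-variants below -- epoll_pwait, pselect6, ppoll --
// atomically install a caller-supplied sigmask for the duration of the call
// and restore it on the way out, so even a failed call has toggled the
// kernel's mask; hence the unconditional invalidation of rr's cached copy.)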
case Arch::epoll_pwait: case Arch::pselect6: case Arch::pselect6_time64: case Arch::ppoll: case Arch::ppoll_time64: invalidate_sigmask(); break; } if (regs.original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO || regs.syscall_failed()) { return; } switch (syscallno) { case Arch::set_robust_list: set_robust_list(regs.orig_arg1(), (size_t)regs.arg2()); return; case Arch::sigaction: case Arch::rt_sigaction: // TODO: SYS_signal update_sigaction(regs); return; case Arch::set_tid_address: set_tid_addr(regs.orig_arg1()); return; case Arch::sigsuspend: case Arch::rt_sigsuspend: case Arch::sigprocmask: case Arch::rt_sigprocmask: case Arch::pselect6: case Arch::pselect6_time64: case Arch::ppoll: case Arch::ppoll_time64: invalidate_sigmask(); return; } } void RecordTask::on_syscall_exit(int syscallno, SupportedArch arch, const Registers& regs) { with_converted_registers(regs, arch, [&](const Registers& regs) { Task::on_syscall_exit(syscallno, arch, regs); RR_ARCH_FUNCTION(on_syscall_exit_arch, arch, syscallno, regs) }); } bool RecordTask::is_at_syscallbuf_syscall_entry_breakpoint() { auto i = ip().undo_executed_bkpt(arch()); for (auto p : syscallbuf_syscall_entry_breakpoints()) { if (i == p) { return true; } } return false; } bool RecordTask::is_at_syscallbuf_final_instruction_breakpoint() { if (!break_at_syscallbuf_final_instruction) { return false; } auto i = ip().undo_executed_bkpt(arch()); return i == syscallbuf_code_layout.syscallbuf_final_exit_instruction; } void RecordTask::will_resume_execution(ResumeRequest, WaitRequest, TicksRequest ticks_request, int sig) { // We may execute user code, which could lead to an RDTSC or grow-map // operation which unblocks SIGSEGV, and we'll need to know whether to // re-block it. So we need our cached sigmask to be up to date. // We don't need to this if we're not going to execute user code // (i.e. ticks_request == RESUME_NO_TICKS) except that did_wait can't // easily check for that and may restore blocked_sigs so it had better be // accurate. get_sigmask(); if (stashed_signals_blocking_more_signals) { // A stashed signal we have already accepted for this task may // have a sigaction::sa_mask that would block the next signal to be // delivered and cause it to be delivered to a different task. If we allow // such a signal to be delivered to this task then we run the risk of never // being able to process the signal (if it stays blocked indefinitely). // To prevent this, block any further signal delivery as long as there are // stashed signals. // We assume the kernel can't report a new signal of the same number // in response to us injecting a signal. XXX is this true??? We don't // have much choice, signal injection won't work if we block the signal. // We leave rr signals unblocked. TIME_SLICE_SIGNAL has to be unblocked // because blocking it seems to cause problems for some hardware/kernel // configurations (see https://github.com/rr-debugger/rr/issues/1979), // causing them to stop counting events. sig_set_t sigset = ~session().rr_signal_mask(); if (sig) { // We're injecting a signal, so make sure that signal is unblocked. 
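// (Editor's note: sig_set_t is a raw 64-bit mask in the kernel's layout;
// assuming signal_bit()'s usual definition of 1ull << (sig - 1), clearing
// that bit below is what leaves the injected signal deliverable.)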
sigset &= ~signal_bit(sig); } int ret = fallible_ptrace(PTRACE_SETSIGMASK, remote_ptr(8), &sigset); if (ret < 0) { if (errno == EIO) { FATAL() << "PTRACE_SETSIGMASK not supported; rr requires Linux kernel >= 3.11"; } ASSERT(this, errno == EINVAL); } else { LOG(debug) << "Set signal mask to block all signals (bar " << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " << " have a stashed signal"; } } // RESUME_NO_TICKS means that tracee code is not going to run so there's no // need to set breakpoints and in fact they might interfere with rr // processing. if (ticks_request != RESUME_NO_TICKS) { if (!at_may_restart_syscall()) { // If the tracee has SIGTRAP blocked or ignored and we hit one of these // breakpoints, the kernel will automatically unblock the signal and set // its disposition to DFL, effects which we ought to undo to keep these // SIGTRAPs invisible to tracees. Fixing the sigmask happens // automatically in did_wait(). Restoring the signal-ignored status is // handled in `handle_syscallbuf_breakpoint`. // Set breakpoints at untraced syscalls to catch us entering an untraced // syscall. We don't need to do this (and shouldn't do this) if the // execution requestor wants to stop inside untraced syscalls. // If we have an interrupted syscall that we may restart, don't // set the breakpoints because we should restart the syscall instead // of breaking and delivering signals. The syscallbuf code doesn't // (and must not) perform more than one blocking syscall for any given // buffered syscall. for (auto p : syscallbuf_syscall_entry_breakpoints()) { vm()->add_breakpoint(p, BKPT_INTERNAL); } } if (break_at_syscallbuf_final_instruction) { vm()->add_breakpoint( syscallbuf_code_layout.syscallbuf_final_exit_instruction, BKPT_INTERNAL); } } } vector RecordTask::syscallbuf_syscall_entry_breakpoints() { vector result; if (break_at_syscallbuf_untraced_syscalls) { result.push_back(AddressSpace::rr_page_syscall_entry_point( AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED, AddressSpace::RECORDING_ONLY, arch())); result.push_back(AddressSpace::rr_page_syscall_entry_point( AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED, AddressSpace::RECORDING_AND_REPLAY, arch())); } if (break_at_syscallbuf_traced_syscalls) { result.push_back(AddressSpace::rr_page_syscall_entry_point( AddressSpace::TRACED, AddressSpace::UNPRIVILEGED, AddressSpace::RECORDING_AND_REPLAY, arch())); } return result; } void RecordTask::did_wait() { for (auto p : syscallbuf_syscall_entry_breakpoints()) { vm()->remove_breakpoint(p, BKPT_INTERNAL); } if (break_at_syscallbuf_final_instruction) { vm()->remove_breakpoint( syscallbuf_code_layout.syscallbuf_final_exit_instruction, BKPT_INTERNAL); } if (stashed_signals_blocking_more_signals) { // Saved 'blocked_sigs' must still be correct regardless of syscallbuf // state, because we do not allow stashed_signals_blocking_more_signals // to hold across syscalls (traced or untraced) that change the signal mask. ASSERT(this, !blocked_sigs_dirty); xptrace(PTRACE_SETSIGMASK, remote_ptr(8), &blocked_sigs); } else if (syscallbuf_child) { // The syscallbuf struct is only 32 bytes currently so read the whole thing // at once to avoid multiple calls to read_mem. Even though this shouldn't // need a syscall because we use a local-mapping, apparently that lookup // is still noticeably expensive. auto syscallbuf = read_mem(syscallbuf_child); if (syscallbuf.in_sigprocmask_critical_section) { // |blocked_sigs| may have been updated but the syscall not yet issued. // Use the kernel's value. 
invalidate_sigmask(); } else { uint32_t syscallbuf_generation = syscallbuf.blocked_sigs_generation; if (syscallbuf_generation > syscallbuf_blocked_sigs_generation) { syscallbuf_blocked_sigs_generation = syscallbuf_generation; blocked_sigs = syscallbuf.blocked_sigs; } } } } void RecordTask::set_emulated_ptracer(RecordTask* tracer) { if (tracer) { ASSERT(this, !emulated_ptracer); emulated_ptracer = tracer; emulated_ptracer->emulated_ptrace_tracees.insert(this); } else { ASSERT(this, emulated_ptracer); ASSERT(this, emulated_stop_type == NOT_STOPPED || emulated_stop_type == GROUP_STOP); emulated_ptracer->emulated_ptrace_tracees.erase(this); emulated_ptracer = nullptr; } } bool RecordTask::emulate_ptrace_stop(WaitStatus status, const siginfo_t* siginfo, int si_code) { ASSERT(this, emulated_stop_type == NOT_STOPPED); if (!emulated_ptracer) { return false; } if (siginfo) { ASSERT(this, status.ptrace_signal() == siginfo->si_signo); save_ptrace_signal_siginfo(*siginfo); } else { siginfo_t si; memset(&si, 0, sizeof(si)); si.si_signo = status.ptrace_signal(); if (status.ptrace_event() || status.is_syscall()) { si.si_code = status.get() >> 8; } else { si.si_code = si_code; } save_ptrace_signal_siginfo(si); } force_emulate_ptrace_stop(status); return true; } void RecordTask::force_emulate_ptrace_stop(WaitStatus status) { emulated_stop_type = status.group_stop() ? GROUP_STOP : SIGNAL_DELIVERY_STOP; emulated_stop_code = status; emulated_stop_pending = true; emulated_ptrace_SIGCHLD_pending = true; emulated_ptracer->send_synthetic_SIGCHLD_if_necessary(); // The SIGCHLD will eventually be reported to rr via a ptrace stop, // interrupting wake_task's syscall (probably a waitpid) if necessary. At // that point, we'll fix up the siginfo data with values that match what // the kernel would have delivered for a real ptracer's SIGCHLD. When the // signal handler (if any) returns, if wake_task was in a blocking wait that // wait will be resumed, at which point rec_prepare_syscall_arch will // discover the pending ptrace result and emulate the wait syscall to // return that result immediately. } void RecordTask::do_ptrace_exit_stop(WaitStatus exit_status) { // Notify ptracer of the exit if it's not going to receive it from the // kernel because it's not the parent. (The kernel has similar logic to // deliver two stops in this case.) if (emulated_ptracer && (is_clone_child() || get_parent_pid() != emulated_ptracer->real_tgid())) { // The task is dead so treat it as not stopped so we can deliver a new stop emulated_stop_type = NOT_STOPPED; // This is a bit wrong; this is an exit stop, not a signal/ptrace stop. emulate_ptrace_stop(exit_status); } } void RecordTask::did_reach_zombie() { waiting_for_zombie = false; // Remove from address-space and fds list since we really aren't associated // with them anymore (and we can't be used to operate on them) as->erase_task(this); fds->erase_task(this); if (!already_reaped()) { if (may_reap()) { reap(); } else { waiting_for_reap = true; } } if ((already_reaped() || !waiting_for_reap) && !emulated_stop_pending) { delete this; } } void RecordTask::send_synthetic_SIGCHLD_if_necessary() { RecordTask* wake_task = nullptr; bool need_signal = false; for (RecordTask* tracee : emulated_ptrace_tracees) { if (tracee->emulated_ptrace_SIGCHLD_pending) { need_signal = true; // check to see if any thread in the ptracer process is in a waitpid that // could read the status of 'tracee'. If it is, we should wake up that // thread. Otherwise we send SIGCHLD to the ptracer thread. 
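// (Editor's sketch of the flow: a tracee ptrace-stop sets
// emulated_ptrace_SIGCHLD_pending; the scan below looks for a ptracer
// thread already blocked in a wait*() that covers the tracee, which is then
// woken directly by tid via rt_tgsigqueueinfo; if none is found, the
// fallback further down queues SIGCHLD to the whole thread group instead.)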
for (Task* t : thread_group()->task_set()) { auto rt = static_cast(t); if (rt->is_waiting_for_ptrace(tracee)) { wake_task = rt; break; } } if (wake_task) { break; } } } if (!need_signal) { for (ThreadGroup* child_tg : thread_group()->children()) { for (Task* child : child_tg->task_set()) { RecordTask* rchild = static_cast(child); if (rchild->emulated_SIGCHLD_pending) { need_signal = true; // check to see if any thread in the ptracer process is in a waitpid // that // could read the status of 'tracee'. If it is, we should wake up that // thread. Otherwise we send SIGCHLD to the ptracer thread. for (Task* t : thread_group()->task_set()) { auto rt = static_cast(t); if (rt->is_waiting_for(rchild)) { wake_task = rt; break; } } if (wake_task) { break; } } } } if (!need_signal) { return; } } // ptrace events trigger SIGCHLD in the ptracer's wake_task. // We can't set all the siginfo values to their correct values here, so // we'll patch this up when the signal is received. // If there's already a pending SIGCHLD, this signal will be ignored, // but at some point the pending SIGCHLD will be delivered and then // send_synthetic_SIGCHLD_if_necessary will be called again to deliver a new // SIGCHLD if necessary. siginfo_t si; memset(&si, 0, sizeof(si)); si.si_code = SI_QUEUE; si.si_value.sival_int = SIGCHLD_SYNTHETIC; int ret; if (wake_task) { LOG(debug) << "Sending synthetic SIGCHLD to tid " << wake_task->tid; // We must use the raw SYS_rt_tgsigqueueinfo syscall here to ensure the // signal is sent to the correct thread by tid. ret = syscall(SYS_rt_tgsigqueueinfo, wake_task->tgid(), wake_task->tid, SIGCHLD, &si); ASSERT(this, ret == 0); if (wake_task->is_sig_blocked(SIGCHLD)) { LOG(debug) << "SIGCHLD is blocked, kicking it out of the syscall"; // Just sending SIGCHLD won't wake it up. Send it a TIME_SLICE_SIGNAL // as well to make sure it exits a blocking syscall. We ensure those // can never be blocked. // We have to send a negative code here because only the kernel can set // positive codes. We set a magic number so we can recognize it // when received. si.si_code = SYNTHETIC_TIME_SLICE_SI_CODE; ret = syscall(SYS_rt_tgsigqueueinfo, wake_task->tgid(), wake_task->tid, PerfCounters::TIME_SLICE_SIGNAL, &si); ASSERT(this, ret == 0); } } else { // Send the signal to the process as a whole and let the kernel // decide which thread gets it. ret = syscall(SYS_rt_sigqueueinfo, tgid(), SIGCHLD, &si); ASSERT(this, ret == 0); LOG(debug) << "Sending synthetic SIGCHLD to pid " << tgid(); } } static bool is_synthetic_SIGCHLD(const siginfo_t& si) { return si.si_signo == SIGCHLD && si.si_value.sival_int == SIGCHLD_SYNTHETIC; } bool RecordTask::set_siginfo_for_synthetic_SIGCHLD(siginfo_t* si) { if (!is_synthetic_SIGCHLD(*si)) { return true; } if (is_syscall_restart() && EV_SYSCALL_INTERRUPTION == ev().type()) { int syscallno = regs().original_syscallno(); SupportedArch syscall_arch = ev().Syscall().arch(); if (is_waitpid_syscall(syscallno, syscall_arch) || is_waitid_syscall(syscallno, syscall_arch) || is_wait4_syscall(syscallno, syscall_arch)) { // Wait-like syscalls always check for notifications from waited-for processes // before they check for pending signals. So, if the tracee has a pending // notification that also generated a signal, the wait syscall will return // normally rather than returning with ERESTARTSYS etc. (The signal will // be dequeued and any handler run on the return to userspace, however.) 
// We need to emulate this by deferring our synthetic ptrace signal // until after the wait syscall has returned. LOG(debug) << "Deferring signal because we're in a wait"; // Return false to tell the caller to defer the signal and resume // the syscall. return false; } } for (RecordTask* tracee : emulated_ptrace_tracees) { if (tracee->emulated_ptrace_SIGCHLD_pending) { tracee->emulated_ptrace_SIGCHLD_pending = false; tracee->set_siginfo_for_waited_task( reinterpret_cast(si)); si->si_value.sival_int = 0; return true; } } for (ThreadGroup* child_tg : thread_group()->children()) { for (Task* child : child_tg->task_set()) { auto rchild = static_cast(child); if (rchild->emulated_SIGCHLD_pending) { rchild->emulated_SIGCHLD_pending = false; rchild->set_siginfo_for_waited_task( reinterpret_cast(si)); si->si_value.sival_int = 0; return true; } } } return true; } bool RecordTask::is_waiting_for_ptrace(RecordTask* t) { // This task's process must be a ptracer of t. if (!t->emulated_ptracer || t->emulated_ptracer->thread_group() != thread_group()) { return false; } // XXX need to check |options| to make sure this task is eligible!! switch (in_wait_type) { case WAIT_TYPE_NONE: return false; case WAIT_TYPE_ANY: return true; case WAIT_TYPE_SAME_PGID: return getpgid(t->tgid()) == getpgid(tgid()); case WAIT_TYPE_PGID: return getpgid(t->tgid()) == in_wait_pid; case WAIT_TYPE_PID: // When waiting for a ptracee, a specific pid is interpreted as the // exact tid. return t->tid == in_wait_pid; default: ASSERT(this, false); return false; } } bool RecordTask::is_waiting_for(RecordTask* t) { // t must be a child of this task. if (t->thread_group()->parent() != thread_group().get()) { return false; } switch (in_wait_type) { case WAIT_TYPE_NONE: return false; case WAIT_TYPE_ANY: return true; case WAIT_TYPE_SAME_PGID: return getpgid(t->tgid()) == getpgid(tgid()); case WAIT_TYPE_PGID: return getpgid(t->tgid()) == in_wait_pid; case WAIT_TYPE_PID: return t->tgid() == in_wait_pid; default: ASSERT(this, false); return false; } } void RecordTask::save_ptrace_signal_siginfo(const siginfo_t& si) { for (auto it = saved_ptrace_siginfos.begin(); it != saved_ptrace_siginfos.end(); ++it) { if (it->si_signo == si.si_signo) { saved_ptrace_siginfos.erase(it); break; } } saved_ptrace_siginfos.push_back(si); } siginfo_t& RecordTask::get_saved_ptrace_siginfo() { int sig = emulated_stop_code.ptrace_signal(); ASSERT(this, sig > 0); for (auto it = saved_ptrace_siginfos.begin(); it != saved_ptrace_siginfos.end(); ++it) { if (it->si_signo == sig) { return *it; } } ASSERT(this, false) << "No saved siginfo found for stop-signal???"; while (true) { // Avoid having to return anything along this (unreachable) path } } siginfo_t RecordTask::take_ptrace_signal_siginfo(int sig) { for (auto it = saved_ptrace_siginfos.begin(); it != saved_ptrace_siginfos.end(); ++it) { if (it->si_signo == sig) { siginfo_t si = *it; saved_ptrace_siginfos.erase(it); return si; } } siginfo_t si; memset(&si, 0, sizeof(si)); si.si_signo = sig; return si; } static pid_t get_ppid(pid_t pid) { auto ppid_str = read_proc_status_fields(pid, "PPid"); if (ppid_str.empty()) { return -1; } char* end; int actual_ppid = strtol(ppid_str[0].c_str(), &end, 10); return *end ? 
-1 : actual_ppid; } void RecordTask::apply_group_stop(int sig) { if (emulated_stop_type == NOT_STOPPED) { LOG(debug) << "setting " << tid << " to GROUP_STOP due to signal " << sig; WaitStatus status = WaitStatus::for_group_sig(sig, this); if (!emulate_ptrace_stop(status)) { emulated_stop_type = GROUP_STOP; emulated_stop_code = status; emulated_stop_pending = true; emulated_SIGCHLD_pending = true; RecordTask* t = session().find_task(get_ppid(tid)); if (t) { t->send_synthetic_SIGCHLD_if_necessary(); } } } } bool RecordTask::is_signal_pending(int sig) { auto pending_strs = read_proc_status_fields(tid, "SigPnd", "ShdPnd"); if (pending_strs.size() < 2) { return false; } char* end1; sig_set_t mask1 = strtoull(pending_strs[0].c_str(), &end1, 16); char* end2; sig_set_t mask2 = strtoull(pending_strs[1].c_str(), &end2, 16); return !*end1 && !*end2 && ((mask1 | mask2) & signal_bit(sig)); } bool RecordTask::has_any_actionable_signal() { auto sig_strs = read_proc_status_fields(tid, "SigPnd", "ShdPnd", "SigBlk"); if (sig_strs.size() < 3) { return false; } char* end1; uint64_t mask1 = strtoull(sig_strs[0].c_str(), &end1, 16); char* end2; uint64_t mask2 = strtoull(sig_strs[1].c_str(), &end2, 16); char* end3; uint64_t mask_blk = strtoull(sig_strs[2].c_str(), &end3, 16); return !*end1 && !*end2 && !*end3 && ((mask1 | mask2) & ~mask_blk); } void RecordTask::emulate_SIGCONT() { // All threads in the process are resumed. for (Task* t : thread_group()->task_set()) { auto rt = static_cast(t); LOG(debug) << "setting " << tid << " to NOT_STOPPED due to SIGCONT"; rt->clear_stashed_group_stop(); rt->emulated_stop_pending = false; rt->emulated_stop_type = NOT_STOPPED; } } void RecordTask::signal_delivered(int sig) { Sighandler& h = sighandlers->get(sig); if (h.resethand) { reset_handler(&h, arch()); } if (!is_sig_ignored(sig)) { switch (sig) { case SIGTSTP: case SIGTTIN: case SIGTTOU: if (h.disposition() == SIGNAL_HANDLER) { break; } RR_FALLTHROUGH; case SIGSTOP: // All threads in the process are stopped. 
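// (Editor's note: rr emulates the group stop itself instead of letting the
// kernel freeze the threads -- presumably because kernel-stopped tracees
// could not be scheduled or recorded -- so apply_group_stop() below marks
// each thread GROUP_STOP and queues the parent's synthetic SIGCHLD.)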
for (Task* t : thread_group()->task_set()) { auto rt = static_cast(t); rt->apply_group_stop(sig); } break; case SIGCONT: emulate_SIGCONT(); break; } } send_synthetic_SIGCHLD_if_necessary(); } bool RecordTask::signal_has_user_handler(int sig) const { return sighandlers->get(sig).disposition() == SIGNAL_HANDLER; } remote_code_ptr RecordTask::get_signal_user_handler(int sig) const { return sighandlers->get(sig).get_user_handler(); } const vector& RecordTask::signal_action(int sig) const { return sighandlers->get(sig).sa; } bool RecordTask::signal_handler_takes_siginfo(int sig) const { return sighandlers->get(sig).takes_siginfo; } static bool is_unstoppable_signal(int sig) { return sig == SIGSTOP || sig == SIGKILL; } bool RecordTask::is_sig_blocked(int sig) { if (is_unstoppable_signal(sig)) { // These can never be blocked return false; } int sig_bit = sig - 1; return (get_sigmask() >> sig_bit) & 1; } bool RecordTask::is_sig_ignored(int sig) const { if (is_unstoppable_signal(sig)) { // These can never be ignored return false; } switch (sighandlers->get(sig).disposition()) { case SIGNAL_IGNORE: return true; case SIGNAL_DEFAULT: return IGNORE == default_action(sig); default: return false; } } SignalDisposition RecordTask::sig_disposition(int sig) const { return sighandlers->get(sig).disposition(); } SignalResolvedDisposition RecordTask::sig_resolved_disposition( int sig, SignalDeterministic deterministic) { if (is_fatal_signal(sig, deterministic)) { return DISPOSITION_FATAL; } if (signal_has_user_handler(sig) && !is_sig_blocked(sig)) { return DISPOSITION_USER_HANDLER; } return DISPOSITION_IGNORED; } void RecordTask::set_siginfo(const siginfo_t& si) { pending_siginfo = si; ptrace_if_alive(PTRACE_SETSIGINFO, nullptr, (void*)&si); } template void RecordTask::update_sigaction_arch(const Registers& regs) { int sig = regs.orig_arg1_signed(); remote_ptr new_sigaction = regs.arg2(); if (0 == regs.syscall_result() && !new_sigaction.is_null()) { // A new sighandler was installed. Update our // sighandler table. // TODO: discard attempts to handle or ignore signals // that can't be by POSIX typename Arch::kernel_sigaction sa; memset(&sa, 0, sizeof(sa)); read_bytes_helper(new_sigaction, sizeof(sa), &sa); sighandlers->get(sig).init_arch(sa); } } void RecordTask::update_sigaction(const Registers& regs) { RR_ARCH_FUNCTION(update_sigaction_arch, regs.arch(), regs); } sig_set_t RecordTask::read_sigmask_from_process() { // During syscall interruptions, PTRACE_GETSIGMASK may return the sigmask that is going // to be restored, not the kernel's current (internal) sigmask, which is what // /proc/.../status reports. Always go with what /proc/.../status reports. See // https://github.com/torvalds/linux/commit/fcfc2aa0185f4a731d05a21e9f359968fdfd02e7 // XXXkhuey and yet that's not what we actually do here ... if (at_interrupted_non_restartable_signal_modifying_syscall()) { // Mark the sigmask as already invalid. The moment we exit the kernel and run more // of the tracee the sigmask will change, so we need to keep refetching the // sigmask until that happens. invalidate_sigmask(); } else if (!at_may_restart_syscall()) { sig_set_t mask; long ret = fallible_ptrace(PTRACE_GETSIGMASK, remote_ptr(sizeof(sig_set_t)), &mask); if (ret >= 0) { return mask; } } auto results = read_proc_status_fields(tid, "SigBlk"); ASSERT(this, results.size() == 1); return strtoull(results[0].c_str(), NULL, 16); } sig_set_t RecordTask::get_sigmask() { if (blocked_sigs_dirty) { // Clear this first, read_sigmask_from_process might set it again. 
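// (Editor's note: the order is load-bearing -- read_sigmask_from_process()
// may itself call invalidate_sigmask() for interrupted sigmask-modifying
// syscalls, and clearing the dirty flag afterwards would silently discard
// that re-invalidation.)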
blocked_sigs_dirty = false; blocked_sigs = read_sigmask_from_process(); LOG(debug) << "Refreshed sigmask, now " << HEX(blocked_sigs); } return blocked_sigs; } void RecordTask::unblock_signal(int sig) { sig_set_t mask = get_sigmask(); mask &= ~signal_bit(sig); int ret = fallible_ptrace(PTRACE_SETSIGMASK, remote_ptr(8), &mask); if (ret < 0) { if (errno == EIO) { FATAL() << "PTRACE_SETSIGMASK not supported; rr requires Linux kernel >= 3.11"; } ASSERT(this, errno == EINVAL); } else { LOG(debug) << "Set signal mask to block all signals (bar " << "SYSCALLBUF_DESCHED_SIGNAL/TIME_SLICE_SIGNAL) while we " << " have a stashed signal"; } invalidate_sigmask(); } void RecordTask::set_sig_handler_default(int sig) { did_set_sig_handler_default(sig); // This could happen during a syscallbuf untraced syscall. In that case // our remote syscall here could trigger a desched signal if that event // is armed, making progress impossible. Disarm the event now. disarm_desched_event(this); AutoRemoteSyscalls remote(this); Sighandler& h = sighandlers->get(sig); AutoRestoreMem mem(remote, h.sa.data(), h.sa.size()); remote.infallible_syscall(syscall_number_for_rt_sigaction(arch()), sig, mem.get().as_int(), nullptr, sigaction_sigset_size(arch())); } void RecordTask::did_set_sig_handler_default(int sig) { Sighandler& h = sighandlers->get(sig); reset_handler(&h, arch()); } void RecordTask::verify_signal_states() { #ifndef DEBUG return; #endif if (ev().is_syscall_event()) { // If the syscall event is on the event stack with PROCESSING or EXITING // states, we won't have applied the signal-state updates yet while the // kernel may have. return; } if (detached_proxy) { // This task isn't real return; } auto results = read_proc_status_fields(tid, "SigBlk", "SigIgn", "SigCgt"); ASSERT(this, results.size() == 3); sig_set_t blocked = strtoull(results[0].c_str(), NULL, 16); sig_set_t ignored = strtoull(results[1].c_str(), NULL, 16); sig_set_t caught = strtoull(results[2].c_str(), NULL, 16); for (int sig = 1; sig < _NSIG; ++sig) { sig_set_t mask = signal_bit(sig); if (is_unstoppable_signal(sig)) { ASSERT(this, !(blocked & mask)) << "Expected " << signal_name(sig) << " to not be blocked, but it is"; ASSERT(this, !(ignored & mask)) << "Expected " << signal_name(sig) << " to not be ignored, but it is"; ASSERT(this, !(caught & mask)) << "Expected " << signal_name(sig) << " to not be caught, but it is"; } else { ASSERT(this, !!(blocked & mask) == is_sig_blocked(sig)) << signal_name(sig) << ((blocked & mask) ? " is blocked" : " is not blocked"); if (sig == SIGCHLD && is_container_init() && (ignored & mask)) { // pid-1-in-its-own-pid-namespace tasks can have their SIGCHLD set // to "ignore" when they die (in zap_pid_ns_processes). We may // not have observed anything relating to this death yet. We could // probe to ensure it's already marked as a zombie but why bother. // XXX arguably we should actually change our disposition here but // it would only matter in certain very weird cases: a vfork() where // the child process is pid-1 in its namespace. continue; } auto disposition = sighandlers->get(sig).disposition(); ASSERT(this, !!(ignored & mask) == (disposition == SIGNAL_IGNORE)) << signal_name(sig) << ((ignored & mask) ? " is ignored" : " is not ignored"); ASSERT(this, !!(caught & mask) == (disposition == SIGNAL_HANDLER)) << signal_name(sig) << ((caught & mask) ? 
" is caught" : " is not caught"); } } } void RecordTask::stash_sig() { int sig = stop_sig(); ASSERT(this, sig); // Callers should avoid passing SYSCALLBUF_DESCHED_SIGNAL in here. ASSERT(this, sig != session().syscallbuf_desched_sig()); // multiple non-RT signals coalesce if (sig < SIGRTMIN) { for (auto it = stashed_signals.begin(); it != stashed_signals.end(); ++it) { if (it->siginfo.si_signo == sig) { LOG(debug) << "discarding stashed signal " << sig << " since we already have one pending"; return; } } } const siginfo_t& si = get_siginfo(); stashed_signals.push_back(StashedSignal(si, is_deterministic_signal(this))); // Once we've stashed a signal, stop at the next traced/untraced syscall to // check whether we need to process the signal before it runs. stashed_signals_blocking_more_signals = break_at_syscallbuf_final_instruction = break_at_syscallbuf_traced_syscalls = break_at_syscallbuf_untraced_syscalls = true; } void RecordTask::stash_synthetic_sig(const siginfo_t& si, SignalDeterministic deterministic) { int sig = si.si_signo; DEBUG_ASSERT(sig); // Callers should avoid passing SYSCALLBUF_DESCHED_SIGNAL in here. DEBUG_ASSERT(sig != session().syscallbuf_desched_sig()); // multiple non-RT signals coalesce if (sig < SIGRTMIN) { for (auto it = stashed_signals.begin(); it != stashed_signals.end(); ++it) { if (it->siginfo.si_signo == sig) { if (deterministic == DETERMINISTIC_SIG && it->deterministic == NONDETERMINISTIC_SIG) { stashed_signals.erase(it); break; } else { LOG(debug) << "discarding stashed signal " << sig << " since we already have one pending"; return; } } } } stashed_signals.insert(stashed_signals.begin(), StashedSignal(si, deterministic)); stashed_signals_blocking_more_signals = break_at_syscallbuf_final_instruction = break_at_syscallbuf_traced_syscalls = break_at_syscallbuf_untraced_syscalls = true; } bool RecordTask::has_stashed_sig(int sig) const { for (auto it = stashed_signals.begin(); it != stashed_signals.end(); ++it) { if (it->siginfo.si_signo == sig) { return true; } } return false; } const siginfo_t* RecordTask::stashed_sig_not_synthetic_SIGCHLD() const { for (auto it = stashed_signals.begin(); it != stashed_signals.end(); ++it) { if (!is_synthetic_SIGCHLD(it->siginfo)) { return &it->siginfo; } } return nullptr; } void RecordTask::pop_stash_sig(const StashedSignal* stashed) { for (auto it = stashed_signals.begin(); it != stashed_signals.end(); ++it) { if (&*it == stashed) { stashed_signals.erase(it); return; } } ASSERT(this, false) << "signal not found"; } void RecordTask::stashed_signal_processed() { break_at_syscallbuf_final_instruction = break_at_syscallbuf_traced_syscalls = break_at_syscallbuf_untraced_syscalls = stashed_signals_blocking_more_signals = has_stashed_sig(); } const RecordTask::StashedSignal* RecordTask::peek_stashed_sig_to_deliver() const { if (stashed_signals.empty()) { return nullptr; } // Choose the first non-synthetic-SIGCHLD signal so that if a syscall should // be interrupted, we'll interrupt it. for (auto& sig : stashed_signals) { if (!is_synthetic_SIGCHLD(sig.siginfo)) { return &sig; } } return &stashed_signals[0]; } bool RecordTask::is_syscall_restart() { if (EV_SYSCALL_INTERRUPTION != ev().type()) { return false; } int syscallno = regs().original_syscallno(); SupportedArch syscall_arch = ev().Syscall().arch(); string call_name = syscall_name(syscallno, syscall_arch); bool is_restart = false; LOG(debug) << " is syscall interruption of recorded " << ev() << "? 
(now " << call_name << ")"; /* It's possible for the tracee to resume after a sighandler * with a fresh syscall that happens to be the same as the one * that was interrupted. So we check here if the args are the * same. * * Of course, it's possible (but less likely) for the tracee * to incidentally resume with a fresh syscall that just * happens to have the same *arguments* too. But in that * case, we would usually set up scratch buffers etc the same * was as for the original interrupted syscall, so we just * save a step here. * * TODO: it's possible for arg structures to be mutated * between the original call and restarted call in such a way * that it might change the scratch allocation decisions. */ if (is_restart_syscall_syscall(syscallno, syscall_arch)) { is_restart = true; syscallno = ev().Syscall().number; LOG(debug) << " (SYS_restart_syscall)"; } if (ev().Syscall().number != syscallno) { LOG(debug) << " interrupted " << ev() << " != " << call_name; goto done; } { const Registers& old_regs = ev().Syscall().regs; if (!(old_regs.arg1() == regs().arg1() && old_regs.arg2() == regs().arg2() && old_regs.arg3() == regs().arg3() && old_regs.arg4() == regs().arg4() && old_regs.arg5() == regs().arg5() && old_regs.arg6() == regs().arg6())) { LOG(debug) << " regs different at interrupted " << call_name << ": " << old_regs << " vs " << regs(); goto done; } } is_restart = true; done: if (is_restart) { LOG(debug) << " restart of " << call_name; } return is_restart; } template static uint64_t read_ptr_arch(Task* t, remote_ptr p, bool* ok) { return t->read_mem(p.cast(), ok); } static uint64_t read_ptr(Task* t, remote_ptr p, bool* ok) { RR_ARCH_FUNCTION(read_ptr_arch, t->arch(), t, p, ok); } bool RecordTask::is_in_syscallbuf() { if (!as->syscallbuf_enabled()) { // Even if we're in the rr page, if syscallbuf isn't enabled then the // rr page is not being used by syscallbuf. return false; } remote_code_ptr p = ip(); if (is_in_rr_page() || (syscallbuf_code_layout.get_pc_thunks_start <= p && p < syscallbuf_code_layout.get_pc_thunks_end)) { // Look at the caller to see if we're in the syscallbuf or not. bool ok = true; uint64_t addr = read_ptr(this, regs().sp(), &ok); if (ok) { p = addr; } } return as->monkeypatcher().is_jump_stub_instruction(p) || (syscallbuf_code_layout.syscallbuf_code_start <= p && p < syscallbuf_code_layout.syscallbuf_code_end); } bool RecordTask::at_may_restart_syscall() const { ssize_t depth = pending_events.size(); const Event* prev_ev = depth > 2 ? &pending_events[depth - 2] : nullptr; return EV_SYSCALL_INTERRUPTION == ev().type() || (EV_SIGNAL_DELIVERY == ev().type() && prev_ev && EV_SYSCALL_INTERRUPTION == prev_ev->type()); } bool RecordTask::at_interrupted_non_restartable_signal_modifying_syscall() const { auto r = regs(); // XXXkhuey io_uring_enter (not yet supported) can do this too. 
return r.syscall_result_signed() == -EINTR && is_epoll_pwait_syscall(r.original_syscallno(), arch()); } bool RecordTask::is_arm_desched_event_syscall() { return is_desched_event_syscall() && PERF_EVENT_IOC_ENABLE == regs().arg2(); } bool RecordTask::is_disarm_desched_event_syscall() { return (is_desched_event_syscall() && PERF_EVENT_IOC_DISABLE == regs().arg2()); } bool RecordTask::may_be_blocked() const { return (EV_SYSCALL == ev().type() && PROCESSING_SYSCALL == ev().Syscall().state) || emulated_stop_type != NOT_STOPPED || (EV_SIGNAL_DELIVERY == ev().type() && DISPOSITION_FATAL == ev().Signal().disposition) || waiting_for_zombie || waiting_for_ptrace_exit; } bool RecordTask::maybe_in_spinlock() { return time_at_start_of_last_timeslice == session().trace_writer().time() && regs().matches(registers_at_start_of_last_timeslice); } remote_ptr RecordTask::desched_rec() const { return (ev().is_syscall_event() ? ev().Syscall().desched_rec : (EV_DESCHED == ev().type()) ? ev().Desched().rec : nullptr); } bool RecordTask::running_inside_desched() const { for (auto& e : pending_events) { if (e.type() == EV_DESCHED) { return e.Desched().rec != desched_rec(); } } return false; } int RecordTask::get_ptrace_eventmsg_seccomp_data() { unsigned long data = 0; // in theory we could hit an assertion failure if the tracee suffers // a SIGKILL before we get here. But the SIGKILL would have to be // precisely timed between the generation of a PTRACE_EVENT_FORK/CLONE/ // SYS_clone event, and us fetching the event message here. if (fallible_ptrace(PTRACE_GETEVENTMSG, nullptr, &data) < 0) { ASSERT(this, errno == ESRCH); return -1; } return data; } void RecordTask::record_local(remote_ptr addr, ssize_t num_bytes, const void* data) { maybe_flush_syscallbuf(); ASSERT(this, num_bytes >= 0); if (!addr) { return; } trace_writer().write_raw(rec_tid, data, num_bytes, addr); } bool RecordTask::record_remote_by_local_map(remote_ptr addr, size_t num_bytes) { if (uint8_t* local_addr = as->local_mapping(addr, num_bytes)) { record_local(addr, num_bytes, local_addr); return true; } return false; } void RecordTask::record_remote(remote_ptr addr, ssize_t num_bytes) { maybe_flush_syscallbuf(); ASSERT(this, num_bytes >= 0); if (!addr) { return; } if (record_remote_by_local_map(addr, num_bytes) != 0) { return; } auto buf = read_mem(addr.cast(), num_bytes); trace_writer().write_raw(rec_tid, buf.data(), num_bytes, addr); } void RecordTask::record_remote_writable(remote_ptr addr, ssize_t num_bytes) { ASSERT(this, num_bytes >= 0); remote_ptr p = addr; while (p < addr + num_bytes) { if (!as->has_mapping(p)) { break; } auto m = as->mapping_of(p); if (!(m.map.prot() & PROT_WRITE)) { break; } p = m.map.end(); } num_bytes = min(num_bytes, p - addr); record_remote(addr, num_bytes); } ssize_t RecordTask::record_remote_fallible(remote_ptr addr, uintptr_t num_bytes, const std::vector& holes) { auto hole_iter = holes.begin(); uintptr_t offset = 0; vector buf; while (offset < num_bytes) { if (hole_iter != holes.end() && hole_iter->offset == offset) { offset += hole_iter->size; ++hole_iter; continue; } uintptr_t bytes = min(uintptr_t(4*1024*1024), num_bytes - offset); if (hole_iter != holes.end()) { ASSERT(this, hole_iter->offset > offset); bytes = min(bytes, uintptr_t(hole_iter->offset) - offset); } if (record_remote_by_local_map(addr + offset, bytes)) { offset += bytes; continue; } if (addr) { buf.resize(bytes); ssize_t nread = read_bytes_fallible(addr + offset, bytes, buf.data()); if (nread <= 0) { break; } 
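// nread can be less than `bytes` if the tail of this chunk is unmapped;
// we record the readable prefix and the next iteration's read will fail
// at the same address, ending the recording there.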
trace_writer().write_raw_data(buf.data(), nread); offset += nread; } else { offset += bytes; } } trace_writer().write_raw_header(rec_tid, offset, addr, holes); return offset; } void RecordTask::record_remote_even_if_null(remote_ptr addr, ssize_t num_bytes) { maybe_flush_syscallbuf(); DEBUG_ASSERT(num_bytes >= 0); if (!addr) { trace_writer().write_raw(rec_tid, nullptr, 0, addr); return; } if (record_remote_by_local_map(addr, num_bytes) != 0) { return; } auto buf = read_mem(addr.cast(), num_bytes); trace_writer().write_raw(rec_tid, buf.data(), num_bytes, addr); } void RecordTask::pop_event(EventType expected_type) { ASSERT(this, pending_events.back().type() == expected_type); pending_events.pop_back(); } void RecordTask::log_pending_events() const { ssize_t depth = pending_events.size(); DEBUG_ASSERT(depth > 0); if (1 == depth) { LOG(info) << "(no pending events)"; return; } /* The event at depth 0 is the placeholder event, which isn't * useful to log. Skip it. */ for (auto it = pending_events.rbegin(); it != pending_events.rend(); ++it) { LOG(info) << *it; } } void RecordTask::maybe_flush_syscallbuf() { if (EV_SYSCALLBUF_FLUSH == ev().type()) { // Already flushing. return; } if (!syscallbuf_child) { return; } // This can be called while the task is not stopped, when we prematurely // terminate the trace. In that case, the tracee could be concurrently // modifying the header. We'll take a snapshot of the header now. // The syscallbuf code ensures that writes to syscallbuf records // complete before num_rec_bytes is incremented. struct syscallbuf_hdr hdr = read_mem(syscallbuf_child); ASSERT(this, !flushed_syscallbuf || flushed_num_rec_bytes == hdr.num_rec_bytes); if (!hdr.num_rec_bytes || flushed_syscallbuf) { // no records, or we've already flushed. return; } push_event(Event(SyscallbufFlushEvent())); // Apply buffered mprotect operations and flush the buffer in the tracee. if (hdr.mprotect_record_count) { auto& records = ev().SyscallbufFlush().mprotect_records; records = read_mem(REMOTE_PTR_FIELD(preload_globals, mprotect_records[0]), hdr.mprotect_record_count); for (auto& r : records) { as->protect(this, r.start, r.size, r.prot); } } // Write the entire buffer in one shot without parsing it, // because replay will take care of that. if (is_running()) { vector buf; buf.resize(sizeof(hdr) + hdr.num_rec_bytes); memcpy(buf.data(), &hdr, sizeof(hdr)); read_bytes_helper(syscallbuf_child + 1, hdr.num_rec_bytes, buf.data() + sizeof(hdr)); record_local(syscallbuf_child, buf.size(), buf.data()); } else { record_remote(syscallbuf_child, syscallbuf_data_size()); } record_current_event(); pop_event(EV_SYSCALLBUF_FLUSH); flushed_syscallbuf = true; flushed_num_rec_bytes = hdr.num_rec_bytes; LOG(debug) << "Syscallbuf flushed with num_rec_bytes=" << (uint32_t)hdr.num_rec_bytes; } /** * If the syscallbuf has just been flushed, and resetting hasn't been * overridden with a delay request, then record the reset event for * replay. 
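 *
 * A rough sketch of how the delay flags below interact with this method:
 *
 *   t->delay_syscallbuf_reset_for_desched = true;  // desched in progress
 *   t->maybe_reset_syscallbuf();                   // no-op for now
 *   t->delay_syscallbuf_reset_for_desched = false; // desched handled
 *   t->maybe_reset_syscallbuf();                   // records the reset event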
*/ void RecordTask::maybe_reset_syscallbuf() { if (flushed_syscallbuf && !delay_syscallbuf_reset_for_desched && !delay_syscallbuf_reset_for_seccomp_trap) { flushed_syscallbuf = false; LOG(debug) << "Syscallbuf reset"; reset_syscallbuf(); syscallbuf_blocked_sigs_generation = 0; record_event(Event::syscallbuf_reset()); } } void RecordTask::record_event(const Event& ev, FlushSyscallbuf flush, AllowSyscallbufReset reset, const Registers* registers) { if (flush == FLUSH_SYSCALLBUF) { maybe_flush_syscallbuf(); } FrameTime current_time = trace_writer().time(); if (should_dump_memory(ev, current_time)) { dump_process_memory(this, current_time, "rec"); } if (should_checksum(ev, current_time)) { checksum_process_memory(this, current_time); } if (trace_writer().clear_fip_fdp()) { const ExtraRegisters* maybe_extra = extra_regs_fallible(); if (maybe_extra) { ExtraRegisters extra_registers = *maybe_extra; extra_registers.clear_fip_fdp(); set_extra_regs(extra_registers); } } const ExtraRegisters* extra_registers = nullptr; if (ev.record_regs()) { if (!registers) { registers = ®s(); } if (ev.record_extra_regs()) { extra_registers = &extra_regs(); } } if (ev.is_syscall_event() && ev.Syscall().state == EXITING_SYSCALL) { ticks_at_last_recorded_syscall_exit = tick_count(); ip_at_last_recorded_syscall_exit = registers->ip(); } trace_writer().write_frame(this, ev, registers, extra_registers); LOG(debug) << "Wrote event " << ev << " for time " << current_time; if (!ev.has_ticks_slop() && reset == ALLOW_RESET_SYSCALLBUF) { ASSERT(this, flush == FLUSH_SYSCALLBUF); // After we've output an event, it's safe to reset the syscallbuf (if not // explicitly delayed) since we will have exited the syscallbuf code that // consumed the syscallbuf data. // This only works if the event has a reliable tick count so when we // reach it, we're done. maybe_reset_syscallbuf(); } } bool RecordTask::is_fatal_signal(int sig, SignalDeterministic deterministic) const { if (thread_group()->received_sigframe_SIGSEGV) { // Can't be blocked, caught or ignored return true; } auto action = default_action(sig); if (action != DUMP_CORE && action != TERMINATE) { // If the default action doesn't kill the process, it won't die. return false; } if (is_sig_ignored(sig)) { // Deterministic fatal signals can't be ignored. return deterministic == DETERMINISTIC_SIG; } // If there's a signal handler, the signal won't be fatal. return !signal_has_user_handler(sig); } void RecordTask::record_current_event() { record_event(ev()); } pid_t RecordTask::find_newborn_thread() { ASSERT(this, session().is_recording()); ASSERT(this, ptrace_event() == PTRACE_EVENT_CLONE); pid_t hint = get_ptrace_eventmsg(); char path[PATH_MAX]; sprintf(path, "/proc/%d/task/%d", tid, hint); struct stat stat_buf; // This should always succeed, but may fail in old kernels due to // a kernel bug. See RecordSession::handle_ptrace_event. 
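// In outline, the lookup below is hint-first with a /proc scan fallback
// (a sketch of the code that follows, not additional behavior):
//
//   if (hint is unknown to the session && /proc/<tid>/task/<hint> exists)
//     return hint;
//   else scan /proc/<tid>/task for a tid the session doesn't know yet.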
if (!session().find_task(hint) && 0 == stat(path, &stat_buf)) { return hint; } sprintf(path, "/proc/%d/task", tid); DIR* dir = opendir(path); ASSERT(this, dir); while (true) { struct dirent* result = readdir(dir); ASSERT(this, result); char* end; pid_t thread_tid = strtol(result->d_name, &end, 10); if (*end == '\0' && !session().find_task(thread_tid)) { closedir(dir); return thread_tid; } } } pid_t RecordTask::find_newborn_process(pid_t child_parent) { ASSERT(this, session().is_recording()); ASSERT(this, ptrace_event() == PTRACE_EVENT_CLONE || ptrace_event() == PTRACE_EVENT_VFORK || ptrace_event() == PTRACE_EVENT_FORK); pid_t hint = get_ptrace_eventmsg(); // This should always succeed, but may fail in old kernels due to // a kernel bug. See RecordSession::handle_ptrace_event. if (!session().find_task(hint) && get_ppid(hint) == child_parent) { return hint; } DIR* dir = opendir("/proc"); ASSERT(this, dir); while (true) { struct dirent* result = readdir(dir); ASSERT(this, result); char* end; pid_t proc_tid = strtol(result->d_name, &end, 10); if (*end == '\0' && !session().find_task(proc_tid) && get_ppid(proc_tid) == child_parent) { closedir(dir); return proc_tid; } } } void RecordTask::set_tid_addr(remote_ptr tid_addr) { LOG(debug) << "updating cleartid futex to " << tid_addr; tid_futex = tid_addr; } void RecordTask::update_own_namespace_tid() { AutoRemoteSyscalls remote(this); own_namespace_rec_tid = remote.infallible_syscall_if_alive(syscall_number_for_gettid(arch())); if (own_namespace_rec_tid == -ESRCH) { own_namespace_rec_tid = -1; } } void RecordTask::kill_if_alive() { if (!is_dying()) { tgkill(SIGKILL); } } pid_t RecordTask::get_parent_pid() const { return get_ppid(tid); } void RecordTask::set_tid_and_update_serial(pid_t tid, pid_t own_namespace_tid) { hpc.set_tid(tid); this->tid = rec_tid = tid; serial = session().next_task_serial(); own_namespace_rec_tid = own_namespace_tid; } bool RecordTask::may_reap() { if (emulated_stop_pending) { LOG(debug) << "Declining to reap " << tid << "; emulated stop pending"; // Don't reap until the emulated ptrace stop has been processed. return false; } // Non thread-group-leaders may always be reaped if (tid != real_tgid()) { return true; } for (auto it : thread_group()->task_set()) { if (&*it != this) { LOG(debug) << "Declining to reap " << tid << "; leader of non-empty thread-group with active thread " << it->tid; return false; } } return true; } void RecordTask::reap() { ASSERT(this, !was_reaped); LOG(debug) << "Reaping " << tid; siginfo_t info; memset(&info, 0, sizeof(info)); int ret = waitid(P_PID, tid, &info, WEXITED | WNOHANG); if (ret != 0) { FATAL() << "Unexpected wait status for tid " << tid; } /* The sid_pid == 0 case here is the same as the case below where we're the * group leader whose pid gets stolen. */ DEBUG_ASSERT(info.si_pid == tid || info.si_pid == 0); was_reaped = true; } bool RecordTask::try_wait() { if (wait_unexpected_exit()) { return true; } // Check if there is a status change for us WaitStatus status; siginfo_t info; memset(&info, 0, sizeof(siginfo_t)); int ret = waitid(P_PID, tid, &info, WSTOPPED | WNOHANG); ASSERT(this, 0 == ret || (-1 == ret && errno == ECHILD)) << "waitid(" << tid << ", WSTOPPED | NOHANG) failed with " << ret; LOG(debug) << "waitid(" << tid << ", NOHANG) returns " << ret; if (ret == 0 && info.si_pid == 0) { return false; } if (ret == 0) { status = WaitStatus(info); } else if (ret == -1) { ASSERT(this, errno == ECHILD); // Either we died/are dying unexpectedly, or we were in exec and changed the tid. 
// Try to differentiate the two situations by seeing if there is an exit // notification ready for us to de-queue, in which case we synthesize an // exit event (but don't actually reap the task, instead leaving that // for the generic cleanup code). int ret = waitid(P_PID, tid, &info, WEXITED | WNOWAIT | WNOHANG); if (ret == 0) { if (info.si_pid == tid) { LOG(debug) << "Synthesizing PTRACE_EVENT_EXIT for zombie process in try_wait " << tid; status = WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT); } else { // This can happen when the task is in zap_pid_ns_processes waiting for all tasks // in the pid-namespace to exit. It's not in a signal stop, but it's also not // ready to be reaped yet, yet we're still tracing it. Don't wait on this // task, we should be able to reap it later. ASSERT(this, info.si_pid == 0); return false; } } else { ASSERT(this, ret == -1 && errno == ECHILD) << "waitpid failed with " << ret; return false; } } did_waitpid(status); return true; } static uint64_t read_pid_ns(const RecordTask* t) { char buf[PATH_MAX]; sprintf(buf, "/proc/%d/ns/pid", t->tid); char link[PATH_MAX]; int ret = readlink(buf, link, sizeof(link)); ASSERT(t, ret >= 0); ASSERT(t, ret < (int)sizeof(link)); link[ret] = 0; ASSERT(t, strncmp(link, "pid:[", 5) == 0); char* end; uint64_t result = strtoul(link + 5, &end, 10); ASSERT(t, strcmp(end, "]") == 0); return result; } bool RecordTask::waiting_for_pid_namespace_tasks_to_exit() const { if (tg->tgid_own_namespace != 1) { return false; } // This might be the last live thread for pid-1 in the pid namespace. // Checking that it *is* the last live thread is tricky because other // threads could unexpectedly die asynchronously :-(. // See if there are any other tasks in the pid namespace. // Note that due to setns there can be tasks in the pid namespace // with parents outside the pid namespace other than our thread-group. // If there are multiple threads in our threadgroup, they're in our // pid namespace. if (thread_group()->task_set().size() > 1) { return true; } // If we have any child processes then those belong to our pid namespace // (or a descendant). for (auto p : session().thread_group_map()) { if (p.second->parent() == tg.get()) { return true; } } // If there are any other tasks in the pid namespace at least one must be // directly in the namespace. uint64_t pid_ns = read_pid_ns(this); for (auto it : session().tasks()) { auto rt = static_cast(it.second); if (rt == this) { continue; } if (read_pid_ns(rt) == pid_ns) { return true; } } return false; } int RecordTask::process_depth() const { int depth = 0; ThreadGroup* tg = this->tg.get(); while (tg) { ++depth; tg = tg->parent(); } return depth; } template static void maybe_restore_original_syscall_registers_arch(RecordTask* t, void* local_addr) { if (!local_addr) { return; } auto locals = reinterpret_cast*>(local_addr); static_assert(sizeof(*locals) <= PRELOAD_THREAD_LOCALS_SIZE, "bad PRELOAD_THREAD_LOCALS_SIZE"); if (!locals->original_syscall_parameters) { return; } auto args = t->read_mem(locals->original_syscall_parameters.rptr()); Registers r = t->regs(); if (args.no != r.syscallno()) { // Maybe a preparatory syscall before the real syscall (e.g. 
sys_read) return; } r.set_arg1(args.args[0]); r.set_arg2(args.args[1]); r.set_arg3(args.args[2]); r.set_arg4(args.args[3]); r.set_arg5(args.args[4]); r.set_arg6(args.args[5]); t->set_regs(r); } void RecordTask::maybe_restore_original_syscall_registers() { RR_ARCH_FUNCTION(maybe_restore_original_syscall_registers_arch, arch(), this, preload_thread_locals()); } bool RecordTask::post_vm_clone(CloneReason reason, int flags, Task* origin) { if (Task::post_vm_clone(reason, flags, origin)) { KernelMapping preload_thread_locals_mapping = vm()->mapping_of(AddressSpace::preload_thread_locals_start()).map; auto mode = trace_writer().write_mapped_region( this, preload_thread_locals_mapping, preload_thread_locals_mapping.fake_stat(), preload_thread_locals_mapping.fsname(), vector(), TraceWriter::RR_BUFFER_MAPPING); ASSERT(this, mode == TraceWriter::DONT_RECORD_IN_TRACE); return true; } return false; }; } // namespace rr rr-5.5.0/src/RecordTask.h000066400000000000000000000703061412202446200151500ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #ifndef RR_RECORD_TASK_H_ #define RR_RECORD_TASK_H_ #include "Registers.h" #include "Task.h" #include "TraceFrame.h" namespace rr { struct Sighandlers; /** Different kinds of waits a task can do. */ enum WaitType { // Not waiting for anything WAIT_TYPE_NONE, // Waiting for any child process WAIT_TYPE_ANY, // Waiting for any child with the same process group ID WAIT_TYPE_SAME_PGID, // Waiting for any child with a specific process group ID WAIT_TYPE_PGID, // Waiting for a specific process ID WAIT_TYPE_PID }; /** Reasons why we simulate stopping of a task (see ptrace(2) man page). */ enum EmulatedStopType { NOT_STOPPED, GROUP_STOP, // stopped by a signal. This applies to non-ptracees too. SIGNAL_DELIVERY_STOP,// Stopped before delivering a signal. ptracees only. CHILD_STOP // All other kinds of non-ptrace stops }; /** * Pass USE_SYSGOOD to emulate_ptrace_stop to add 0x80 to the signal * if PTRACE_O_TRACESYSGOOD is in effect. */ enum AddSysgoodFlag { IGNORE_SYSGOOD, USE_SYSGOOD }; struct SyscallbufCodeLayout { remote_code_ptr syscallbuf_code_start; remote_code_ptr syscallbuf_code_end; remote_code_ptr get_pc_thunks_start; remote_code_ptr get_pc_thunks_end; remote_code_ptr syscallbuf_final_exit_instruction; }; enum SignalDisposition { SIGNAL_DEFAULT, SIGNAL_IGNORE, SIGNAL_HANDLER }; /** * Every Task owned by a RecordSession is a RecordTask. Functionality that * only applies during recording belongs here. */ class RecordTask : public Task { public: RecordTask(RecordSession& session, pid_t _tid, uint32_t serial, SupportedArch a); Task* clone(CloneReason reason, int flags, remote_ptr stack, remote_ptr tls, remote_ptr cleartid_addr, pid_t new_tid, pid_t new_rec_tid, uint32_t new_serial, Session* other_session = nullptr, FdTable::shr_ptr new_fds = nullptr, ThreadGroup::shr_ptr new_tg = nullptr) override; virtual void post_wait_clone(Task* cloned_from, int flags) override; virtual void on_syscall_exit(int syscallno, SupportedArch arch, const Registers& regs) override; virtual void will_resume_execution(ResumeRequest, WaitRequest, TicksRequest, int /*sig*/) override; virtual void did_wait() override; std::vector syscallbuf_syscall_entry_breakpoints(); bool is_at_syscallbuf_syscall_entry_breakpoint(); bool is_at_syscallbuf_final_instruction_breakpoint(); /** * Initialize tracee buffers in this, i.e., implement * RRCALL_init_syscall_buffer. This task must be at the point * of *exit from* the rrcall. 
Registers will be updated with * the return value from the rrcall, which is also returned * from this call. */ void init_buffers(); void post_exec(); /** * Called when SYS_rrcall_init_preload has happened. */ virtual void at_preload_init() override; RecordSession& session() const; TraceWriter& trace_writer() const; /** * Emulate 'tracer' ptracing this task. */ void set_emulated_ptracer(RecordTask* tracer); /** * Call this when an event occurs that should stop a ptraced task. * If we're emulating ptrace of the task, stop the task and wake the ptracer * if it's waiting, and queue "status" to be reported to the * ptracer. If siginfo is non-null, we'll report that siginfo, otherwise we'll * make one up based on the status (unless the status is an exit code). * Returns true if the task is stopped-for-emulated-ptrace, false otherwise. */ bool emulate_ptrace_stop(WaitStatus status, const siginfo_t* siginfo = nullptr, int si_code = 0); /** * Force the ptrace-stop state no matter what state the task is currently in. */ void force_emulate_ptrace_stop(WaitStatus status); /** * If necessary, signal the ptracer that this task has exited. */ void do_ptrace_exit_stop(WaitStatus exit_status); /** * Return the exit event. * If write_child_tid is set, zero out child_tid now if applicable. */ enum WriteChildTid { KERNEL_WRITES_CHILD_TID, WRITE_CHILD_TID, }; void record_exit_event(int exitsig = 0, WriteChildTid write_child_tid = KERNEL_WRITES_CHILD_TID); /** * Called when we're about to deliver a signal to this task. If it's a * synthetic SIGCHLD and there's a ptraced task that needs to SIGCHLD, * update the siginfo to reflect the status and note that that * ptraced task has had its SIGCHLD sent. * Note that we can't set the correct siginfo when we send the signal, because * it requires us to set information only the kernel has permission to set. * Returns false if this signal should be deferred. */ bool set_siginfo_for_synthetic_SIGCHLD(siginfo_t* si); /** * Sets up |si| as if we're delivering a SIGCHLD/waitid for this waited task. */ template void set_siginfo_for_waited_task(typename Arch::siginfo_t* si) { // XXX: The `ptrace` argument is likely incorrect here. emulated_stop_code.fill_siginfo(si, emulated_stop_type != GROUP_STOP, emulated_ptrace_options); si->_sifields._sigchld.si_pid_ = tgid(); si->_sifields._sigchld.si_uid_ = getuid(); } /** * Return a reference to the saved siginfo record for the stop-signal * that we're currently in a ptrace-stop for. */ siginfo_t& get_saved_ptrace_siginfo(); /** * When emulating a ptrace-continue with a signal number, extract the siginfo * that was saved by |save_ptrace_signal_siginfo|. If no such siginfo was * saved, make one up. */ siginfo_t take_ptrace_signal_siginfo(int sig); /** * Returns true if this task is in a waitpid or similar that would return * when t's status changes due to a ptrace event. */ bool is_waiting_for_ptrace(RecordTask* t); /** * Returns true if this task is in a waitpid or similar that would return * when t's status changes due to a regular event (exit). */ bool is_waiting_for(RecordTask* t); virtual bool already_exited() const override { return waiting_for_reap || waiting_for_zombie; } /** * Call this to force a group stop for this task with signal 'sig', * notifying ptracer if necessary. */ void apply_group_stop(int sig); /** * Call this after |sig| is delivered to this task. Emulate * sighandler updates induced by the signal delivery. 
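 * For example, a handler installed with SA_RESETHAND reverts to SIG_DFL
 * as the signal is delivered (standard POSIX behavior, noted here as an
 * illustration), so the sighandler table must be updated to match.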
*/ void signal_delivered(int sig); /** * Return true if |sig| is pending but hasn't been reported to ptrace yet */ bool is_signal_pending(int sig); /** * Return true if there are any signals pending that are not blocked. */ bool has_any_actionable_signal(); /** * Get all threads out of an emulated GROUP_STOP */ void emulate_SIGCONT(); /** * Return true if the disposition of |sig| in |table| isn't * SIG_IGN or SIG_DFL, that is, if a user sighandler will be * invoked when |sig| is received. */ bool signal_has_user_handler(int sig) const; /** * If signal_has_user_handler(sig) is true, return the address of the * user handler, otherwise return null. */ remote_code_ptr get_signal_user_handler(int sig) const; /** * Return true if the signal handler for |sig| takes a siginfo_t* * parameter. */ bool signal_handler_takes_siginfo(int sig) const; /** * Return |sig|'s current sigaction. Returned as raw bytes since the * data is architecture-dependent. */ const std::vector<uint8_t>& signal_action(int sig) const; /** Return true iff |sig| is blocked for this. */ bool is_sig_blocked(int sig); /** * Return true iff |sig| is SIG_IGN, or it's SIG_DFL and the * default disposition is "ignore". */ bool is_sig_ignored(int sig) const; /** * Return the application's current disposition of |sig|. */ SignalDisposition sig_disposition(int sig) const; /** * Return the resolved disposition --- what this signal will actually do, * taking into account the default behavior. */ SignalResolvedDisposition sig_resolved_disposition( int sig, SignalDeterministic deterministic); /** * Set the siginfo for the signal-stop of this. */ void set_siginfo(const siginfo_t& si); /** Note that the task sigmask needs to be refetched. */ void invalidate_sigmask() { blocked_sigs_dirty = true; } /** * Reset the signal handler for this signal to the default. */ void did_set_sig_handler_default(int sig); /** * Check that our status for |sig| matches what's in /proc/<pid>/status. */ void verify_signal_states(); /** * Stashed-signal API: if a signal becomes pending at an * awkward time, but could be handled "soon", call * |stash_sig()| to stash the current pending-signal state. * * |has_stashed_sig()| obviously returns true if |stash_sig()| * has been called successfully. * * |pop_stash_sig()| restores the (relevant) state of this * Task to what was saved in |stash_sig()|, and returns the * saved siginfo. After this call, |has_stashed_sig()| is * false. * * NB: |get_siginfo()| will always return the "real" siginfo, * regardless of stash popped-ness state. Callers must ensure * they do the right thing with the popped siginfo. * * If the process unexpectedly died (due to SIGKILL), we don't * stash anything. */ void stash_sig(); void stash_synthetic_sig(const siginfo_t& si, SignalDeterministic deterministic); bool has_stashed_sig() const { return !stashed_signals.empty(); } const siginfo_t* stashed_sig_not_synthetic_SIGCHLD() const; bool has_stashed_sig(int sig) const; struct StashedSignal { StashedSignal(const siginfo_t& siginfo, SignalDeterministic deterministic) : siginfo(siginfo), deterministic(deterministic) {} siginfo_t siginfo; SignalDeterministic deterministic; }; const StashedSignal* peek_stashed_sig_to_deliver() const; void pop_stash_sig(const StashedSignal* stashed); void stashed_signal_processed(); /** * If a group-stop occurs at an inconvenient time, stash it and * process it later.
*/ void stash_group_stop() { stashed_group_stop = true; } void clear_stashed_group_stop() { stashed_group_stop = false; } bool has_stashed_group_stop() const { return stashed_group_stop; } /** * Return true if the current state of this looks like the * interrupted syscall at the top of our event stack, if there * is one. */ bool is_syscall_restart(); /** * Return true iff this is at an execution state where * resuming execution may lead to the restart of an * interrupted syscall. * * For example, if a signal without a user handler is about to * be delivered to this just after a syscall interruption, * then delivering the signal may restart the first syscall * and this method will return true. */ bool at_may_restart_syscall() const; /** * Return true iff this is at an execution state where * a syscall that modifies signals was interrupted but will not * be automatically restarted. */ bool at_interrupted_non_restartable_signal_modifying_syscall() const; /** * Return true if this is at an arm-desched-event syscall. */ bool is_arm_desched_event_syscall(); /** * Return true if this is at a disarm-desched-event syscall. */ bool is_disarm_desched_event_syscall(); /** * Return true if |t| may not be immediately runnable, * i.e., resuming execution and then |waitpid()|'ing may block * for an unbounded amount of time. When the task is in this * state, the tracer must await a |waitpid()| notification * that the task is no longer possibly-blocked before resuming * its execution. */ bool may_be_blocked() const; /** * Returns true if it looks like this task has been spinning on an atomic * access/lock. */ bool maybe_in_spinlock(); /** * Return true if this is within the syscallbuf library. This * *does not* imply that $ip is at a buffered syscall. */ bool is_in_syscallbuf(); /** * Shortcut to the most recent |pending_event->desched.rec| when * there's a desched event on the stack, and nullptr otherwise. * Exists just so that clients don't need to dig around in the * event stack to find this record. */ remote_ptr<const struct syscallbuf_record> desched_rec() const; /** * Returns true when the task is in a signal handler in an interrupted * system call being handled by syscall buffering. */ bool running_inside_desched() const; /** * Returns -1 if we failed (the process unexpectedly exited). */ int get_ptrace_eventmsg_seccomp_data(); /** * Save tracee data to the trace. |addr| is the address in * the address space of this task. The |record_local*()| * variants record data that's already been read from this, * and the |record_remote*()| variants read the data and then * record it. * If 'addr' is null then no record is written. */ void record_local(remote_ptr<void> addr, ssize_t num_bytes, const void* buf); template <typename T> void record_local(remote_ptr<T> addr, const T* buf, size_t count = 1) { record_local(addr, sizeof(T) * count, buf); } void record_remote(remote_ptr<void> addr, ssize_t num_bytes); template <typename T> void record_remote(remote_ptr<T> addr) { record_remote(addr, sizeof(T)); } void record_remote(const MemoryRange& range) { record_remote(range.start(), range.size()); } ssize_t record_remote_fallible(const MemoryRange& range) { return record_remote_fallible(range.start(), range.size()); } // Record as much as we can of the bytes in this range. Will record only // contiguous mapped data starting at `addr`. ssize_t record_remote_fallible(remote_ptr<void> addr, uintptr_t num_bytes, const std::vector<WriteHole>& holes = std::vector<WriteHole>()); // Record as much as we can of the bytes in this range. Will record only // contiguous mapped-writable data starting at `addr`.
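  // For example (a minimal usage sketch; `t`, `msg_buf` and `msg_len` are
  // assumed names):
  //
  //   // Records at most up to the first unmapped or read-only page:
  //   t->record_remote_writable(msg_buf, msg_len);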
void record_remote_writable(remote_ptr addr, ssize_t num_bytes); // Simple helper that attempts to use the local mapping to record if one // exists bool record_remote_by_local_map(remote_ptr addr, size_t num_bytes); /** * Save tracee data to the trace. |addr| is the address in * the address space of this task. * If 'addr' is null then a zero-length record is written. */ void record_remote_even_if_null(remote_ptr addr, ssize_t num_bytes); template void record_remote_even_if_null(remote_ptr addr) { record_remote_even_if_null(addr, sizeof(T)); } /** * Manage pending events. |push_event()| pushes the given * event onto the top of the event stack. The |pop_*()| * helpers pop the event at top of the stack, which must be of * the specified type. */ void push_event(const Event& ev) { pending_events.push_back(ev); } void push_syscall_event(int syscallno); void pop_event(EventType expected_type); void pop_noop() { pop_event(EV_NOOP); } void pop_desched() { pop_event(EV_DESCHED); } void pop_seccomp_trap() { pop_event(EV_SECCOMP_TRAP); } void pop_signal_delivery() { pop_event(EV_SIGNAL_DELIVERY); } void pop_signal_handler() { pop_event(EV_SIGNAL_HANDLER); } void pop_syscall() { pop_event(EV_SYSCALL); } void pop_syscall_interruption() { pop_event(EV_SYSCALL_INTERRUPTION); } virtual void log_pending_events() const override; /** Return the event at the top of this's stack. */ Event& ev() { return pending_events.back(); } const Event& ev() const { return pending_events.back(); } /** * Obtain the previous event on the stack (if any) or nullptr (if not) */ Event *prev_ev() { ssize_t depth = pending_events.size(); return depth > 2 ? &pending_events[depth - 2] : nullptr; } /** * Call this before recording events or data. Records * syscallbuf data and flushes the buffer, if there's buffered * data. * * The timing of calls to this is tricky. We must flush the syscallbuf * before recording any data associated with events that happened after the * buffered syscalls. But we don't support flushing a syscallbuf twice with * no intervening reset, i.e. after flushing we have to be sure we'll get * a chance to reset the syscallbuf (i.e. record some other kind of event) * before the tracee runs again in a way that might append another buffered * syscall --- so we can't flush too early */ void maybe_flush_syscallbuf(); /** * Call this after recording an event when it might be safe to reset the * syscallbuf. It must be after recording an event to ensure during replay * we run past any syscallbuf after-syscall code that uses the buffer data. */ void maybe_reset_syscallbuf(); /** * Record an event on behalf of this. Record the registers of * this (and other relevant execution state) so that it can be * used or verified during replay, if that state is available * and meaningful at this's current execution point. * |record_current_event()| record |this->ev()|, and * |record_event()| records the specified event. */ void record_current_event(); enum FlushSyscallbuf { FLUSH_SYSCALLBUF, /* Pass this if it's safe to replay the event before we process the * syscallbuf records. */ DONT_FLUSH_SYSCALLBUF }; enum AllowSyscallbufReset { ALLOW_RESET_SYSCALLBUF, /* Pass this if it's safe to replay the event before we process the * syscallbuf records. 
*/ DONT_RESET_SYSCALLBUF }; void record_event(const Event& ev, FlushSyscallbuf flush = FLUSH_SYSCALLBUF, AllowSyscallbufReset reset = ALLOW_RESET_SYSCALLBUF, const Registers* registers = nullptr); bool is_fatal_signal(int sig, SignalDeterministic deterministic) const; /** * Return the pid of the newborn thread created by this task. * Called when this task has a PTRACE_CLONE_EVENT with CLONE_THREAD. */ pid_t find_newborn_thread(); /** * Return the pid of the newborn process (whose parent has pid `parent_pid`, * which need not be the same as the current task's pid, due to CLONE_PARENT) * created by this task. Called when this task has a PTRACE_CLONE_EVENT * without CLONE_THREAD, or PTRACE_FORK_EVENT. */ pid_t find_newborn_process(pid_t child_parent); /** * If the process looks alive, kill it. It is recommended to call try_wait() * on this task first, to make sure liveness is correctly reflected when * making this decision. */ void kill_if_alive(); remote_ptr<void> robust_list() const { return robust_futex_list; } size_t robust_list_len() const { return robust_futex_list_len; } /** Uses /proc so not trivially cheap. */ pid_t get_parent_pid() const; /** * Return true if this is a "clone child" per the wait(2) man page. */ bool is_clone_child() { return termination_signal != SIGCHLD; } void set_termination_signal(int sig) { termination_signal = sig; } /** * When a signal triggers an emulated ptrace-stop for this task, * save the siginfo so a later emulated ptrace-continue with this signal * number can use it. */ void save_ptrace_signal_siginfo(const siginfo_t& si); enum { SYNTHETIC_TIME_SLICE_SI_CODE = -9999 }; /** * Tasks normally can't change their tid. There is one very special situation * where they can: when a non-main-thread does an execve, its tid changes * to the tid of the thread-group leader. */ void set_tid_and_update_serial(pid_t tid, pid_t own_namespace_tid); /** * Return our cached copy of the signal mask, updating it if necessary. */ sig_set_t get_sigmask(); /** * Just get the signal mask of the process. */ sig_set_t read_sigmask_from_process(); /** * Unblock the signal for the process. */ void unblock_signal(int sig); /** * Set the signal handler to default for the process. */ void set_sig_handler_default(int sig); ~RecordTask(); void maybe_restore_original_syscall_registers(); /** * The task reached zombie state. Do whatever processing is necessary (reaping * it, emulating ptrace stops, etc.) */ void did_reach_zombie(); // Is this task a container init? (which has special signal behavior) bool is_container_init() const { return tg->tgid_own_namespace == 1; } /** * Linux requires the invariant that all members of a thread group * are reaped before the thread group leader. This determines whether or * not we're allowed to attempt reaping this thread or whether doing so * risks deadlock. */ bool may_reap(); /** * Reaps a task-exit notification, thus detaching us from the tracee. * N.B.: If may_reap is false, this risks a deadlock. */ void reap(); /** * Return true if the status of this has changed, but don't * block. */ bool try_wait(); bool waiting_for_pid_namespace_tasks_to_exit() const; int process_depth() const; /** * Called when this task is able to receive a SIGCHLD (e.g. because * we completed delivery of a signal). Sends a new synthetic * SIGCHLD to the task if there are still tasks that need a SIGCHLD * sent for them. * May queue signals for specific tasks.
*/ void send_synthetic_SIGCHLD_if_necessary(); private: /* Retrieve the tid of this task from the tracee and store it */ void update_own_namespace_tid(); /** * Wait for |futex| in this address space to have the value * |val|. * * WARNING: this implementation semi-busy-waits for the value * change. This must only be used in contexts where the futex * will change "soon". */ void futex_wait(remote_ptr futex, int val, bool* ok); /** * Call this when SYS_sigaction is finishing with |regs|. */ void update_sigaction(const Registers& regs); /** * Update the futex robust list head pointer to |list| (which * is of size |len|). */ void set_robust_list(remote_ptr list, size_t len) { robust_futex_list = list; robust_futex_list_len = len; } template void init_buffers_arch(); template void on_syscall_exit_arch(int syscallno, const Registers& regs); /** Helper function for update_sigaction. */ template void update_sigaction_arch(const Registers& regs); /** Update the clear-tid futex to |tid_addr|. */ void set_tid_addr(remote_ptr tid_addr); virtual bool post_vm_clone(CloneReason reason, int flags, Task* origin) override; public: Ticks ticks_at_last_recorded_syscall_exit; remote_code_ptr ip_at_last_recorded_syscall_exit; // Scheduler state Registers registers_at_start_of_last_timeslice; FrameTime time_at_start_of_last_timeslice; /* Task 'nice' value set by setpriority(2). We use this to drive scheduling decisions. rr's scheduler is deliberately simple and unfair; a task never runs as long as there's another runnable task with a lower nice value. */ int priority; /* Tasks with in_round_robin_queue set are in the session's * in_round_robin_queue instead of its task_priority_set. */ bool in_round_robin_queue; /* exit(), or exit_group() with one task, has been called, so * the exit can be treated as stable. */ bool stable_exit; bool detached_proxy; // ptrace emulation state // Task for which we're emulating ptrace of this task, or null RecordTask* emulated_ptracer; std::set emulated_ptrace_tracees; uintptr_t emulated_ptrace_event_msg; // Saved emulated-ptrace signals std::vector saved_ptrace_siginfos; // Code to deliver to ptracer/waiter when it waits. Note that zero can be a // valid code! Reset to zero when leaving the stop due to PTRACE_CONT etc. WaitStatus emulated_stop_code; // Always zero while no ptracer is attached. int emulated_ptrace_options; // One of PTRACE_CONT, PTRACE_SYSCALL --- or 0 if the tracee has not been // continued by its ptracer yet, or has no ptracer. int emulated_ptrace_cont_command; // true when a ptracer/waiter wait() can return |emulated_stop_code|. bool emulated_stop_pending; // true if this task needs to send a SIGCHLD to its ptracer for its // emulated ptrace stop bool emulated_ptrace_SIGCHLD_pending; // true if this task needs to send a SIGCHLD to its parent for its // emulated stop bool emulated_SIGCHLD_pending; // tracer attached via PTRACE_SEIZE bool emulated_ptrace_seized; WaitType in_wait_type; pid_t in_wait_pid; // Signal handler state // Points to the signal-hander table of this task. If this // task is a non-fork clone child, then the table will be // shared with all its "thread" siblings. Any updates made to // that shared table are immediately visible to all sibling // threads. // // fork children always get their own copies of the table. // And if this task exec()s, the table is copied and stripped // of user sighandlers (see below). */ std::shared_ptr sighandlers; // If not NOT_STOPPED, then the task is logically stopped and this is the type // of stop. 
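  // A sketch of the expected transitions (see apply_group_stop() and
  // emulate_SIGCONT() in RecordTask.cc):
  //
  //   NOT_STOPPED --group-stop signal--> GROUP_STOP
  //   GROUP_STOP  --SIGCONT-----------> NOT_STOPPED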
EmulatedStopType emulated_stop_type; // True if the task sigmask may have changed and we need to refetch it. bool blocked_sigs_dirty; // Most accesses to this should use set_sigmask and get_sigmask to ensure // the mirroring to syscallbuf is correct. sig_set_t blocked_sigs; uint32_t syscallbuf_blocked_sigs_generation; // Syscallbuf state SyscallbufCodeLayout syscallbuf_code_layout; ScopedFd desched_fd; /* Value of hdr->num_rec_bytes when the buffer was flushed */ uint32_t flushed_num_rec_bytes; /* Nonzero after the trace recorder has flushed the * syscallbuf. When this happens, the recorder must prepare a * "reset" of the buffer, to zero the record count, at the * next available slot (taking |desched| into * consideration). */ bool flushed_syscallbuf; /* This bit is set when code wants to prevent the syscall * record buffer from being reset when it normally would be. * This bit is set by the desched code. */ bool delay_syscallbuf_reset_for_desched; /* This is set when code wants to prevent the syscall * record buffer from being reset when it normally would be. * This is set by the code for handling seccomp SIGSYS signals. */ bool delay_syscallbuf_reset_for_seccomp_trap; // Value to return from PR_GET_SECCOMP uint8_t prctl_seccomp_status; // Mirrored kernel state // This state agrees with kernel-internal values // Futex list passed to |set_robust_list()|. We could keep a // strong type for this list head and read it if we wanted to, // but for now we only need to remember its address / size at // the time of the most recent set_robust_list() call. remote_ptr<void> robust_futex_list; size_t robust_futex_list_len; // The memory cell the kernel will clear and notify on exit, // if our clone parent requested it. remote_ptr<int> tid_futex; // Signal delivered by the kernel when this task terminates, or zero int termination_signal; // Our value for PR_GET/SET_TSC (one of PR_TSC_ENABLED, PR_TSC_SIGSEGV). int tsc_mode; // Our value for ARCH_GET/SET_CPUID (0 -> generate SIGSEGV, 1 -> do CPUID). // Only used if session().has_cpuid_faulting(). int cpuid_mode; // The current stack of events being processed. (We use a // deque instead of a stack because we need to iterate the // events.) std::deque<Event> pending_events; // Stashed signal-delivery state, ready to be delivered at // next opportunity. std::deque<StashedSignal> stashed_signals; bool stashed_signals_blocking_more_signals; bool stashed_group_stop; bool break_at_syscallbuf_traced_syscalls; bool break_at_syscallbuf_untraced_syscalls; bool break_at_syscallbuf_final_instruction; // The pmc is programmed to interrupt at a value requested by the tracee, not // by rr. bool next_pmc_interrupt_is_for_user; bool did_record_robust_futex_changes; // This task is just waiting to be reaped. bool waiting_for_reap; // This task is waiting to reach zombie state bool waiting_for_zombie; // This task is waiting for a ptrace exit event. It should not // be manually run. bool waiting_for_ptrace_exit; // When exiting a syscall, we should call MonkeyPatcher::try_patch_syscall again. bool retry_syscall_patching; // We've sent a SIGKILL during shutdown for this task. bool sent_shutdown_kill; // Set if the tracee requested an override of the ticks request. // Used for testing.
TicksRequest tick_request_override; }; } // namespace rr #endif /* RR_RECORD_TASK_H_ */ rr-5.5.0/src/Registers.cc000066400000000000000000000631021412202446200152100ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ #include "Registers.h" #include #include #include #include #include "ReplayTask.h" #include "core.h" #include "log.h" using namespace std; namespace rr { struct RegisterValue { // The name of this register. const char* name; // The offset of the register in user_regs_struct. size_t offset; // The size of the register. 0 means we cannot read it. size_t nbytes; // Mask to be applied to register values prior to comparing them. Will // typically be ((1 << (nbytes * 8)) - 1), but some registers may have special // comparison semantics. uint64_t comparison_mask; constexpr RegisterValue() : name(nullptr), offset(0), nbytes(0), comparison_mask(0) {} RegisterValue(const char* name_, size_t offset_, size_t nbytes_) : name(name_), offset(offset_), nbytes(nbytes_) { comparison_mask = mask_for_nbytes(nbytes_); } RegisterValue(const char* name_, size_t offset_, size_t nbytes_, uint64_t comparison_mask_, size_t size_override = 0) : name(name_), offset(offset_), nbytes(nbytes_), comparison_mask(comparison_mask_) { // Ensure no bits are set outside of the register's bitwidth. DEBUG_ASSERT((comparison_mask_ & ~mask_for_nbytes(nbytes_)) == 0); if (size_override > 0) { nbytes = size_override; } } // Returns a pointer to the register in |regs| represented by |offset|. // |regs| is assumed to be a pointer to the user_regs_struct for the // appropriate architecture. void* pointer_into(void* regs) { return static_cast<char*>(regs) + offset; } const void* pointer_into(const void* regs) { return static_cast<const char*>(regs) + offset; } static uint64_t mask_for_nbytes(size_t nbytes) { DEBUG_ASSERT(nbytes <= sizeof(comparison_mask)); return ((nbytes == sizeof(comparison_mask)) ?
uint64_t(0) : (uint64_t(1) << nbytes * 8)) - 1; } }; typedef std::pair RegisterInit; template struct RegisterTable : std::array { RegisterTable(std::initializer_list list) { for (auto& ri : list) { (*this)[ri.first] = ri.second; } } }; template struct RegisterInfo; template <> struct RegisterInfo { static bool ignore_undefined_register(GdbRegister regno) { return regno == DREG_FOSEG || regno == DREG_MXCSR; } static const size_t num_registers = DREG_NUM_LINUX_I386; typedef RegisterTable Table; static Table registers; }; template <> struct RegisterInfo { static bool ignore_undefined_register(GdbRegister regno) { return regno == DREG_64_FOSEG || regno == DREG_64_MXCSR; } static const size_t num_registers = DREG_NUM_LINUX_X86_64; typedef RegisterTable Table; static Table registers; }; template <> struct RegisterInfo { static bool ignore_undefined_register(GdbRegister) { return false; } static const size_t num_registers = DREG_NUM_LINUX_AARCH64; typedef RegisterTable Table; static Table registers; }; #define RV_ARCH(gdb_suffix, name, arch, extra_ctor_args) \ RegisterInit(DREG_##gdb_suffix, \ RegisterValue(#name, offsetof(arch::user_regs_struct, name), \ sizeof(((arch::user_regs_struct*)0)->name) \ extra_ctor_args)) #define RV_X86(gdb_suffix, name) \ RV_ARCH(gdb_suffix, name, rr::X86Arch, /* empty */) #define RV_X64(gdb_suffix, name) \ RV_ARCH(gdb_suffix, name, rr::X64Arch, /* empty */) #define COMMA , #define RV_X86_WITH_MASK(gdb_suffix, name, comparison_mask) \ RV_ARCH(gdb_suffix, name, rr::X86Arch, COMMA comparison_mask) #define RV_X64_WITH_MASK(gdb_suffix, name, comparison_mask, size) \ RV_ARCH(gdb_suffix, name, rr::X64Arch, COMMA comparison_mask COMMA size) #define RV_AARCH64(gdb_suffix, name) RV_ARCH(gdb_suffix, name, rr::ARM64Arch, /* empty */) #define RV_AARCH64_WITH_MASK(gdb_suffix, name, comparison_mask, size) \ RV_ARCH(gdb_suffix, name, rr::ARM64Arch, COMMA comparison_mask COMMA size) RegisterInfo::Table RegisterInfo::registers = { RV_X86(EAX, eax), RV_X86(ECX, ecx), RV_X86(EDX, edx), RV_X86(EBX, ebx), RV_X86(ESP, esp), RV_X86(EBP, ebp), RV_X86(ESI, esi), RV_X86(EDI, edi), RV_X86(EIP, eip), RV_X86_WITH_MASK(EFLAGS, eflags, 0), RV_X86_WITH_MASK(CS, xcs, 0), RV_X86_WITH_MASK(SS, xss, 0), RV_X86_WITH_MASK(DS, xds, 0), RV_X86_WITH_MASK(ES, xes, 0), // Mask out the RPL from the fs and gs segment selectors. The kernel // unconditionally sets RPL=3 on sigreturn, but if the segment index is 0, // the RPL doesn't matter, and the CPU resets the entire register to 0, // so whether or not we see this depends on whether the value round-tripped // to the CPU yet. RV_X86_WITH_MASK(FS, xfs, (uint16_t)~3), RV_X86_WITH_MASK(GS, xgs, (uint16_t)~3), // The comparison for this is handled specially elsewhere. RV_X86_WITH_MASK(ORIG_EAX, orig_eax, 0), }; RegisterInfo::Table RegisterInfo::registers = { RV_X64(RAX, rax), RV_X64(RCX, rcx), RV_X64(RDX, rdx), RV_X64(RBX, rbx), RV_X64_WITH_MASK(RSP, rsp, 0, 8), RV_X64(RBP, rbp), RV_X64(RSI, rsi), RV_X64(RDI, rdi), RV_X64(R8, r8), RV_X64(R9, r9), RV_X64(R10, r10), RV_X64(R11, r11), RV_X64(R12, r12), RV_X64(R13, r13), RV_X64(R14, r14), RV_X64(R15, r15), RV_X64(RIP, rip), RV_X64_WITH_MASK(64_EFLAGS, eflags, 0, 4), RV_X64_WITH_MASK(64_CS, cs, 0, 4), RV_X64_WITH_MASK(64_SS, ss, 0, 4), RV_X64_WITH_MASK(64_DS, ds, 0, 4), RV_X64_WITH_MASK(64_ES, es, 0, 4), RV_X64_WITH_MASK(64_FS, fs, 0xffffffffLL, 4), RV_X64_WITH_MASK(64_GS, gs, 0xffffffffLL, 4), // The comparison for this is handled specially // elsewhere. 
RV_X64_WITH_MASK(ORIG_RAX, orig_rax, 0, 8), RV_X64(FS_BASE, fs_base), RV_X64(GS_BASE, gs_base), }; RegisterInfo::Table RegisterInfo::registers = { RV_AARCH64(X0, x[0]), RV_AARCH64(X1, x[1]), RV_AARCH64(X2, x[2]), RV_AARCH64(X3, x[3]), RV_AARCH64(X4, x[4]), RV_AARCH64(X5, x[5]), RV_AARCH64(X6, x[6]), // Don't compare these - the kernel sometimes lies [1] about this value // [1] https://github.com/torvalds/linux/blob/d2f8825ab78e4c18686f3e1a756a30255bb00bf3/arch/arm64/kernel/ptrace.c#L1814-L1820 RV_AARCH64_WITH_MASK(X7, x[7], 0, 8), RV_AARCH64(X8, x[8]), RV_AARCH64(X9, x[9]), RV_AARCH64(X10, x[10]), RV_AARCH64(X11, x[11]), RV_AARCH64(X12, x[12]), RV_AARCH64(X13, x[13]), RV_AARCH64(X14, x[14]), RV_AARCH64(X15, x[15]), RV_AARCH64(X16, x[16]), RV_AARCH64(X17, x[17]), RV_AARCH64(X18, x[18]), RV_AARCH64(X19, x[19]), RV_AARCH64(X20, x[20]), RV_AARCH64(X21, x[21]), RV_AARCH64(X22, x[22]), RV_AARCH64(X23, x[23]), RV_AARCH64(X24, x[24]), RV_AARCH64(X25, x[25]), RV_AARCH64(X26, x[26]), RV_AARCH64(X27, x[27]), RV_AARCH64(X28, x[28]), RV_AARCH64(X29, x[29]), RV_AARCH64(X30, x[30]), RV_AARCH64(SP, sp), RV_AARCH64(PC, pc), // Mask out the single-step flag from the pstate. During replay, we may // single-step to an execution point, which could set the single-step bit // when it wasn't set during record. RV_AARCH64_WITH_MASK(CPSR, pstate, 0xffffffffLL & ~AARCH64_DBG_SPSR_SS, 4), }; #undef RV_X64 #undef RV_X86 #undef RV_AARCH64 #undef RV_X64_WITH_MASK #undef RV_X86_WITH_MASK #undef RV_AARCH64_WITH_MASK #undef RV_ARCH // 32-bit format, 64-bit format for all of these. // format_index in RegisterPrinting depends on the ordering here. static const char* hex_format_leading_0x[] = { "0x%" PRIx32, "0x%" PRIx64 }; // static const char* decimal_format[] = { "%" PRId32, "%" PRId64 }; template struct RegisterPrinting; template <> struct RegisterPrinting<4> { typedef uint32_t type; static const size_t format_index = 0; }; template <> struct RegisterPrinting<8> { typedef uint64_t type; static const size_t format_index = 1; }; template void print_single_register(FILE* f, const char* name, const void* register_ptr, const char* formats[]) { typename RegisterPrinting::type val; memcpy(&val, register_ptr, nbytes); if (name) { fprintf(f, "%s:", name); } else { fprintf(f, " "); } fprintf(f, formats[RegisterPrinting::format_index], val); } template void Registers::print_register_file_arch(FILE* f, const char* formats[]) const { fprintf(f, "Printing register file:\n"); const void* user_regs = &u; for (auto& rv : RegisterInfo::registers) { if (rv.nbytes == 0) { continue; } switch (rv.nbytes) { case 8: print_single_register<8>(f, rv.name, rv.pointer_into(user_regs), formats); break; case 4: print_single_register<4>(f, rv.name, rv.pointer_into(user_regs), formats); break; default: DEBUG_ASSERT(0 && "bad register size"); } fprintf(f, "\n"); } fprintf(f, "\n"); } void Registers::print_register_file(FILE* f) const { RR_ARCH_FUNCTION(print_register_file_arch, arch(), f, hex_format_leading_0x); } template void Registers::print_register_file_for_trace_arch( FILE* f, TraceStyle style, const char* formats[]) const { const void* user_regs = &u; bool first = true; for (auto& rv : RegisterInfo::registers) { if (rv.nbytes == 0) { continue; } if (!first) { fputc(' ', f); } first = false; const char* name = (style == Annotated ? 
rv.name : nullptr);
    switch (rv.nbytes) {
      case 8:
        print_single_register<8>(f, name, rv.pointer_into(user_regs), formats);
        break;
      case 4:
        print_single_register<4>(f, name, rv.pointer_into(user_regs), formats);
        break;
      default:
        DEBUG_ASSERT(0 && "bad register size");
    }
  }
}

void Registers::print_register_file_compact(FILE* f) const {
  RR_ARCH_FUNCTION(print_register_file_for_trace_arch, arch(), f, Annotated,
                   hex_format_leading_0x);
}

void Registers::print_register_file_for_trace_raw(FILE* f) const {
  fprintf(f, " %d %d %d %d %d %d %d"
             " %d %d %d %d",
          u.x86regs.eax, u.x86regs.ebx, u.x86regs.ecx, u.x86regs.edx,
          u.x86regs.esi, u.x86regs.edi, u.x86regs.ebp, u.x86regs.orig_eax,
          u.x86regs.esp, u.x86regs.eip, u.x86regs.eflags);
}

static void maybe_print_reg_mismatch(MismatchBehavior mismatch_behavior,
                                     const char* regname, const char* label1,
                                     uint64_t val1, const char* label2,
                                     uint64_t val2) {
  if (mismatch_behavior >= BAIL_ON_MISMATCH) {
    LOG(error) << regname << " " << HEX(val1) << " != " << HEX(val2) << " ("
               << label1 << " vs. " << label2 << ")";
  } else if (mismatch_behavior >= LOG_MISMATCHES) {
    LOG(info) << regname << " " << HEX(val1) << " != " << HEX(val2) << " ("
              << label1 << " vs. " << label2 << ")";
  }
}

template <typename Arch>
bool Registers::compare_registers_core(const char* name1,
                                       const Registers& reg1,
                                       const char* name2,
                                       const Registers& reg2,
                                       MismatchBehavior mismatch_behavior) {
  bool match = true;
  for (auto& rv : RegisterInfo<Arch>::registers) {
    if (rv.nbytes == 0) {
      continue;
    }
    // Disregard registers that will trivially compare equal.
    if (rv.comparison_mask == 0) {
      continue;
    }
    // XXX correct but oddly displayed for big-endian processors.
    uint64_t val1 = 0, val2 = 0;
    memcpy(&val1, rv.pointer_into(&reg1.u), rv.nbytes);
    memcpy(&val2, rv.pointer_into(&reg2.u), rv.nbytes);
    if ((val1 ^ val2) & rv.comparison_mask) {
      maybe_print_reg_mismatch(mismatch_behavior, rv.name, name1, val1, name2,
                               val2);
      match = false;
    }
  }
  return match;
}

// A handy macro for compare_registers_arch specializations.
#define REGCMP(user_regs, _reg)                                                \
  do {                                                                         \
    if (reg1.user_regs._reg != reg2.user_regs._reg) {                          \
      maybe_print_reg_mismatch(mismatch_behavior, #_reg, name1,                \
                               reg1.user_regs._reg, name2,                     \
                               reg2.user_regs._reg);                           \
      match = false;                                                           \
    }                                                                          \
  } while (0)
#define X86_REGCMP(_reg) REGCMP(u.x86regs, _reg)
#define X64_REGCMP(_reg) REGCMP(u.x64regs, _reg)

// A wrapper around compare_registers_core so registers requiring special
// processing can be handled via template specialization.
template <typename Arch>
/* static */ bool Registers::compare_registers_arch(
    const char* name1, const Registers& reg1, const char* name2,
    const Registers& reg2, MismatchBehavior mismatch_behavior) {
  // Default behavior.
  return compare_registers_core<Arch>(name1, reg1, name2, reg2,
                                      mismatch_behavior);
}

template <>
/* static */ bool Registers::compare_registers_arch<rr::X86Arch>(
    const char* name1, const Registers& reg1, const char* name2,
    const Registers& reg2, MismatchBehavior mismatch_behavior) {
  bool match = compare_registers_core<rr::X86Arch>(name1, reg1, name2, reg2,
                                                   mismatch_behavior);
  /* When the kernel is entered via an interrupt, orig_rax is set to -IRQ.
     We observe negative orig_eax values at SCHED events and signals and
     other timer interrupts. These values are only really meaningful to
     compare when they reflect original syscall numbers, in which case both
     will be positive.
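
     For example (values illustrative only, not from any particular trace):
     orig_eax == -1 here means the kernel was entered via an interrupt
     (e.g. at a SCHED event) and is not meaningful to compare, while
     orig_eax == 3 (__NR_read on i386) reflects a genuine syscall and is.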
  */
  if (reg1.u.x86regs.orig_eax >= 0 && reg2.u.x86regs.orig_eax >= 0) {
    X86_REGCMP(orig_eax);
  }
  return match;
}

template <>
/* static */ bool Registers::compare_registers_arch<rr::X64Arch>(
    const char* name1, const Registers& reg1, const char* name2,
    const Registers& reg2, MismatchBehavior mismatch_behavior) {
  bool match = compare_registers_core<rr::X64Arch>(name1, reg1, name2, reg2,
                                                   mismatch_behavior);
  // See comment in the x86 case
  if ((intptr_t)reg1.u.x64regs.orig_rax >= 0 &&
      (intptr_t)reg2.u.x64regs.orig_rax >= 0) {
    X64_REGCMP(orig_rax);
  }
  return match;
}

/*static*/ bool Registers::compare_register_files_internal(
    const char* name1, const Registers& reg1, const char* name2,
    const Registers& reg2, MismatchBehavior mismatch_behavior) {
  DEBUG_ASSERT(reg1.arch() == reg2.arch());
  RR_ARCH_FUNCTION(compare_registers_arch, reg1.arch(), name1, reg1, name2,
                   reg2, mismatch_behavior);
}

/*static*/ bool Registers::compare_register_files(
    ReplayTask* t, const char* name1, const Registers& reg1, const char* name2,
    const Registers& reg2, MismatchBehavior mismatch_behavior) {
  bool bail_error = mismatch_behavior >= BAIL_ON_MISMATCH;
  bool match = compare_register_files_internal(name1, reg1, name2, reg2,
                                               mismatch_behavior);
  if (t) {
    ASSERT(t, !bail_error || match)
        << "Fatal register mismatch (ticks/rec:" << t->tick_count() << "/"
        << t->current_trace_frame().ticks() << ")";
  } else {
    DEBUG_ASSERT(!bail_error || match);
  }
  if (match && mismatch_behavior == LOG_MISMATCHES) {
    LOG(info) << "(register files are the same for " << name1 << " and "
              << name2 << ")";
  }
  return match;
}

template <typename Arch>
size_t Registers::read_register_arch(uint8_t* buf, GdbRegister regno,
                                     bool* defined) const {
  if (regno >= array_length(RegisterInfo<Arch>::registers)) {
    *defined = false;
    return 0;
  }
  RegisterValue& rv = RegisterInfo<Arch>::registers[regno];
  if (rv.nbytes == 0) {
    *defined = false;
  } else {
    *defined = true;
    memcpy(buf, rv.pointer_into(&u), rv.nbytes);
  }
  return rv.nbytes;
}

size_t Registers::read_register(uint8_t* buf, GdbRegister regno,
                                bool* defined) const {
  RR_ARCH_FUNCTION(read_register_arch, arch(), buf, regno, defined);
}

template <typename Arch>
size_t Registers::read_register_by_user_offset_arch(uint8_t* buf,
                                                    uintptr_t offset,
                                                    bool* defined) const {
  for (size_t regno = 0; regno < RegisterInfo<Arch>::num_registers; ++regno) {
    RegisterValue& rv = RegisterInfo<Arch>::registers[regno];
    if (rv.offset == offset) {
      return read_register_arch<Arch>(buf, GdbRegister(regno), defined);
    }
  }
  *defined = false;
  return 0;
}

size_t Registers::read_register_by_user_offset(uint8_t* buf, uintptr_t offset,
                                               bool* defined) const {
  RR_ARCH_FUNCTION(read_register_by_user_offset_arch, arch(), buf, offset,
                   defined);
}

template <typename Arch>
void Registers::write_register_arch(GdbRegister regno, const void* value,
                                    size_t value_size) {
  RegisterValue& rv = RegisterInfo<Arch>::registers[regno];
  if (rv.nbytes == 0) {
    // TODO: can we get away with not writing these?
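    // (A zero-sized table entry means this GDB register has no backing
    // storage in user_regs_struct, so there is nothing to write to; such
    // writes are dropped, silently when ignore_undefined_register() says
    // the register is known to be benign.)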
    if (RegisterInfo<Arch>::ignore_undefined_register(regno)) {
      return;
    }
    LOG(warn) << "Unhandled register name " << regno;
  } else {
    DEBUG_ASSERT(value_size == rv.nbytes);
    memcpy(rv.pointer_into(&u), value, value_size);
  }
}

void Registers::write_register(GdbRegister regno, const void* value,
                               size_t value_size) {
  RR_ARCH_FUNCTION(write_register_arch, arch(), regno, value, value_size);
}

template <typename Arch>
void Registers::write_register_by_user_offset_arch(uintptr_t offset,
                                                   uintptr_t value) {
  for (size_t regno = 0; regno < RegisterInfo<Arch>::num_registers; ++regno) {
    RegisterValue& rv = RegisterInfo<Arch>::registers[regno];
    if (rv.offset == offset) {
      DEBUG_ASSERT(rv.nbytes <= sizeof(value));
      memcpy(rv.pointer_into(&u), &value, rv.nbytes);
      return;
    }
  }
}

void Registers::write_register_by_user_offset(uintptr_t offset,
                                              uintptr_t value) {
  RR_ARCH_FUNCTION(write_register_by_user_offset_arch, arch(), offset, value);
}

// In theory it doesn't matter how 32-bit register values are sign extended
// to 64 bits for PTRACE_SETREGS. However:
// -- When setting up a signal handler frame, the kernel does some arithmetic
// on the 64-bit SP value and validates that the result points to writeable
// memory. This validation fails if SP has been sign-extended to point
// outside the 32-bit address space.
// -- Some kernels (e.g. 4.3.3-301.fc23.x86_64) with commit
// c5c46f59e4e7c1ab244b8d38f2b61d317df90bba have a bug where if you clear
// the upper 32 bits of %rax while in the kernel, syscalls may fail to
// restart. So sign-extension is necessary for %eax in this case. We may as
// well sign-extend %eax in all cases.
typedef void (*NarrowConversion)(int32_t& r32, uint64_t& r64);

template <NarrowConversion narrow, NarrowConversion narrow_signed>
void convert_x86(X86Arch::user_regs_struct& x86,
                 X64Arch::user_regs_struct& x64) {
  narrow_signed(x86.eax, x64.rax);
  narrow(x86.ebx, x64.rbx);
  narrow(x86.ecx, x64.rcx);
  narrow(x86.edx, x64.rdx);
  narrow(x86.esi, x64.rsi);
  narrow(x86.edi, x64.rdi);
  narrow(x86.esp, x64.rsp);
  narrow(x86.ebp, x64.rbp);
  narrow(x86.eip, x64.rip);
  narrow(x86.orig_eax, x64.orig_rax);
  narrow(x86.eflags, x64.eflags);
  narrow(x86.xcs, x64.cs);
  narrow(x86.xds, x64.ds);
  narrow(x86.xes, x64.es);
  narrow(x86.xfs, x64.fs);
  narrow(x86.xgs, x64.gs);
  narrow(x86.xss, x64.ss);
}

void to_x86_narrow(int32_t& r32, uint64_t& r64) { r32 = r64; }
void from_x86_narrow(int32_t& r32, uint64_t& r64) { r64 = (uint32_t)r32; }
void from_x86_narrow_signed(int32_t& r32, uint64_t& r64) { r64 = (int64_t)r32; }

void Registers::set_from_ptrace(const NativeArch::user_regs_struct& ptrace_regs) {
  if (arch() == NativeArch::arch()) {
    memcpy(&u, &ptrace_regs, sizeof(ptrace_regs));
    return;
  }

  DEBUG_ASSERT(arch() == x86 && NativeArch::arch() == x86_64);
  convert_x86<to_x86_narrow, to_x86_narrow>(
      u.x86regs,
      *const_cast<X64Arch::user_regs_struct*>(
          reinterpret_cast<const X64Arch::user_regs_struct*>(&ptrace_regs)));
}

/**
 * Get a user_regs_struct from these Registers. If the tracee architecture
 * is not rr's native architecture, then it must be a 32-bit tracee with a
 * 64-bit rr. In that case the user_regs_struct is 64-bit and we copy
 * the 32-bit register values from u.x86regs into it.
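 *
 * A minimal caller sketch (hypothetical code, error handling omitted;
 * `task` and `tid` stand for a stopped ptrace tracee):
 *
 *   Registers regs = task->regs();
 *   auto native = regs.get_ptrace();
 *   ptrace(PTRACE_SETREGS, tid, nullptr, &native);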
 */
NativeArch::user_regs_struct Registers::get_ptrace() const {
  union {
    NativeArch::user_regs_struct linux_api;
    struct X64Arch::user_regs_struct x64arch_api;
  } result;
  if (arch() == NativeArch::arch()) {
    memcpy(&result, &u, sizeof(result));
    return result.linux_api;
  }

  DEBUG_ASSERT(arch() == x86 && NativeArch::arch() == x86_64);
  memset(&result, 0, sizeof(result));
  convert_x86<from_x86_narrow, from_x86_narrow_signed>(
      const_cast<Registers*>(this)->u.x86regs, result.x64arch_api);
  return result.linux_api;
}

iovec Registers::get_ptrace_iovec() {
  if (arch() == NativeArch::arch()) {
    iovec iov = { &u, sizeof(NativeArch::user_regs_struct) };
    return iov;
  }

  DEBUG_ASSERT(arch() == x86 && NativeArch::arch() == x86_64);
  iovec iov = { &u.x86regs, sizeof(u.x86regs) };
  return iov;
}

Registers::InternalData Registers::get_ptrace_for_self_arch() const {
  switch (arch_) {
    case x86:
      return { reinterpret_cast<const uint8_t*>(&u.x86regs),
               sizeof(u.x86regs) };
    case x86_64:
      return { reinterpret_cast<const uint8_t*>(&u.x64regs),
               sizeof(u.x64regs) };
    case aarch64:
      return { reinterpret_cast<const uint8_t*>(&u.arm64regs._ptrace),
               sizeof(u.arm64regs._ptrace) };
    default:
      DEBUG_ASSERT(0 && "Unknown arch");
      return { nullptr, 0 };
  }
}

Registers::InternalData Registers::get_regs_for_trace() const {
  switch (arch_) {
    case x86:
      return { reinterpret_cast<const uint8_t*>(&u.x86regs),
               sizeof(u.x86regs) };
    case x86_64:
      return { reinterpret_cast<const uint8_t*>(&u.x64regs),
               sizeof(u.x64regs) };
    case aarch64:
      return { reinterpret_cast<const uint8_t*>(&u.arm64regs),
               sizeof(u.arm64regs) };
    default:
      DEBUG_ASSERT(0 && "Unknown arch");
      return { nullptr, 0 };
  }
}

vector<uint8_t> Registers::get_ptrace_for_arch(SupportedArch arch) const {
  Registers tmp_regs(arch);
  tmp_regs.set_from_ptrace(get_ptrace());
  InternalData tmp_data = tmp_regs.get_ptrace_for_self_arch();
  vector<uint8_t> result;
  result.resize(tmp_data.size);
  memcpy(result.data(), tmp_data.data, tmp_data.size);
  return result;
}

void Registers::set_from_ptrace_for_arch(SupportedArch a, const void* data,
                                         size_t size) {
  if (a == NativeArch::arch()) {
    DEBUG_ASSERT(size == sizeof(NativeArch::user_regs_struct));
    set_from_ptrace(*static_cast<const NativeArch::user_regs_struct*>(data));
    return;
  }

  DEBUG_ASSERT(a == x86 && NativeArch::arch() == x86_64);
  // We don't support a 32-bit tracee trying to set registers of a 64-bit tracee
  DEBUG_ASSERT(arch() == x86);
  DEBUG_ASSERT(size == sizeof(u.x86regs));
  memcpy(&u.x86regs, data, sizeof(u.x86regs));
}

void Registers::set_from_trace(SupportedArch a, const void* data, size_t size) {
  if (is_x86ish(a)) {
    return set_from_ptrace_for_arch(a, data, size);
  }

  DEBUG_ASSERT(a == aarch64);
  DEBUG_ASSERT(size == sizeof(u.arm64regs));
  memcpy(&u.arm64regs, data, sizeof(u.arm64regs));
}

bool Registers::aarch64_singlestep_flag() const {
  switch (arch()) {
    case aarch64:
      return pstate() & AARCH64_DBG_SPSR_SS;
    default:
      DEBUG_ASSERT(0 && "AArch64 only code path");
      return false;
  }
}

void Registers::set_aarch64_singlestep_flag() {
  switch (arch()) {
    case aarch64:
      return set_pstate(pstate() | AARCH64_DBG_SPSR_SS);
    default:
      DEBUG_ASSERT(0 && "AArch64 only code path");
      return;
  }
}

bool Registers::x86_singlestep_flag() const {
  switch (arch()) {
    case x86:
    case x86_64:
      return flags() & X86_TF_FLAG;
    default:
      DEBUG_ASSERT(0 && "X86 only code path");
      return false;
  }
}

void Registers::clear_x86_singlestep_flag() {
  switch (arch()) {
    case x86:
    case x86_64:
      set_flags(flags() & ~X86_TF_FLAG);
      return;
    default:
      DEBUG_ASSERT(0 && "X86 only code path");
      break;
  }
}

bool Registers::syscall_failed() const {
  auto result = syscall_result_signed();
  return -4096 < result && result < 0;
}

bool Registers::syscall_may_restart() const {
  switch (-syscall_result_signed()) {
    case ERESTART_RESTARTBLOCK:
    case ERESTARTNOINTR:
    case ERESTARTNOHAND:
    case ERESTARTSYS:
      return true;
    default:
      return false;
  }
}

ostream& operator<<(ostream& stream, const Registers& r) {
  stream << "{ args:(" << HEX(r.arg1()) << "," << HEX(r.arg2()) << ","
         << HEX(r.arg3()) << "," << HEX(r.arg4()) << "," << HEX(r.arg5())
         << "," << HEX(r.arg6()) << ") orig_syscall: " << r.original_syscallno()
         << " syscallno: " << r.syscallno() << " }";
  return stream;
}

} // namespace rr
rr-5.5.0/src/Registers.h000066400000000000000000000535371412202446200150630ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#ifndef RR_REGISTERS_H_
#define RR_REGISTERS_H_

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <ostream>
#include <vector>

#include "GdbRegister.h"
#include "core.h"
#include "kernel_abi.h"
#include "remote_code_ptr.h"
#include "remote_ptr.h"

struct iovec;

namespace rr {

class ReplayTask;

enum MismatchBehavior { EXPECT_MISMATCHES = 0, LOG_MISMATCHES, BAIL_ON_MISMATCH };

const uintptr_t X86_RESERVED_FLAG = 1 << 1;
const uintptr_t X86_ZF_FLAG = 1 << 6;
const uintptr_t X86_TF_FLAG = 1 << 8;
const uintptr_t X86_IF_FLAG = 1 << 9;
const uintptr_t X86_DF_FLAG = 1 << 10;
const uintptr_t X86_RF_FLAG = 1 << 16;
const uintptr_t X86_ID_FLAG = 1 << 21;

const uintptr_t AARCH64_DBG_SPSR_SS = 1 << 21;

/**
 * A Registers object contains values for all general-purpose registers.
 * These must include all registers used to pass syscall parameters and return
 * syscall results.
 *
 * When reading register values, be sure to cast the result to the correct
 * type according to the kernel docs. E.g. int values should be cast
 * to int explicitly (or implicitly, by assigning to an int-typed variable),
 * size_t should be cast to size_t, etc. If the type is signed, call the
 * _signed getter. This ensures that when building rr 64-bit we will use the
 * right number of register bits whether the tracee is 32-bit or 64-bit, and
 * get sign-extension right.
 *
 * We have different register sets for different architectures. To ensure a
 * trace can be dumped/processed by an rr build on any platform, we allow
 * Registers to contain registers for any architecture. So we store them
 * in a union of Arch::user_regs_structs for each known Arch.
 */
class Registers {
public:
  enum { MAX_SIZE = 16 };

  Registers(SupportedArch a = SupportedArch(-1)) : arch_(a) {
    memset(&u, 0, sizeof(u));
  }

  SupportedArch arch() const { return arch_; }

  void set_arch(SupportedArch a) { arch_ = a; }

  /**
   * Copy a user_regs_struct into these Registers. If the tracee architecture
   * is not rr's native architecture, then it must be a 32-bit tracee with a
   * 64-bit rr. In that case the user_regs_struct is 64-bit and we extract
   * the 32-bit register values from it into u.x86regs.
   * It's invalid to call this when the Registers' arch is 64-bit and the
   * rr build is 32-bit, or when the Registers' arch is completely different
   * to the rr build (e.g. ARM vs x86).
   */
  void set_from_ptrace(const NativeArch::user_regs_struct& ptrace_regs);

  /**
   * Get a user_regs_struct from these Registers. If the tracee architecture
   * is not rr's native architecture, then it must be a 32-bit tracee with a
   * 64-bit rr. In that case the user_regs_struct is 64-bit and we copy
   * the 32-bit register values from u.x86regs into it.
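   *
   * The PTRACE_GETREGSET-based counterpart can work in place via
   * get_ptrace_iovec(); a hypothetical caller sketch (native-arch tracee,
   * error handling omitted):
   *
   *   Registers regs(NativeArch::arch());
   *   struct iovec iov = regs.get_ptrace_iovec(); // points into regs' storage
   *   ptrace(PTRACE_GETREGSET, tid, NT_PRSTATUS, &iov); // fills regs in place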
   */
  NativeArch::user_regs_struct get_ptrace() const;
  iovec get_ptrace_iovec();

  /**
   * Get a user_regs_struct for a particular Arch from these Registers.
   * It's invalid to call this when 'arch' is 64-bit and the
   * rr build is 32-bit, or when the Registers' arch is completely different
   * to the rr build (e.g. ARM vs x86).
   */
  std::vector<uint8_t> get_ptrace_for_arch(SupportedArch arch) const;

  struct InternalData {
    const uint8_t* data;
    size_t size;
  };

  /**
   * Get the register content to save in the trace.
   */
  InternalData get_regs_for_trace() const;

  /**
   * Equivalent to get_ptrace_for_arch(arch()) but doesn't copy.
   */
  InternalData get_ptrace_for_self_arch() const;

  /**
   * Copy an arch-specific user_regs_struct into these Registers.
   * It's invalid to call this when 'arch' is 64-bit and the
   * rr build is 32-bit, or when the Registers' arch is completely different
   * to the rr build (e.g. ARM vs x86).
   */
  void set_from_ptrace_for_arch(SupportedArch arch, const void* data,
                                size_t size);

  /**
   * Copy from the arch-specific structure returned in get_regs_for_trace()
   * back into *this
   */
  void set_from_trace(SupportedArch arch, const void* data, size_t size);

#define ARCH_SWITCH_CASE(rettype, x86case, x64case, arm64case)                 \
  (([=](void) -> rettype {                                                     \
    switch (arch()) {                                                          \
      default:                                                                 \
        DEBUG_ASSERT(0 && "unknown architecture");                             \
        RR_FALLTHROUGH; /* Fall through to avoid warnings */                   \
      case x86: {                                                              \
        x86case;                                                               \
        break;                                                                 \
      }                                                                        \
      case x86_64: {                                                           \
        x64case;                                                               \
        break;                                                                 \
      }                                                                        \
      case aarch64: {                                                          \
        arm64case;                                                             \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
  })())

#define RR_GET_REG(x86case, x64case, arm64case)                                \
  ARCH_SWITCH_CASE(uint64_t,                                                   \
                   return (uint32_t)u.x86regs.x86case,                         \
                   return u.x64regs.x64case,                                   \
                   return u.arm64regs.arm64case)
#define RR_GET_REG_SIGNED(x86case, x64case, arm64case)                         \
  ARCH_SWITCH_CASE(int64_t,                                                    \
                   return (int32_t)u.x86regs.x86case,                          \
                   return u.x64regs.x64case,                                   \
                   return u.arm64regs.arm64case)
#define RR_GET_REG_X86(x86case, x64case)                                       \
  ARCH_SWITCH_CASE(uint64_t,                                                   \
                   return (uint32_t)u.x86regs.x86case,                         \
                   return u.x64regs.x64case,                                   \
                   DEBUG_ASSERT(0 && "Hit an x86-only case, but this is not x86"); return 0)
#define RR_UPDATE_CHECK(loc, value)                                            \
  bool changed = (uintptr_t)loc != (uintptr_t)(value);                         \
  loc = (value);                                                               \
  return changed;
#define RR_SET_REG(x86case, x64case, arm64case, value)                         \
  ARCH_SWITCH_CASE(bool,                                                       \
                   RR_UPDATE_CHECK(u.x86regs.x86case, value),                  \
                   RR_UPDATE_CHECK(u.x64regs.x64case, value),                  \
                   RR_UPDATE_CHECK(u.arm64regs.arm64case, value))
#define RR_SET_REG_X86(x86case, x64case, value)                                \
  ARCH_SWITCH_CASE(bool,                                                       \
                   RR_UPDATE_CHECK(u.x86regs.x86case, value),                  \
                   RR_UPDATE_CHECK(u.x64regs.x64case, value),                  \
                   DEBUG_ASSERT(0 && "Hit an x86-only case, but this is not x86"); return false)

  remote_code_ptr ip() const { return RR_GET_REG(eip, rip, pc); }
  bool set_ip(remote_code_ptr addr) {
    return RR_SET_REG(eip, rip, pc, addr.register_value());
  }
  remote_ptr<void> sp() const { return RR_GET_REG(esp, rsp, sp); }
  bool set_sp(remote_ptr<void> addr) {
    return RR_SET_REG(esp, rsp, sp, addr.as_int());
  }

  // Access the registers holding system-call numbers, results, and
  // parameters.
  intptr_t syscallno() const { return (int)RR_GET_REG(eax, rax, x[8]); }
  bool set_syscallno(intptr_t syscallno) {
    return RR_SET_REG(eax, rax, x[8], syscallno);
  }

  /**
   * This pseudo-register holds the system-call number when we get ptrace
   * enter-system-call and exit-system-call events. Setting it changes
   * the system-call executed when resuming after an enter-system-call
   * event.
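   *
   * For example, a tool that wants to skip a pending syscall at an
   * enter-system-call stop could do (sketch; t->regs()/t->set_regs() stand
   * in for Task accessors, and -1 is not a valid syscall number, so the
   * kernel fails the call with ENOSYS):
   *
   *   Registers r = t->regs();
   *   r.set_original_syscallno(-1);
   *   t->set_regs(r);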
   */
  intptr_t original_syscallno() const {
    return RR_GET_REG_SIGNED(orig_eax, orig_rax, orig_syscall);
  }
  bool set_original_syscallno(intptr_t syscallno) {
    return RR_SET_REG(orig_eax, orig_rax, orig_syscall, syscallno);
  }

#define SYSCALL_REGISTER(name, x86case, x64case, arm64case)                    \
  uintptr_t name() const { return RR_GET_REG(x86case, x64case, arm64case); }   \
  intptr_t name ## _signed() const {                                           \
    return RR_GET_REG_SIGNED(x86case, x64case, arm64case);                     \
  }                                                                            \
  bool set_ ## name(uintptr_t value) {                                         \
    return RR_SET_REG(x86case, x64case, arm64case, value);                     \
  }                                                                            \
  template <typename T> bool set_ ## name(remote_ptr<T> value) {               \
    return RR_SET_REG(x86case, x64case, arm64case, value.as_int());            \
  }

  SYSCALL_REGISTER(syscall_result, eax, rax, x[0]);
  SYSCALL_REGISTER(orig_arg1, ebx, rdi, orig_x0)
  SYSCALL_REGISTER(arg1, ebx, rdi, x[0])
  SYSCALL_REGISTER(arg2, ecx, rsi, x[1])
  SYSCALL_REGISTER(arg3, edx, rdx, x[2])
  SYSCALL_REGISTER(arg4, esi, r10, x[3])
  SYSCALL_REGISTER(arg5, edi, r8, x[4])
  SYSCALL_REGISTER(arg6, ebp, r9, x[5])

  uintptr_t arg(int index) const {
    switch (index) {
      case 1: return arg1();
      case 2: return arg2();
      case 3: return arg3();
      case 4: return arg4();
      case 5: return arg5();
      case 6: return arg6();
      default:
        DEBUG_ASSERT(0 && "Argument index out of range");
        return 0;
    }
  }

  /**
   * Set the register containing syscall argument |Index| to
   * |value|.
   */
  template <int Index> bool set_arg(std::nullptr_t) {
    return set_arg(Index, 0);
  }
  template <int Index, typename T> bool set_arg(remote_ptr<T> value) {
    return set_arg(Index, value.as_int());
  }
  template <int Index, typename T> bool set_arg(T value) {
    return set_arg(Index, uintptr_t(value));
  }

  bool set_arg(int index, uintptr_t value) {
    switch (index) {
      case 1: return set_arg1(value);
      case 2: return set_arg2(value);
      case 3: return set_arg3(value);
      case 4: return set_arg4(value);
      case 5: return set_arg5(value);
      case 6: return set_arg6(value);
      default:
        DEBUG_ASSERT(0 && "Argument index out of range");
        return false;
    }
  }

  bool set_orig_arg(int index, uintptr_t value) {
    switch (index) {
      case 1: return set_orig_arg1(value);
      case 2: return set_arg2(value);
      case 3: return set_arg3(value);
      case 4: return set_arg4(value);
      case 5: return set_arg5(value);
      case 6: return set_arg6(value);
      default:
        DEBUG_ASSERT(0 && "Argument index out of range");
        return false;
    }
  }

  /**
   * Returns true if syscall_result() indicates failure.
   */
  bool syscall_failed() const;

  /**
   * Returns true if syscall_result() indicates a syscall restart.
   */
  bool syscall_may_restart() const;

  // Some X86-specific stuff follows. Use of these accessors should be guarded
  // by an architecture test.
  /**
   * Set the output registers of the |rdtsc| instruction.
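   *
   * For example, set_rdtsc_output(0x0000001234567890) stores 0x34567890 in
   * eax/rax and 0x00000012 in edx/rdx, matching the EDX:EAX convention in
   * which |rdtsc| reports the timestamp counter.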
   */
  void set_rdtsc_output(uint64_t value) {
    RR_SET_REG_X86(eax, rax, value & 0xffffffff);
    RR_SET_REG_X86(edx, rdx, value >> 32);
  }
  void set_cpuid_output(uint32_t eax, uint32_t ebx, uint32_t ecx,
                        uint32_t edx) {
    RR_SET_REG_X86(eax, rax, eax);
    RR_SET_REG_X86(ebx, rbx, ebx);
    RR_SET_REG_X86(ecx, rcx, ecx);
    RR_SET_REG_X86(edx, rdx, edx);
  }

  bool set_r8(uintptr_t value) {
    DEBUG_ASSERT(arch() == x86_64);
    RR_UPDATE_CHECK(u.x64regs.r8, value);
  }
  bool set_r9(uintptr_t value) {
    DEBUG_ASSERT(arch() == x86_64);
    RR_UPDATE_CHECK(u.x64regs.r9, value);
  }
  bool set_r10(uintptr_t value) {
    DEBUG_ASSERT(arch() == x86_64);
    RR_UPDATE_CHECK(u.x64regs.r10, value);
  }
  bool set_r11(uintptr_t value) {
    DEBUG_ASSERT(arch() == x86_64);
    RR_UPDATE_CHECK(u.x64regs.r11, value);
  }

  uintptr_t di() const { return RR_GET_REG_X86(edi, rdi); }
  bool set_di(uintptr_t value) { return RR_SET_REG_X86(edi, rdi, value); }

  uintptr_t si() const { return RR_GET_REG_X86(esi, rsi); }
  bool set_si(uintptr_t value) { return RR_SET_REG_X86(esi, rsi, value); }

  uintptr_t cx() const { return RR_GET_REG_X86(ecx, rcx); }
  bool set_cx(uintptr_t value) { return RR_SET_REG_X86(ecx, rcx, value); }

  uintptr_t ax() const { return RR_GET_REG_X86(eax, rax); }
  uintptr_t bp() const { return RR_GET_REG_X86(ebp, rbp); }

  uintptr_t flags() const { return RR_GET_REG_X86(eflags, eflags); }
  bool set_flags(uintptr_t value) {
    return RR_SET_REG_X86(eflags, eflags, value);
  }
  bool zf_flag() const { return flags() & X86_ZF_FLAG; }
  bool df_flag() const { return flags() & X86_DF_FLAG; }

  uintptr_t fs_base() const {
    DEBUG_ASSERT(arch() == x86_64);
    return u.x64regs.fs_base;
  }
  uintptr_t gs_base() const {
    DEBUG_ASSERT(arch() == x86_64);
    return u.x64regs.gs_base;
  }

  void set_fs_base(uintptr_t fs_base) {
    DEBUG_ASSERT(arch() == x86_64);
    u.x64regs.fs_base = fs_base;
  }
  void set_gs_base(uintptr_t gs_base) {
    DEBUG_ASSERT(arch() == x86_64);
    u.x64regs.gs_base = gs_base;
  }

  uint64_t cs() const { return RR_GET_REG_X86(xcs, cs); }
  uint64_t ss() const { return RR_GET_REG_X86(xss, ss); }
  uint64_t ds() const { return RR_GET_REG_X86(xds, ds); }
  uint64_t es() const { return RR_GET_REG_X86(xes, es); }
  uint64_t fs() const { return RR_GET_REG_X86(xfs, fs); }
  uint64_t gs() const { return RR_GET_REG_X86(xgs, gs); }
  // End of X86-specific stuff

  // Begin aarch64 specific accessors
  uintptr_t pstate() const {
    DEBUG_ASSERT(arch() == aarch64);
    return u.arm64regs.pstate;
  }
  void set_pstate(uintptr_t pstate) {
    DEBUG_ASSERT(arch() == aarch64);
    u.arm64regs.pstate = pstate;
  }
  void set_x7(uintptr_t x7) {
    DEBUG_ASSERT(arch() == aarch64);
    u.arm64regs.x[7] = x7;
  }
  uintptr_t x1() const {
    DEBUG_ASSERT(arch() == aarch64);
    return u.arm64regs.x[1];
  }
  uintptr_t x7() const {
    DEBUG_ASSERT(arch() == aarch64);
    return u.arm64regs.x[7];
  }
  // End of aarch64 specific accessors

  /**
   * Modify the processor's single step flag. On x86 this is the TF flag in the
   * eflags register.
   */
  bool x86_singlestep_flag() const;
  void clear_x86_singlestep_flag();

  /**
   * Aarch64 has two flags that control single stepping. An EL1 one that
   * enables singlestep exceptions and an EL0 one in pstate (SPSR_SS). The EL1 bit
   * is controlled by PTRACE_SINGLESTEP (it gets turned on upon the first
   * PTRACE_(SYSEMU_)SINGLESTEP and turned off on any other ptrace resume).
   * The EL0 bit controls whether an exception is taken *before* execution
   * of the next instruction (an exception is taken when the bit is *clear*).
   * The hardware clears this bit whenever an instruction completes.
   * Thus, to
   * ensure that a single step actually happens, regardless of how we got to
   * this step, we must both use PTRACE_SINGLESTEP and *set* the SPSR_SS bit.
   * Otherwise, if we got to this stop via single step, the SPSR_SS bit will
   * likely already be clear, and we'd take a single step exception without
   * ever having executed any userspace instructions whatsoever.
   */
  bool aarch64_singlestep_flag() const;
  void set_aarch64_singlestep_flag();

  void print_register_file(FILE* f) const;
  void print_register_file_compact(FILE* f) const;
  void print_register_file_for_trace_raw(FILE* f) const;

  /**
   * Return true if |reg1| matches |reg2|. Passing EXPECT_MISMATCHES
   * indicates that the caller is using this as a general register
   * compare and nothing special should be done if the register files
   * mismatch. Passing LOG_MISMATCHES will log the registers that don't
   * match. Passing BAIL_ON_MISMATCH will additionally abort on
   * mismatch.
   */
  static bool compare_register_files(ReplayTask* t, const char* name1,
                                     const Registers& reg1, const char* name2,
                                     const Registers& reg2,
                                     MismatchBehavior mismatch_behavior);

  bool matches(const Registers& other) const {
    return compare_register_files(nullptr, nullptr, *this, nullptr, other,
                                  EXPECT_MISMATCHES);
  }

  // TODO: refactor me to use the GdbRegisterValue helper from
  // GdbConnection.h.

  /**
   * Write the value for register |regno| into |buf|, which should
   * be large enough to hold any register supported by the target.
   * Return the size of the register in bytes and set |defined| to
   * indicate whether a useful value has been written to |buf|.
   */
  size_t read_register(uint8_t* buf, GdbRegister regno, bool* defined) const;

  /**
   * Write the value for register |offset| into |buf|, which should
   * be large enough to hold any register supported by the target.
   * Return the size of the register in bytes and set |defined| to
   * indicate whether a useful value has been written to |buf|.
   * |offset| is the offset of the register within a user_regs_struct.
   */
  size_t read_register_by_user_offset(uint8_t* buf, uintptr_t offset,
                                      bool* defined) const;

  /**
   * Update the register named |reg_name| to |value| with
   * |value_size| number of bytes.
   */
  void write_register(GdbRegister reg_name, const void* value,
                      size_t value_size);

  /**
   * Update the register at user offset |offset| to |value|, taking the low
   * bytes if necessary.
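   *
   * A sketch of the kind of call an emulated PTRACE_POKEUSER request might
   * make (illustrative only; new_ip stands for the value being poked):
   *
   *   regs.write_register_by_user_offset(
   *       offsetof(rr::X64Arch::user_regs_struct, rip), new_ip);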
   */
  void write_register_by_user_offset(uintptr_t offset, uintptr_t value);

  bool operator==(const Registers& other) const {
    if (arch() != other.arch()) {
      return false;
    }
    switch (arch()) {
      case x86:
        return memcmp(&u.x86regs, &other.u.x86regs, sizeof(u.x86regs)) == 0;
      case x86_64:
        return memcmp(&u.x64regs, &other.u.x64regs, sizeof(u.x64regs)) == 0;
      case aarch64:
        return memcmp(&u.arm64regs, &other.u.arm64regs,
                      sizeof(u.arm64regs)) == 0;
      default:
        DEBUG_ASSERT(0 && "Unknown architecture");
        return false;
    }
  }
  bool operator!=(const Registers& other) const { return !(*this == other); }

private:
  template <typename Arch>
  void print_register_file_arch(FILE* f, const char* formats[]) const;

  enum TraceStyle {
    Annotated,
    Raw,
  };

  template <typename Arch>
  void print_register_file_for_trace_arch(FILE* f, TraceStyle style,
                                          const char* formats[]) const;

  template <typename Arch>
  static bool compare_registers_core(const char* name1, const Registers& reg1,
                                     const char* name2, const Registers& reg2,
                                     MismatchBehavior mismatch_behavior);

  template <typename Arch>
  static bool compare_registers_arch(const char* name1, const Registers& reg1,
                                     const char* name2, const Registers& reg2,
                                     MismatchBehavior mismatch_behavior);

  static bool compare_register_files_internal(
      const char* name1, const Registers& reg1, const char* name2,
      const Registers& reg2, MismatchBehavior mismatch_behavior);

  template <typename Arch>
  size_t read_register_arch(uint8_t* buf, GdbRegister regno,
                            bool* defined) const;

  template <typename Arch>
  size_t read_register_by_user_offset_arch(uint8_t* buf, uintptr_t offset,
                                           bool* defined) const;

  template <typename Arch>
  void write_register_arch(GdbRegister regno, const void* value,
                           size_t value_size);

  template <typename Arch>
  void write_register_by_user_offset_arch(uintptr_t offset, uintptr_t value);

  template <typename Arch> size_t total_registers_arch() const;

  SupportedArch arch_;
  union {
    rr::X86Arch::user_regs_struct x86regs;
    rr::X64Arch::user_regs_struct x64regs;
    struct {
      // This is the NT_PRSTATUS regset
      union {
        rr::ARM64Arch::user_regs_struct _ptrace;
        // This duplicates the field names of the user_regs_struct and makes
        // them available as fields of arm64regs for easy access.
        struct {
          uint64_t x[31];
          uint64_t sp;
          uint64_t pc;
          uint64_t pstate;
        };
      };
      // This is not exposed through GETREGSET. We track it manually
      uint64_t orig_x0;
      // This is the NT_ARM_SYSTEM_CALL regset
      int orig_syscall;
    } arm64regs;
  } u;
};

template <typename ret, typename callback>
ret with_converted_registers(const Registers& regs, SupportedArch arch,
                             callback f) {
  if (regs.arch() != arch) {
    // If this is a cross architecture syscall, first convert the registers.
    Registers converted_regs(arch);
    std::vector<uint8_t> data = regs.get_ptrace_for_arch(arch);
    converted_regs.set_from_ptrace_for_arch(arch, data.data(), data.size());
    return f(converted_regs);
  }
  return f(regs);
}

std::ostream& operator<<(std::ostream& stream, const Registers& r);

} // namespace rr

#endif /* RR_REGISTERS_H_ */
rr-5.5.0/src/ReplayCommand.cc000066400000000000000000000503141412202446200157750ustar00rootroot00000000000000/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "ReplayCommand.h"

#include <assert.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include "Command.h"
#include "Flags.h"
#include "GdbServer.h"
#include "ReplaySession.h"
#include "ScopedFd.h"
#include "core.h"
#include "kernel_metadata.h"
#include "log.h"
#include "main.h"

using namespace std;

namespace rr {

ReplayCommand ReplayCommand::singleton(
    "replay",
    " rr replay [OPTION]... [<trace-dir>] [-- <debugger-args>]\n"
    "  -a, --autopilot            replay without debugger server\n"
    "  -f, --onfork=<PID>         start a debug server when <PID> has been\n"
    "                             fork()d, AND the target event has been\n"
    "                             reached.\n"
    "  -g, --goto=<EVENT-NUM>     start a debug server on reaching <EVENT-NUM>\n"
    "                             in the trace. See -M in the general "
    "options.\n"
    "  -o, --debugger-option=